deepeval 3.4.7__py3-none-any.whl → 3.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. deepeval/__init__.py +8 -7
  2. deepeval/_version.py +1 -1
  3. deepeval/cli/dotenv_handler.py +71 -0
  4. deepeval/cli/main.py +1021 -280
  5. deepeval/cli/utils.py +116 -2
  6. deepeval/confident/api.py +29 -14
  7. deepeval/config/__init__.py +0 -0
  8. deepeval/config/settings.py +565 -0
  9. deepeval/config/settings_manager.py +133 -0
  10. deepeval/config/utils.py +86 -0
  11. deepeval/dataset/__init__.py +1 -0
  12. deepeval/dataset/dataset.py +70 -10
  13. deepeval/dataset/test_run_tracer.py +82 -0
  14. deepeval/dataset/utils.py +23 -0
  15. deepeval/key_handler.py +64 -2
  16. deepeval/metrics/__init__.py +4 -1
  17. deepeval/metrics/answer_relevancy/template.py +7 -2
  18. deepeval/metrics/conversational_dag/__init__.py +7 -0
  19. deepeval/metrics/conversational_dag/conversational_dag.py +139 -0
  20. deepeval/metrics/conversational_dag/nodes.py +931 -0
  21. deepeval/metrics/conversational_dag/templates.py +117 -0
  22. deepeval/metrics/dag/dag.py +13 -4
  23. deepeval/metrics/dag/graph.py +47 -15
  24. deepeval/metrics/dag/utils.py +103 -38
  25. deepeval/metrics/faithfulness/template.py +11 -8
  26. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +6 -4
  27. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +6 -4
  28. deepeval/metrics/tool_correctness/tool_correctness.py +7 -3
  29. deepeval/models/llms/amazon_bedrock_model.py +24 -3
  30. deepeval/models/llms/openai_model.py +37 -41
  31. deepeval/models/retry_policy.py +280 -0
  32. deepeval/openai_agents/agent.py +4 -2
  33. deepeval/synthesizer/chunking/doc_chunker.py +87 -51
  34. deepeval/test_run/api.py +1 -0
  35. deepeval/tracing/otel/exporter.py +20 -8
  36. deepeval/tracing/otel/utils.py +57 -0
  37. deepeval/tracing/tracing.py +37 -16
  38. deepeval/tracing/utils.py +98 -1
  39. deepeval/utils.py +111 -70
  40. {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/METADATA +3 -1
  41. {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/RECORD +44 -34
  42. deepeval/env.py +0 -35
  43. {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/LICENSE.md +0 -0
  44. {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/WHEEL +0 -0
  45. {deepeval-3.4.7.dist-info → deepeval-3.4.9.dist-info}/entry_points.txt +0 -0
deepeval/metrics/conversational_dag/nodes.py (new file)
@@ -0,0 +1,931 @@
+ from typing import Optional, List, Union, Literal, Tuple
+ from dataclasses import dataclass
+ from pydantic import create_model
+ import asyncio
+
+ from deepeval.metrics.base_metric import BaseConversationalMetric
+ from deepeval.metrics.conversational_g_eval.conversational_g_eval import (
+     ConversationalGEval,
+ )
+ from deepeval.metrics.g_eval.utils import CONVERSATIONAL_G_EVAL_PARAMS
+ from deepeval.metrics.utils import copy_metrics, trimAndLoadJson
+ from deepeval.test_case import (
+     ConversationalTestCase,
+     TurnParams,
+     ToolCall,
+     Turn,
+ )
+ from deepeval.utils import prettify_list
+
+ from .templates import (
+     ConversationalBinaryJudgementTemplate,
+     ConversationalNonBinaryJudgementTemplate,
+     ConversationalTaskNodeTemplate,
+     ConversationalVerdictNodeTemplate,
+ )
+ from deepeval.metrics.dag.schema import (
+     BinaryJudgementVerdict,
+     MetricScoreReason,
+     NonBinaryJudgementVerdict,
+     TaskNodeOutput,
+ )
+
+
+ class ConversationalBaseNode:
+     _indegree: int = 0
+     _depth: int = 0
+
+     def set_parent(self, parent: "ConversationalBaseNode"):
+         if hasattr(self, "_parent"):
+             self._parent = parent
+         elif hasattr(self, "_parents"):
+             if self._parents is None:
+                 self._parents = []
+             self._parents.append(parent)
+
+     def _execute(
+         self,
+         metric: BaseConversationalMetric,
+         test_case: ConversationalTestCase,
+         depth: int,
+     ):
+         raise NotImplementedError(
+             "This node type must implement the _execute method."
+         )
+
+     async def _a_execute(
+         self,
+         metric: BaseConversationalMetric,
+         test_case: ConversationalTestCase,
+         depth: int,
+     ):
+         raise NotImplementedError(
+             "This node type must implement the _a_execute method."
+         )
+
+
+ def increment_indegree(node: ConversationalBaseNode):
+     node._indegree += 1
+
+
+ def decrement_indegree(node: ConversationalBaseNode):
+     node._indegree -= 1
+
+
+ @dataclass
+ class ConversationalVerdictNode(ConversationalBaseNode):
+     verdict: Union[str, bool]
+     score: Optional[int] = None
+     child: Optional[
+         Union[
+             ConversationalBaseNode,
+             ConversationalGEval,
+             BaseConversationalMetric,
+         ]
+     ] = None
+     _parent: Optional[ConversationalBaseNode] = None
+
+     def __hash__(self):
+         return id(self)
+
+     def __post_init__(self):
+         # Ensure either `score` or `child` is set, but not both
+         if self.score is not None and self.child is not None:
+             raise ValueError(
+                 "A ConversationalVerdictNode can have either a 'score' or a 'child', but not both."
+             )
+         if self.score is None and self.child is None:
+             raise ValueError(
+                 "A ConversationalVerdictNode must have either a 'score' or a 'child'."
+             )
+
+         if self.score is not None:
+             if not (0 <= self.score <= 10):
+                 raise ValueError(
+                     "The score must be between 0 and 10, inclusive."
+                 )
+
+     def _execute(
+         self,
+         metric: BaseConversationalMetric,
+         test_case: ConversationalTestCase,
+         depth: int,
+     ):
+         decrement_indegree(self)
+         if self._indegree > 0:
+             return
+
+         if isinstance(
+             self._parent, ConversationalNonBinaryJudgementNode
+         ) or isinstance(self._parent, ConversationalBinaryJudgementNode):
+             if self._parent._verdict.verdict != self.verdict:
+                 return
+
+         if self.child is not None:
+             if isinstance(self.child, ConversationalGEval):
+                 convo_g_eval_args = {
+                     "name": self.child.name,
+                     "model": metric.model,
+                     "verbose_mode": False,
+                 }
+                 if self.child.criteria:
+                     convo_g_eval_args["criteria"] = self.child.criteria
+                 else:
+                     convo_g_eval_args["evaluation_steps"] = (
+                         self.child.evaluation_steps
+                     )
+                 if self.child.evaluation_params:
+                     convo_g_eval_args["evaluation_params"] = (
+                         self.child.evaluation_params
+                     )
+                 copied_convo_g_eval = ConversationalGEval(**convo_g_eval_args)
+
+                 copied_convo_g_eval.measure(
+                     test_case=test_case, _show_indicator=False
+                 )
+                 metric._verbose_steps.append(
+                     construct_node_verbose_log(self, depth, copied_convo_g_eval)
+                 )
+                 metric.score = copied_convo_g_eval.score
+                 if metric.include_reason:
+                     metric.reason = copied_convo_g_eval.reason
+
+             elif isinstance(self.child, BaseConversationalMetric):
+                 copied_metric: BaseConversationalMetric = copy_metrics(
+                     [self.child]
+                 )[0]
+                 copied_metric.verbose_mode = False
+
+                 copied_metric.measure(
+                     test_case=test_case, _show_indicator=False
+                 )
+                 metric._verbose_steps.append(
+                     construct_node_verbose_log(self, depth, copied_metric)
+                 )
+                 metric.score = copied_metric.score
+                 if metric.include_reason:
+                     metric.reason = copied_metric.reason
+             else:
+                 self.child._execute(
+                     metric=metric, test_case=test_case, depth=depth
+                 )
+         else:
+             metric._verbose_steps.append(
+                 construct_node_verbose_log(self, depth)
+             )
+             metric.score = self.score / 10
+             if metric.include_reason:
+                 metric.reason = self._generate_reason(metric=metric)
+
+     async def _a_execute(
+         self,
+         metric: BaseConversationalMetric,
+         test_case: ConversationalTestCase,
+         depth: int,
+     ):
+         decrement_indegree(self)
+         if self._indegree > 0:
+             return
+
+         if isinstance(
+             self._parent, ConversationalNonBinaryJudgementNode
+         ) or isinstance(self._parent, ConversationalBinaryJudgementNode):
+             if self._parent._verdict.verdict != self.verdict:
+                 return
+
+         if self.child is not None:
+             if isinstance(self.child, ConversationalGEval):
+                 convo_g_eval_args = {
+                     "name": self.child.name,
+                     "model": metric.model,
+                     "verbose_mode": False,
+                 }
+                 if self.child.criteria:
+                     convo_g_eval_args["criteria"] = self.child.criteria
+                 else:
+                     convo_g_eval_args["evaluation_steps"] = (
+                         self.child.evaluation_steps
+                     )
+                 if self.child.evaluation_params:
+                     convo_g_eval_args["evaluation_params"] = (
+                         self.child.evaluation_params
+                     )
+                 copied_convo_g_eval = ConversationalGEval(**convo_g_eval_args)
+
+                 await copied_convo_g_eval.a_measure(
+                     test_case=test_case, _show_indicator=False
+                 )
+                 metric._verbose_steps.append(
+                     construct_node_verbose_log(self, depth, copied_convo_g_eval)
+                 )
+                 metric.score = copied_convo_g_eval.score
+                 if metric.include_reason:
+                     metric.reason = copied_convo_g_eval.reason
+
+             elif isinstance(self.child, BaseConversationalMetric):
+                 copied_metric: BaseConversationalMetric = copy_metrics(
+                     [self.child]
+                 )[0]
+                 copied_metric.verbose_mode = False
+
+                 await copied_metric.a_measure(
+                     test_case=test_case, _show_indicator=False
+                 )
+                 metric._verbose_steps.append(
+                     construct_node_verbose_log(self, depth, copied_metric)
+                 )
+                 metric.score = copied_metric.score
+                 if metric.include_reason:
+                     metric.reason = copied_metric.reason
+             else:
+                 await self.child._a_execute(
+                     metric=metric, test_case=test_case, depth=depth
+                 )
+         else:
+             metric._verbose_steps.append(
+                 construct_node_verbose_log(self, depth)
+             )
+             metric.score = self.score / 10
+             if metric.include_reason:
+                 metric.reason = await self._a_generate_reason(metric=metric)
+
+     def _generate_reason(self, metric: BaseConversationalMetric):
+         prompt = ConversationalVerdictNodeTemplate.generate_reason(
+             verbose_steps=metric._verbose_steps,
+             score=metric.score,
+             name=metric.__name__,
+         )
+         if metric.using_native_model:
+             res, cost = metric.model.generate(prompt, schema=MetricScoreReason)
+             metric.evaluation_cost += cost
+         else:
+             try:
+                 res: MetricScoreReason = metric.model.generate(
+                     prompt, schema=MetricScoreReason
+                 )
+             except TypeError:
+                 res = metric.model.generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 res = MetricScoreReason(**data)
+
+         return res.reason
+
+     async def _a_generate_reason(self, metric: BaseConversationalMetric):
+         prompt = ConversationalVerdictNodeTemplate.generate_reason(
+             verbose_steps=metric._verbose_steps,
+             score=metric.score,
+             name=metric.__name__,
+         )
+         if metric.using_native_model:
+             res, cost = await metric.model.a_generate(
+                 prompt, schema=MetricScoreReason
+             )
+             metric.evaluation_cost += cost
+         else:
+             try:
+                 res: MetricScoreReason = await metric.model.a_generate(
+                     prompt, schema=MetricScoreReason
+                 )
+             except TypeError:
+                 res = await metric.model.a_generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 res = MetricScoreReason(**data)
+
+         return res.reason
+
+
+ @dataclass
+ class ConversationalTaskNode(ConversationalBaseNode):
+     instructions: str
+     output_label: str
+     children: List[ConversationalBaseNode]
+     evaluation_params: List[TurnParams] = None
+     turn_window: Tuple[int, int] = None
+     label: Optional[str] = None
+     _verbose_logs: Optional[str] = None
+     _output: Optional[str] = None
+     _parents: Optional[List[ConversationalBaseNode]] = None
+
+     def __hash__(self):
+         return id(self)
+
+     def __post_init__(self):
+         for child in self.children:
+             if isinstance(child, ConversationalVerdictNode):
+                 raise ValueError(
+                     "A ConversationalTaskNode must not have a ConversationalVerdictNode as one of their 'children'."
+                 )
+
+         for child in self.children:
+             child.set_parent(self)
+             increment_indegree(child)
+
+     def _execute(
+         self,
+         metric: BaseConversationalMetric,
+         test_case: ConversationalTestCase,
+         depth: int,
+     ):
+         self._depth = max(0, self._depth, depth)
+         decrement_indegree(self)
+         if self._indegree > 0:
+             return
+
+         if self.evaluation_params is None and self._parents is None:
+             raise ValueError(
+                 "A ConversationalTaskNode must have either a 'evaluation_params' or parent node(s)."
+             )
+
+         if self.turn_window is not None:
+             is_valid_turn_window(self.turn_window, test_case.turns)
+
+         if not self.turn_window:
+             self.turn_window = 0, len(test_case.turns) - 1
+
+         text = """"""
+         start, end = self.turn_window
+         if self._parents is not None:
+             for parent in self._parents:
+                 if isinstance(parent, ConversationalTaskNode):
+                     text += f"{parent.output_label}:\n{parent._output}\n\n"
+
+         if self.evaluation_params is not None:
+             text += "Full Conversation: \n"
+             for index in range(start, end + 1):
+                 turn = test_case.turns[index]
+                 for param in self.evaluation_params:
+                     value = getattr(turn, param.value)
+                     if isinstance(value, ToolCall):
+                         value = repr(value)
+                     text += f"{CONVERSATIONAL_G_EVAL_PARAMS[param]}:\n{value}\n"
+                 text += "\n"
+
+         prompt = ConversationalTaskNodeTemplate.generate_task_output(
+             instructions=self.instructions,
+             text=text,
+         )
+         if metric.using_native_model:
+             res, cost = metric.model.generate(prompt, schema=TaskNodeOutput)
+             metric.evaluation_cost += cost
+             self._output = res.output
+         else:
+             try:
+                 res: TaskNodeOutput = metric.model.generate(
+                     prompt, schema=TaskNodeOutput
+                 )
+                 self._output = res.output
+             except TypeError:
+                 res = metric.model.generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 self._output = TaskNodeOutput(**data).output
+
+         metric._verbose_steps.append(
+             construct_node_verbose_log(self, self._depth)
+         )
+         for children in self.children:
+             children._execute(
+                 metric=metric, test_case=test_case, depth=self._depth + 1
+             )
+
+     async def _a_execute(
+         self,
+         metric: BaseConversationalMetric,
+         test_case: ConversationalTestCase,
+         depth: int,
+     ):
+         self._depth = max(0, self._depth, depth)
+         decrement_indegree(self)
+         if self._indegree > 0:
+             return
+
+         if self.evaluation_params is None and self._parents is None:
+             raise ValueError(
+                 "A ConversationalTaskNode must have either a 'evaluation_params' or parent node(s)."
+             )
+
+         if self.turn_window is not None:
+             is_valid_turn_window(self.turn_window, test_case.turns)
+
+         if not self.turn_window:
+             self.turn_window = 0, len(test_case.turns) - 1
+
+         text = """"""
+         start, end = self.turn_window
+         if self._parents is not None:
+             for parent in self._parents:
+                 if isinstance(parent, ConversationalTaskNode):
+                     text += f"{parent.output_label}:\n{parent._output}\n\n"
+
+         if self.evaluation_params is not None:
+             text += "Full Conversation: \n"
+             for index in range(start, end + 1):
+                 turn = test_case.turns[index]
+                 for param in self.evaluation_params:
+                     value = getattr(turn, param.value)
+                     if isinstance(value, ToolCall):
+                         value = repr(value)
+                     text += f"{CONVERSATIONAL_G_EVAL_PARAMS[param]}:\n{value}\n"
+                 text += "\n"
+
+         prompt = ConversationalTaskNodeTemplate.generate_task_output(
+             instructions=self.instructions,
+             text=text,
+         )
+         if metric.using_native_model:
+             res, cost = await metric.model.a_generate(
+                 prompt, schema=TaskNodeOutput
+             )
+             metric.evaluation_cost += cost
+             self._output = res.output
+         else:
+             try:
+                 res: TaskNodeOutput = await metric.model.a_generate(
+                     prompt, schema=TaskNodeOutput
+                 )
+                 self._output = res.output
+             except TypeError:
+                 res = await metric.model.a_generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 self._output = TaskNodeOutput(**data).output
+
+         metric._verbose_steps.append(
+             construct_node_verbose_log(self, self._depth)
+         )
+         await asyncio.gather(
+             *(
+                 child._a_execute(
+                     metric=metric, test_case=test_case, depth=self._depth + 1
+                 )
+                 for child in self.children
+             )
+         )
+
+
+ @dataclass
+ class ConversationalBinaryJudgementNode(ConversationalBaseNode):
+     criteria: str
+     children: List[ConversationalVerdictNode]
+     evaluation_params: Optional[List[TurnParams]] = None
+     turn_window: Tuple[int, int] = None
+     label: Optional[str] = None
+     _verbose_logs: Optional[str] = None
+     _verdict: Optional[BinaryJudgementVerdict] = None
+     _parents: Optional[List[ConversationalBaseNode]] = None
+
+     def __hash__(self):
+         return id(self)
+
+     def __post_init__(self):
+         if len(self.children) != 2:
+             raise ValueError(
+                 "ConversationalBinaryJudgementNode must have exactly 2 children."
+             )
+
+         # Check if all children are ClassificationResultNode and their classifications are boolean
+         for child in self.children:
+             if not isinstance(child, ConversationalVerdictNode):
+                 raise TypeError(
+                     "All children of ConversationalBinaryJudgementNode must be of type ConversationalVerdictNode."
+                 )
+
+             if not isinstance(child.verdict, bool):
+                 raise ValueError(
+                     "All children of ConversationalBinaryJudgementNode must have a boolean verdict."
+                 )
+
+         # Check if there is one True and one False classification
+         verdicts = [child.verdict for child in self.children]
+         if verdicts.count(True) != 1 or verdicts.count(False) != 1:
+             raise ValueError(
+                 "ConversationalBinaryJudgementNode must have one True and one False ConversationalVerdictNode child."
+             )
+
+         # print("-------")
+         for child in self.children:
+             child.set_parent(self)
+             increment_indegree(child)
+             if child.child is not None and isinstance(
+                 child.child, ConversationalBaseNode
+             ):
+                 increment_indegree(child.child)
+                 # print("binary node nested", child.child.__class__.__name__, id(child.child), child.child._indegree)
+             # print("binary node", child.__class__.__name__, id(child), child._indegree)
+         # print("-------")
+
+     def _execute(
+         self,
+         metric: BaseConversationalMetric,
+         test_case: ConversationalTestCase,
+         depth: int,
+     ):
+         self._depth = max(0, self._depth, depth)
+         decrement_indegree(self)
+         if self._indegree > 0:
+             return
+
+         if self.turn_window is not None:
+             is_valid_turn_window(self.turn_window, test_case.turns)
+
+         if not self.turn_window:
+             self.turn_window = 0, len(test_case.turns) - 1
+
+         text = """"""
+         start, end = self.turn_window
+         if self._parents is not None:
+             for parent in self._parents:
+                 if isinstance(parent, ConversationalTaskNode):
+                     text += f"{parent.output_label}:\n{parent._output}\n\n"
+
+         if self.evaluation_params is not None:
+             text += "Full Conversation: \n"
+             for index in range(start, end + 1):
+                 turn = test_case.turns[index]
+                 for param in self.evaluation_params:
+                     value = getattr(turn, param.value)
+                     if isinstance(value, ToolCall):
+                         value = repr(value)
+                     text += f"{CONVERSATIONAL_G_EVAL_PARAMS[param]}:\n{value}\n"
+                 text += "\n"
+
+         prompt = ConversationalBinaryJudgementTemplate.generate_binary_verdict(
+             criteria=self.criteria,
+             text=text,
+         )
+         if metric.using_native_model:
+             res, cost = metric.model.generate(
+                 prompt, schema=BinaryJudgementVerdict
+             )
+             metric.evaluation_cost += cost
+             self._verdict = res
+         else:
+             try:
+                 res: BinaryJudgementVerdict = metric.model.generate(
+                     prompt, schema=BinaryJudgementVerdict
+                 )
+                 self._verdict = res
+             except TypeError:
+                 res = metric.model.generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 self._verdict = BinaryJudgementVerdict(**data)
+
+         metric._verbose_steps.append(
+             construct_node_verbose_log(self, self._depth)
+         )
+         for children in self.children:
+             children._execute(
+                 metric=metric, test_case=test_case, depth=self._depth + 1
+             )
+
+     async def _a_execute(
+         self,
+         metric: BaseConversationalMetric,
+         test_case: ConversationalTestCase,
+         depth: int,
+     ):
+         self._depth = max(0, self._depth, depth)
+         decrement_indegree(self)
+         if self._indegree > 0:
+             return
+
+         if self.turn_window is not None:
+             is_valid_turn_window(self.turn_window, test_case.turns)
+
+         if not self.turn_window:
+             self.turn_window = 0, len(test_case.turns) - 1
+
+         text = """"""
+         start, end = self.turn_window
+         if self._parents is not None:
+             for parent in self._parents:
+                 if isinstance(parent, ConversationalTaskNode):
+                     text += f"{parent.output_label}:\n{parent._output}\n\n"
+
+         if self.evaluation_params is not None:
+             text += "Full Conversation: \n"
+             for index in range(start, end + 1):
+                 turn = test_case.turns[index]
+                 for param in self.evaluation_params:
+                     value = getattr(turn, param.value)
+                     if isinstance(value, ToolCall):
+                         value = repr(value)
+                     text += f"{CONVERSATIONAL_G_EVAL_PARAMS[param]}:\n{value}\n"
+                 text += "\n"
+
+         prompt = ConversationalBinaryJudgementTemplate.generate_binary_verdict(
+             criteria=self.criteria,
+             text=text,
+         )
+         if metric.using_native_model:
+             res, cost = await metric.model.a_generate(
+                 prompt, schema=BinaryJudgementVerdict
+             )
+             metric.evaluation_cost += cost
+             self._verdict = res
+         else:
+             try:
+                 res: BinaryJudgementVerdict = await metric.model.a_generate(
+                     prompt, schema=BinaryJudgementVerdict
+                 )
+                 self._verdict = res
+             except TypeError:
+                 res = await metric.model.a_generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 self._verdict = BinaryJudgementVerdict(**data)
+
+         metric._verbose_steps.append(
+             construct_node_verbose_log(self, self._depth)
+         )
+         await asyncio.gather(
+             *(
+                 child._a_execute(
+                     metric=metric, test_case=test_case, depth=self._depth + 1
+                 )
+                 for child in self.children
+             )
+         )
+
+
+ @dataclass
+ class ConversationalNonBinaryJudgementNode(ConversationalBaseNode):
+     criteria: str
+     children: List[ConversationalVerdictNode]
+     evaluation_params: Optional[List[TurnParams]] = None
+     turn_window: Tuple[int, int] = None
+     label: Optional[str] = None
+     _verbose_logs: Optional[str] = None
+     _verdict: Optional[NonBinaryJudgementVerdict] = None
+     _parents: Optional[List[ConversationalBaseNode]] = None
+
+     def __hash__(self):
+         return id(self)
+
+     def __post_init__(self):
+         # Check if children is not empty
+         if not self.children:
+             raise ValueError(
+                 "ConversationalNonBinaryJudgementNode must have at least one child."
+             )
+
+         verdicts_set = set()
+         for child in self.children:
+             if not isinstance(child, ConversationalVerdictNode):
+                 raise TypeError(
+                     "All children must be of type ConversationalVerdictNode."
+                 )
+
+             # Check if the verdict attribute of each child is a string
+             if not isinstance(child.verdict, str):
+                 raise ValueError(
+                     "The verdict attribute of all children must be a string."
+                 )
+
+             # Check for duplicate verdicts
+             if child.verdict in verdicts_set:
+                 raise ValueError(
+                     f"Duplicate verdict found: {child.verdict} in children of ConversationalNonBinaryJudgementNode."
+                 )
+             verdicts_set.add(child.verdict)
+
+         self._verdict_options = list(verdicts_set)
+
+         # Dynamically create ConversationalNonBinaryJudgementNode class
+         self._verdict_schema = create_model(
+             "ConversationalNonBinaryJudgementNode",
+             verdict=(Literal[tuple(self._verdict_options)], ...),
+             reason=(str, ...),
+         )
+
+         # print("-------")
+         for child in self.children:
+             child.set_parent(self)
+             increment_indegree(child)
+             if child.child is not None and isinstance(
+                 child.child, ConversationalBaseNode
+             ):
+                 increment_indegree(child.child)
+                 # print("non binary node nested", child.child.__class__.__name__, id(child.child), child.child._indegree)
+             # print("non binary node", child.__class__.__name__, id(child), child._indegree)
+         # print("-------")
+
+     def _execute(
+         self,
+         metric: BaseConversationalMetric,
+         test_case: ConversationalTestCase,
+         depth: int,
+     ):
+         self._depth = max(0, self._depth, depth)
+         decrement_indegree(self)
+         if self._indegree > 0:
+             return
+
+         if self.turn_window is not None:
+             is_valid_turn_window(self.turn_window, test_case.turns)
+
+         if not self.turn_window:
+             self.turn_window = 0, len(test_case.turns) - 1
+
+         text = """"""
+         start, end = self.turn_window
+         if self._parents is not None:
+             for parent in self._parents:
+                 if isinstance(parent, ConversationalTaskNode):
+                     text += f"{parent.output_label}:\n{parent._output}\n\n"
+
+         if self.evaluation_params is not None:
+             text += "Full Conversation: \n"
+             for index in range(start, end + 1):
+                 turn = test_case.turns[index]
+                 for param in self.evaluation_params:
+                     value = getattr(turn, param.value)
+                     if isinstance(value, ToolCall):
+                         value = repr(value)
+                     text += f"{CONVERSATIONAL_G_EVAL_PARAMS[param]}:\n{value}\n"
+                 text += "\n"
+
+         prompt = ConversationalNonBinaryJudgementTemplate.generate_non_binary_verdict(
+             criteria=self.criteria, text=text, options=self._verdict_options
+         )
+         if metric.using_native_model:
+             res, cost = metric.model.generate(
+                 prompt, schema=self._verdict_schema
+             )
+             metric.evaluation_cost += cost
+             self._verdict = res
+         else:
+             try:
+                 res: self._verdict_schema = metric.model.generate(
+                     prompt, schema=self._verdict_schema
+                 )
+                 self._verdict = res
+             except TypeError:
+                 res = metric.model.generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 self._verdict = self._verdict_schema(**data)
+
+         metric._verbose_steps.append(
+             construct_node_verbose_log(self, self._depth)
+         )
+         for children in self.children:
+             children._execute(
+                 metric=metric, test_case=test_case, depth=self._depth + 1
+             )
+
+     async def _a_execute(
+         self,
+         metric: BaseConversationalMetric,
+         test_case: ConversationalTestCase,
+         depth: int,
+     ):
+         self._depth = max(0, self._depth, depth)
+         decrement_indegree(self)
+         if self._indegree > 0:
+             return
+
+         if self.turn_window is not None:
+             is_valid_turn_window(self.turn_window, test_case.turns)
+
+         if not self.turn_window:
+             self.turn_window = 0, len(test_case.turns) - 1
+
+         text = """"""
+         start, end = self.turn_window
+         if self._parents is not None:
+             for parent in self._parents:
+                 if isinstance(parent, ConversationalTaskNode):
+                     text += f"{parent.output_label}:\n{parent._output}\n\n"
+
+         if self.evaluation_params is not None:
+             text += "Full Conversation: \n"
+             for index in range(start, end + 1):
+                 turn = test_case.turns[index]
+                 for param in self.evaluation_params:
+                     value = getattr(turn, param.value)
+                     if isinstance(value, ToolCall):
+                         value = repr(value)
+                     text += f"{CONVERSATIONAL_G_EVAL_PARAMS[param]}:\n{value}\n"
+                 text += "\n"
+
+         prompt = ConversationalNonBinaryJudgementTemplate.generate_non_binary_verdict(
+             criteria=self.criteria, text=text, options=self._verdict_options
+         )
+         if metric.using_native_model:
+             res, cost = await metric.model.a_generate(
+                 prompt, schema=self._verdict_schema
+             )
+             metric.evaluation_cost += cost
+             self._verdict = res
+         else:
+             try:
+                 res: self._verdict_schema = await metric.model.a_generate(
+                     prompt, schema=self._verdict_schema
+                 )
+                 self._verdict = res
+             except TypeError:
+                 res = await metric.model.a_generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 self._verdict = self._verdict_schema(**data)
+
+         metric._verbose_steps.append(
+             construct_node_verbose_log(self, self._depth)
+         )
+         await asyncio.gather(
+             *(
+                 child._a_execute(
+                     metric=metric, test_case=test_case, depth=self._depth + 1
+                 )
+                 for child in self.children
+             )
+         )
+
+
+ def construct_node_verbose_log(
+     node: ConversationalBaseNode,
+     depth: int,
+     node_metric: Optional[
+         Union[ConversationalGEval, BaseConversationalMetric]
+     ] = None,
+ ) -> str:
+     if (
+         isinstance(node, ConversationalBinaryJudgementNode)
+         or isinstance(node, ConversationalNonBinaryJudgementNode)
+         or isinstance(node, ConversationalTaskNode)
+     ):
+         label = node.label if node.label else "None"
+
+         if isinstance(node, ConversationalBinaryJudgementNode) or isinstance(
+             node, ConversationalNonBinaryJudgementNode
+         ):
+             is_binary_node = isinstance(node, ConversationalBinaryJudgementNode)
+             node_type = (
+                 "ConversationalBinaryJudgementNode"
+                 if is_binary_node
+                 else "ConversationalNonBinaryJudgementNode"
+             )
+             underscore_multiple = 34 if is_binary_node else 37
+             star_multiple = 48 if is_binary_node else 53
+             return (
+                 f"{'_' * underscore_multiple}\n"
+                 f"| {node_type} | Level == {depth} |\n"
+                 f"{'*' * star_multiple}\n"
+                 f"Label: {label}\n\n"
+                 "Criteria:\n"
+                 f"{node.criteria}\n\n"
+                 f"Verdict: {node._verdict.verdict}\n"
+                 f"Reason: {node._verdict.reason}\n"
+             )
+         elif isinstance(node, ConversationalTaskNode):
+             return (
+                 "______________________________________________\n"
+                 f"| ConversationalTaskNode | Level == {depth} |\n"
+                 "**********************************************\n"
+                 f"Label: {label}\n\n"
+                 "Instructions:\n"
+                 f"{node.instructions}\n\n"
+                 f"{node.output_label}:\n{node._output}\n"
+             )
+     elif isinstance(node, ConversationalVerdictNode):
+         type = None
+         if node_metric:
+             if isinstance(node_metric, ConversationalGEval) or isinstance(
+                 node_metric, BaseConversationalMetric
+             ):
+                 type = f"{node_metric.__name__} Metric"
+         else:
+             type = "Deterministic"
+
+         verbose_log = (
+             "_________________________________________________\n"
+             f"| ConversationalVerdictNode | Level == {depth} |\n"
+             "*************************************************\n"
+             f"Verdict: {node.verdict}\n"
+             f"Type: {type}"
+         )
+         if isinstance(node_metric, ConversationalGEval):
+             verbose_log += f"\n\nCriteria:\n{node_metric.criteria}\n"
+             verbose_log += f"Evaluation Steps:\n{prettify_list(node_metric.evaluation_steps)}"
+         elif isinstance(node_metric, BaseConversationalMetric):
+             verbose_log += f"\n\n{node_metric.verbose_logs}"
+
+         return verbose_log
+
+
+ def is_valid_turn_window(
+     turn_window: Tuple[int, int], turns: List[Turn]
+ ) -> bool:
+     if len(turn_window) != 2:
+         raise ValueError(
+             "A 'turn_window' must have only 2 indices representing start and end"
+         )
+     start, end = turn_window
+     if (
+         start > end
+         or start == end
+         or (end - start) >= len(turns)
+         or start < 0
+         or end < 0
+         or end == len(turns)
+     ):
+         raise ValueError(
+             "The 'turn_window' passed is invalid. Please recheck your 'turn_window' values."
+         )
+     return True
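
For readers skimming this diff, here is a minimal sketch of how the node classes added in deepeval/metrics/conversational_dag/nodes.py might be wired together. Only the class names and constructor fields shown in the hunk above are taken from the diff; the TurnParams.CONTENT member is an assumption, and how the graph is consumed by the metric in deepeval/metrics/conversational_dag/conversational_dag.py is not shown here.

# Sketch only: builds a small conversational judgement DAG from the node
# classes added in this release. Constructing the nodes runs their
# __post_init__ validation; no LLM calls are made at this point.
from deepeval.metrics.conversational_dag.nodes import (
    ConversationalBinaryJudgementNode,
    ConversationalTaskNode,
    ConversationalVerdictNode,
)
from deepeval.test_case import TurnParams

root = ConversationalTaskNode(
    instructions="Summarize the user's request and the assistant's final answer.",
    output_label="Conversation Summary",
    evaluation_params=[TurnParams.CONTENT],  # assumed enum member name
    children=[
        ConversationalBinaryJudgementNode(
            criteria="Based on the summary, was the user's request fully resolved?",
            children=[
                ConversationalVerdictNode(verdict=True, score=10),
                ConversationalVerdictNode(verdict=False, score=0),
            ],
        ),
    ],
)

The resulting graph would presumably be passed to the DAG-style conversational metric introduced alongside it in deepeval/metrics/conversational_dag/conversational_dag.py, but that wrapper's exact constructor is not part of this hunk.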