deepeval 3.6.7__py3-none-any.whl → 3.6.8__py3-none-any.whl
This diff compares two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- deepeval/_version.py +1 -1
- deepeval/errors.py +20 -2
- deepeval/evaluate/execute.py +725 -217
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +13 -3
- deepeval/integrations/crewai/__init__.py +2 -1
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +8 -8
- deepeval/metrics/mcp/mcp_task_completion.py +7 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/task_completion/task_completion.py +1 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/openai/extractors.py +61 -16
- deepeval/openai/patch.py +8 -12
- deepeval/openai/types.py +1 -1
- deepeval/openai/utils.py +108 -1
- deepeval/prompt/prompt.py +1 -0
- deepeval/prompt/utils.py +43 -14
- deepeval/synthesizer/synthesizer.py +11 -10
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/test_run.py +190 -207
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +23 -4
- deepeval/tracing/trace_context.py +53 -38
- deepeval/tracing/tracing.py +23 -0
- deepeval/tracing/types.py +16 -14
- deepeval/utils.py +21 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/RECORD +75 -53
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- deepeval/tracing/message_types/__init__.py +0 -10
- deepeval/tracing/message_types/base.py +0 -6
- deepeval/tracing/message_types/messages.py +0 -14
- deepeval/tracing/message_types/tools.py +0 -18
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
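Release 3.6.8 adds several new metric packages (goal_accuracy, plan_adherence, plan_quality, step_efficiency, tool_use, topic_adherence) and registers them in deepeval/metrics/__init__.py. The sketch below shows the usual way a deepeval metric is applied to a test case; the class name TopicAdherenceMetric and its constructor arguments are guesses based on the new module name and are not confirmed by this diff, so those lines are left commented out.

    # Hedged sketch: LLMTestCase is existing deepeval API; the metric class
    # name below is hypothetical (inferred from the new topic_adherence module)
    # and is therefore commented out rather than asserted.
    from deepeval.test_case import LLMTestCase

    test_case = LLMTestCase(
        input="What is covered by the premium plan?",
        actual_output="The premium plan covers priority support and SSO.",
    )

    # from deepeval.metrics import TopicAdherenceMetric  # hypothetical name
    # metric = TopicAdherenceMetric(threshold=0.7)
    # metric.measure(test_case)
    # print(metric.score, metric.reason)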
deepeval/test_run/test_run.py
CHANGED
@@ -35,12 +35,10 @@ from deepeval.constants import CONFIDENT_TEST_CASE_BATCH_SIZE, HIDDEN_DIR
 from deepeval.prompt import (
     PromptMessage,
     ModelSettings,
-    OutputType,
     PromptInterpolationType,
     OutputType,
 )
 from rich.panel import Panel
-from rich.text import Text
 from rich.columns import Columns


@@ -213,65 +211,91 @@ class TestRun(BaseModel):
         valid_scores = 0

         def process_metric_data(metric_data: MetricData):
+            """
+            Process and aggregate metric data for overall test metrics.
+
+            Args:
+                metric_data: The metric data to process
+            """
             nonlocal valid_scores
-
+            metric_name = metric_data.name
             score = metric_data.score
             success = metric_data.success
-
-            if
-            metrics_dict[
+
+            if metric_name not in metrics_dict:
+                metrics_dict[metric_name] = {
                     "scores": [],
                     "passes": 0,
                     "fails": 0,
                     "errors": 0,
                 }

+            metric_dict = metrics_dict[metric_name]
+
             if score is None or success is None:
-
+                metric_dict["errors"] += 1
             else:
                 valid_scores += 1
-
-                # Append the score.
-                metrics_dict[name]["scores"].append(score)
-
-                # Increment passes or fails based on the metric_data.success flag.
+                metric_dict["scores"].append(score)
                 if success:
-
+                    metric_dict["passes"] += 1
                 else:
-
+                    metric_dict["fails"] += 1

         def process_span_metric_data(
-            metric_data: MetricData,
+            metric_data: MetricData,
+            span_type: span_api_type_literals,
+            span_name: str,
         ):
+            """
+            Process and aggregate metric data for a specific span.
+
+            Args:
+                metric_data: The metric data to process
+                span_type: The type of span (agent, tool, retriever, llm, base)
+                span_name: The name of the span
+            """
             metric_name = metric_data.name
             score = metric_data.score
             success = metric_data.success

-
-
-            trace_metrics_dict[type][name] = {}
+            if span_name not in trace_metrics_dict[span_type]:
+                trace_metrics_dict[span_type][span_name] = {}

-            if metric_name not in trace_metrics_dict[
-            trace_metrics_dict[
+            if metric_name not in trace_metrics_dict[span_type][span_name]:
+                trace_metrics_dict[span_type][span_name][metric_name] = {
                     "scores": [],
                     "passes": 0,
                     "fails": 0,
                     "errors": 0,
                 }

+            metric_dict = trace_metrics_dict[span_type][span_name][metric_name]
+
             if score is None or success is None:
-
+                metric_dict["errors"] += 1
             else:
-
-                trace_metrics_dict[type][name][metric_name]["scores"].append(
-                    score
-                )
-
-                # Increment passes or fails
+                metric_dict["scores"].append(score)
                 if success:
-
+                    metric_dict["passes"] += 1
                 else:
-
+                    metric_dict["fails"] += 1
+
+        def process_spans(spans, span_type: span_api_type_literals):
+            """
+            Process all metrics for a list of spans of a specific type.
+
+            Args:
+                spans: List of spans to process
+                span_type: The type of spans being processed
+            """
+            for span in spans:
+                if span.metrics_data is not None:
+                    for metric_data in span.metrics_data:
+                        process_metric_data(metric_data)
+                        process_span_metric_data(
+                            metric_data, span_type, span.name
+                        )

         # Process non-conversational test cases.
         for test_case in self.test_cases:
@@ -283,45 +307,14 @@ class TestRun(BaseModel):
             if test_case.trace is None:
                 continue

-
-
-
-
-
-
-
-
-            for span in test_case.trace.tool_spans:
-                if span.metrics_data is not None:
-                    for metric_data in span.metrics_data:
-                        process_metric_data(metric_data)
-                        process_span_metric_data(
-                            metric_data, SpanApiType.TOOL.value, span.name
-                        )
-
-            for span in test_case.trace.retriever_spans:
-                if span.metrics_data is not None:
-                    for metric_data in span.metrics_data:
-                        process_metric_data(metric_data)
-                        process_span_metric_data(
-                            metric_data, SpanApiType.RETRIEVER.value, span.name
-                        )
-
-            for span in test_case.trace.llm_spans:
-                if span.metrics_data is not None:
-                    for metric_data in span.metrics_data:
-                        process_metric_data(metric_data)
-                        process_span_metric_data(
-                            metric_data, SpanApiType.LLM.value, span.name
-                        )
-
-            for span in test_case.trace.base_spans:
-                if span.metrics_data is not None:
-                    for metric_data in span.metrics_data:
-                        process_metric_data(metric_data)
-                        process_span_metric_data(
-                            metric_data, SpanApiType.BASE.value, span.name
-                        )
+            # Process all span types using the helper function
+            process_spans(test_case.trace.agent_spans, SpanApiType.AGENT.value)
+            process_spans(test_case.trace.tool_spans, SpanApiType.TOOL.value)
+            process_spans(
+                test_case.trace.retriever_spans, SpanApiType.RETRIEVER.value
+            )
+            process_spans(test_case.trace.llm_spans, SpanApiType.LLM.value)
+            process_spans(test_case.trace.base_spans, SpanApiType.BASE.value)

         # Process conversational test cases.
         for convo_test_case in self.conversational_test_cases:
@@ -554,105 +547,141 @@ class TestRunManager:
     def clear_test_run(self):
         self.test_run = None

-
-
-
-
-
-
-
-
-
-
-
-
-
+    @staticmethod
+    def _calculate_success_rate(pass_count: int, fail_count: int) -> str:
+        """Calculate success rate percentage or return error message."""
+        total = pass_count + fail_count
+        if total > 0:
+            return str(round((100 * pass_count) / total, 2))
+        return "Cannot display metrics for component-level evals, please run 'deepeval view' to see results on Confident AI."
+
+    @staticmethod
+    def _get_metric_status(metric_data: MetricData) -> str:
+        """Get formatted status string for a metric."""
+        if metric_data.error:
+            return "[red]ERRORED[/red]"
+        elif metric_data.success:
+            return "[green]PASSED[/green]"
+        return "[red]FAILED[/red]"
+
+    @staticmethod
+    def _format_metric_score(metric_data: MetricData) -> str:
+        """Format metric score with evaluation details."""
+        evaluation_model = metric_data.evaluation_model or "n/a"
+        metric_score = (
+            round(metric_data.score, 2)
+            if metric_data.score is not None
+            else None
+        )

-
-
-
-
-
-
-
+        return (
+            f"{metric_score} "
+            f"(threshold={metric_data.threshold}, "
+            f"evaluation model={evaluation_model}, "
+            f"reason={metric_data.reason}, "
+            f"error={metric_data.error})"
+        )

-
-
-
+    @staticmethod
+    def _should_skip_test_case(
+        test_case, display: TestRunResultDisplay
+    ) -> bool:
+        """Determine if test case should be skipped based on display filter."""
+        if display == TestRunResultDisplay.PASSING and not test_case.success:
+            return True
+        elif display == TestRunResultDisplay.FAILING and test_case.success:
+            return True
+        return False
+
+    @staticmethod
+    def _count_metric_results(
+        metrics_data: List[MetricData],
+    ) -> tuple[int, int]:
+        """Count passing and failing metrics."""
+        pass_count = 0
+        fail_count = 0
+        for metric_data in metrics_data:
+            if metric_data.success:
+                pass_count += 1
+            else:
+                fail_count += 1
+        return pass_count, fail_count

-
-
-
+    def _add_test_case_header_row(
+        self,
+        table: Table,
+        test_case_name: str,
+        pass_count: int,
+        fail_count: int,
+    ):
+        """Add test case header row with name and success rate."""
+        success_rate = self._calculate_success_rate(pass_count, fail_count)
+        table.add_row(
+            test_case_name,
+            *[""] * 3,
+            f"{success_rate}%",
+        )

-
-
-
-
-
+    def _add_metric_rows(self, table: Table, metrics_data: List[MetricData]):
+        """Add metric detail rows to the table."""
+        for metric_data in metrics_data:
+            status = self._get_metric_status(metric_data)
+            formatted_score = self._format_metric_score(metric_data)

-            success_rate = (
-                round((100 * pass_count) / (pass_count + fail_count), 2)
-                if pass_count + fail_count > 0
-                else "Cannot display metrics for component-level evals, please run 'deepeval view' to see results on Confident AI."
-            )
             table.add_row(
-                test_case_name,
                 "",
+                str(metric_data.name),
+                formatted_score,
+                status,
                 "",
-                "",
-                f"{success_rate}%",
             )

-
-
-
-            elif metric_data.success:
-                status = "[green]PASSED[/green]"
-            else:
-                status = "[red]FAILED[/red]"
+    def _add_separator_row(self, table: Table):
+        """Add empty separator row between test cases."""
+        table.add_row(*[""] * len(table.columns))

-
-
-
+    def display_results_table(
+        self, test_run: TestRun, display: TestRunResultDisplay
+    ):
+        """Display test results in a formatted table."""

-
-
-
-
-
-
-
-
-
-                status,
-                "",
-            )
+        table = Table(title="Test Results")
+        column_config = dict(justify="left")
+        column_names = [
+            "Test case",
+            "Metric",
+            "Score",
+            "Status",
+            "Overall Success Rate",
+        ]

-
-
-            "",
-            "",
-            "",
-            "",
-            "",
-        )
+        for name in column_names:
+            table.add_column(name, **column_config)

+        # Process regular test cases
+        for index, test_case in enumerate(test_run.test_cases):
+            if test_case.metrics_data is None or self._should_skip_test_case(
+                test_case, display
+            ):
+                continue
+            pass_count, fail_count = self._count_metric_results(
+                test_case.metrics_data
+            )
+            self._add_test_case_header_row(
+                table, test_case.name, pass_count, fail_count
+            )
+            self._add_metric_rows(table, test_case.metrics_data)
+
+            if index < len(test_run.test_cases) - 1:
+                self._add_separator_row(table)
+
+        # Process conversational test cases
         for index, conversational_test_case in enumerate(
             test_run.conversational_test_cases
         ):
-            if (
-                display == TestRunResultDisplay.PASSING
-                and conversational_test_case.success is False
-            ):
-                continue
-            elif (
-                display == TestRunResultDisplay.FAILING
-                and conversational_test_case.success
-            ):
+            if self._should_skip_test_case(conversational_test_case, display):
                 continue

-            pass_count = 0
-            fail_count = 0
             conversational_test_case_name = conversational_test_case.name

             if conversational_test_case.turns:
@@ -713,71 +742,26 @@ class TestRunManager:
                 console.print(
                     f"[dim]No turns recorded for {conversational_test_case_name}.[/dim]"
                 )
-
             if conversational_test_case.metrics_data is not None:
-
-
-                        pass_count += 1
-                    else:
-                        fail_count += 1
-                table.add_row(
-                    conversational_test_case_name,
-                    "",
-                    "",
-                    "",
-                    f"{round((100*pass_count)/(pass_count+fail_count),2)}%",
+                pass_count, fail_count = self._count_metric_results(
+                    conversational_test_case.metrics_data
                 )
-
-
-                for metric_data in conversational_test_case.metrics_data:
-                    if metric_data.error:
-                        status = "[red]ERRORED[/red]"
-                    elif metric_data.success:
-                        status = "[green]PASSED[/green]"
-                    else:
-                        status = "[red]FAILED[/red]"
-
-                    evaluation_model = metric_data.evaluation_model
-                    if evaluation_model is None:
-                        evaluation_model = "n/a"
-
-                    if metric_data.score is not None:
-                        metric_score = round(metric_data.score, 2)
-                    else:
-                        metric_score = None
-
-                    table.add_row(
-                        "",
-                        str(metric_data.name),
-                        f"{metric_score} (threshold={metric_data.threshold}, evaluation model={evaluation_model}, reason={metric_data.reason}, error={metric_data.error})",
-                        status,
-                        "",
-                    )
-
-                if index is not len(self.test_run.conversational_test_cases) - 1:
-                    table.add_row(
-                        "",
-                        "",
-                        "",
-                        "",
-                        "",
+                self._add_test_case_header_row(
+                    table, conversational_test_case.name, pass_count, fail_count
                 )
-
-
-                table.add_row(
-                    "",
-                    "",
-                    "",
-                    "",
-                    "",
+                self._add_metric_rows(
+                    table, conversational_test_case.metrics_data
                 )

+            if index < len(test_run.conversational_test_cases) - 1:
+                self._add_separator_row(table)
+
+            if index < len(test_run.test_cases) - 1:
+                self._add_separator_row(table)
+
         table.add_row(
             "[bold red]Note: Use Confident AI with DeepEval to analyze failed test cases for more details[/bold red]",
-            "",
-            "",
-            "",
-            "",
+            *[""] * (len(table.columns) - 1),
         )
         print(table)

@@ -970,7 +954,6 @@ class TestRunManager:
         global_test_run_cache_manager.disable_write_cache = not bool(
            get_is_running_deepeval()
        )
-
        global_test_run_cache_manager.wrap_up_cached_test_run()

        if display_table:
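The test_run.py changes above are a refactor: the repeated inline pass/fail bookkeeping moves into small helpers (process_spans, _count_metric_results, _calculate_success_rate, _add_metric_rows). A minimal standalone sketch of the extracted aggregation logic, using SimpleMetricData as a stand-in for deepeval's MetricData rather than the library class itself:

    # Illustration of the refactored pass/fail aggregation; SimpleMetricData is a
    # stand-in for deepeval's MetricData, not the library type.
    from dataclasses import dataclass
    from typing import List, Optional, Tuple

    @dataclass
    class SimpleMetricData:
        name: str
        score: Optional[float]
        success: Optional[bool]

    def count_metric_results(metrics_data: List[SimpleMetricData]) -> Tuple[int, int]:
        # A metric with success=None (errored) counts as a fail, as in the diff above.
        passes = sum(1 for m in metrics_data if m.success)
        return passes, len(metrics_data) - passes

    def calculate_success_rate(pass_count: int, fail_count: int) -> str:
        total = pass_count + fail_count
        return str(round((100 * pass_count) / total, 2)) if total > 0 else "n/a"

    metrics = [
        SimpleMetricData("Answer Relevancy", 0.9, True),
        SimpleMetricData("Faithfulness", 0.4, False),
    ]
    print(calculate_success_rate(*count_metric_results(metrics)))  # -> "50.0"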
deepeval/tracing/__init__.py
CHANGED
@@ -4,7 +4,7 @@ from .context import (
     update_retriever_span,
     update_llm_span,
 )
-from .trace_context import trace
+from .trace_context import trace, LlmSpanContext
 from .types import BaseSpan, Trace
 from .tracing import observe, trace_manager
 from .offline_evals import evaluate_thread, evaluate_trace, evaluate_span
@@ -14,6 +14,7 @@ __all__ = [
     "update_current_trace",
     "update_retriever_span",
     "update_llm_span",
+    "LlmSpanContext",
     "BaseSpan",
     "Trace",
     "observe",
deepeval/tracing/otel/exporter.py
CHANGED
@@ -254,7 +254,7 @@ class ConfidentSpanExporter(SpanExporter):

         base_span = None
         try:
-            base_span = self.
+            base_span = self.prepare_boilerplate_base_span(span)
         except Exception:
             pass

@@ -453,9 +453,8 @@ class ConfidentSpanExporter(SpanExporter):
         if span_output:
             base_span.output = span_output

-
-
-    ) -> Optional[BaseSpan]:
+    @staticmethod
+    def prepare_boilerplate_base_span(span: ReadableSpan) -> Optional[BaseSpan]:

         ################ Get Span Type ################
         span_type = span.attributes.get("confident.span.type")
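With LlmSpanContext added to deepeval.tracing's __all__, it can now be imported from the package root alongside the other tracing exports. A minimal import sketch; the fields LlmSpanContext carries are defined in trace_context.py and are not visible in this diff:

    # Import sketch only: both names are confirmed exports in the __all__ shown
    # above; LlmSpanContext's constructor/fields are not shown in this diff.
    from deepeval.tracing import observe, LlmSpanContext  # noqa: F401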
deepeval/tracing/otel/utils.py
CHANGED
@@ -11,7 +11,7 @@ from deepeval.tracing import trace_manager, BaseSpan
 from deepeval.tracing.utils import make_json_serializable


-GEN_AI_OPERATION_NAMES = ["chat", "generate_content", "
+GEN_AI_OPERATION_NAMES = ["chat", "generate_content", "text_completion"]


 def to_hex_string(id_value: int | bytes, length: int = 32) -> str:
@@ -128,6 +128,10 @@ def check_llm_input_from_gen_ai_attributes(

         input = system_instructions + input_messages

+        model_parameters = check_model_parameters(span)
+        if model_parameters:
+            input.append(model_parameters)
+
     except Exception:
         pass
     try:
@@ -413,7 +417,7 @@ def post_test_run(traces: List[Trace], test_run_id: Optional[str]):
     # return test_run_manager.post_test_run(test_run) TODO: add after test run with metric collection is implemented


-def _normalize_pydantic_ai_messages(span: ReadableSpan) -> Optional[list]:
+def normalize_pydantic_ai_messages(span: ReadableSpan) -> Optional[list]:
     try:
         raw = span.attributes.get("pydantic_ai.all_messages")
         if not raw:
@@ -438,7 +442,7 @@ def _normalize_pydantic_ai_messages(span: ReadableSpan) -> Optional[list]:
     except Exception:
         pass

-    return
+    return []


 def _extract_non_thinking_part_of_last_message(message: dict) -> dict:
@@ -461,7 +465,7 @@ def check_pydantic_ai_agent_input_output(
     output_val: Optional[Any] = None

     # Get normalized messages once
-    normalized =
+    normalized = normalize_pydantic_ai_messages(span)

     # Input (pydantic_ai.all_messages) - slice up to and including the first 'user' message
     if normalized:
@@ -523,3 +527,18 @@ def check_pydantic_ai_trace_input_output(
     input_val, output_val = check_pydantic_ai_agent_input_output(span)

     return input_val, output_val
+
+
+def check_model_parameters(span: ReadableSpan) -> Optional[dict]:
+    try:
+        raw_model_parameters = span.attributes.get("model_request_parameters")
+        if raw_model_parameters and isinstance(raw_model_parameters, str):
+            model_parameters = json.loads(raw_model_parameters)
+            if isinstance(model_parameters, dict):
+                return {
+                    "role": "Model Request Parameters",
+                    "content": model_parameters,
+                }
+    except Exception:
+        pass
+    return None
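The new check_model_parameters helper reads the model_request_parameters span attribute (a JSON string), and check_llm_input_from_gen_ai_attributes appends the parsed result to the LLM input as an extra message-like entry. A standalone sketch of that behavior using a stub span object rather than a real OpenTelemetry ReadableSpan:

    import json
    from types import SimpleNamespace

    # Stub standing in for an OpenTelemetry ReadableSpan; only .attributes is used here.
    span = SimpleNamespace(
        attributes={
            "model_request_parameters": json.dumps(
                {"temperature": 0.2, "max_tokens": 256}
            )
        }
    )

    entry = None
    raw = span.attributes.get("model_request_parameters")
    if raw and isinstance(raw, str):
        parsed = json.loads(raw)
        if isinstance(parsed, dict):
            # Same shape the new helper returns before it is appended to the LLM input.
            entry = {"role": "Model Request Parameters", "content": parsed}

    print(entry)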