deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/errors.py +20 -2
  10. deepeval/evaluate/evaluate.py +5 -1
  11. deepeval/evaluate/execute.py +811 -248
  12. deepeval/evaluate/types.py +1 -0
  13. deepeval/evaluate/utils.py +33 -119
  14. deepeval/integrations/crewai/__init__.py +7 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/tool.py +71 -0
  18. deepeval/integrations/crewai/wrapper.py +45 -5
  19. deepeval/integrations/llama_index/__init__.py +0 -4
  20. deepeval/integrations/llama_index/handler.py +20 -21
  21. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  22. deepeval/metrics/__init__.py +13 -0
  23. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  24. deepeval/metrics/api.py +281 -0
  25. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  26. deepeval/metrics/base_metric.py +1 -0
  27. deepeval/metrics/bias/bias.py +12 -3
  28. deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
  29. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  30. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  31. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  32. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  33. deepeval/metrics/conversational_dag/nodes.py +12 -4
  34. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  35. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
  36. deepeval/metrics/dag/dag.py +12 -0
  37. deepeval/metrics/dag/nodes.py +12 -4
  38. deepeval/metrics/dag/schema.py +1 -1
  39. deepeval/metrics/dag/templates.py +2 -2
  40. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  41. deepeval/metrics/g_eval/g_eval.py +11 -0
  42. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  44. deepeval/metrics/goal_accuracy/schema.py +17 -0
  45. deepeval/metrics/goal_accuracy/template.py +235 -0
  46. deepeval/metrics/hallucination/hallucination.py +20 -9
  47. deepeval/metrics/indicator.py +8 -2
  48. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  49. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +20 -2
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
  52. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
  53. deepeval/metrics/misuse/misuse.py +12 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  58. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  59. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
  60. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  61. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  62. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  63. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  64. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  65. deepeval/metrics/non_advice/non_advice.py +12 -0
  66. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  67. deepeval/metrics/plan_adherence/__init__.py +1 -0
  68. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  69. deepeval/metrics/plan_adherence/schema.py +11 -0
  70. deepeval/metrics/plan_adherence/template.py +170 -0
  71. deepeval/metrics/plan_quality/__init__.py +1 -0
  72. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  73. deepeval/metrics/plan_quality/schema.py +11 -0
  74. deepeval/metrics/plan_quality/template.py +101 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  76. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  77. deepeval/metrics/role_violation/role_violation.py +12 -0
  78. deepeval/metrics/step_efficiency/__init__.py +1 -0
  79. deepeval/metrics/step_efficiency/schema.py +11 -0
  80. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  81. deepeval/metrics/step_efficiency/template.py +256 -0
  82. deepeval/metrics/summarization/summarization.py +12 -1
  83. deepeval/metrics/task_completion/task_completion.py +4 -0
  84. deepeval/metrics/tool_correctness/schema.py +6 -0
  85. deepeval/metrics/tool_correctness/template.py +88 -0
  86. deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
  87. deepeval/metrics/tool_use/__init__.py +1 -0
  88. deepeval/metrics/tool_use/schema.py +19 -0
  89. deepeval/metrics/tool_use/template.py +220 -0
  90. deepeval/metrics/tool_use/tool_use.py +458 -0
  91. deepeval/metrics/topic_adherence/__init__.py +1 -0
  92. deepeval/metrics/topic_adherence/schema.py +16 -0
  93. deepeval/metrics/topic_adherence/template.py +162 -0
  94. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  95. deepeval/metrics/toxicity/toxicity.py +12 -0
  96. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  97. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  98. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  99. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  100. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  101. deepeval/models/llms/grok_model.py +1 -1
  102. deepeval/models/llms/openai_model.py +2 -0
  103. deepeval/openai/__init__.py +14 -32
  104. deepeval/openai/extractors.py +85 -50
  105. deepeval/openai/patch.py +258 -167
  106. deepeval/openai/types.py +20 -0
  107. deepeval/openai/utils.py +205 -56
  108. deepeval/prompt/__init__.py +19 -1
  109. deepeval/prompt/api.py +160 -0
  110. deepeval/prompt/prompt.py +245 -62
  111. deepeval/prompt/utils.py +186 -15
  112. deepeval/synthesizer/chunking/context_generator.py +209 -152
  113. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  114. deepeval/synthesizer/synthesizer.py +19 -15
  115. deepeval/test_case/api.py +131 -0
  116. deepeval/test_case/llm_test_case.py +6 -2
  117. deepeval/test_run/__init__.py +1 -0
  118. deepeval/test_run/hyperparameters.py +47 -8
  119. deepeval/test_run/test_run.py +292 -206
  120. deepeval/tracing/__init__.py +2 -1
  121. deepeval/tracing/api.py +3 -1
  122. deepeval/tracing/otel/exporter.py +3 -4
  123. deepeval/tracing/otel/utils.py +24 -5
  124. deepeval/tracing/trace_context.py +89 -5
  125. deepeval/tracing/tracing.py +74 -3
  126. deepeval/tracing/types.py +20 -2
  127. deepeval/tracing/utils.py +8 -0
  128. deepeval/utils.py +21 -0
  129. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  130. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
  131. deepeval/integrations/llama_index/agent/patched.py +0 -68
  132. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  133. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  134. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/test_run/test_run.py CHANGED
@@ -32,6 +32,15 @@ from deepeval.utils import (
 )
 from deepeval.test_run.cache import global_test_run_cache_manager
 from deepeval.constants import CONFIDENT_TEST_CASE_BATCH_SIZE, HIDDEN_DIR
+from deepeval.prompt import (
+    PromptMessage,
+    ModelSettings,
+    PromptInterpolationType,
+    OutputType,
+)
+from rich.panel import Panel
+from rich.columns import Columns
+
 
 TEMP_FILE_PATH = f"{HIDDEN_DIR}/.temp_test_run_data.json"
 LATEST_TEST_RUN_FILE_PATH = f"{HIDDEN_DIR}/.latest_test_run.json"
@@ -71,6 +80,16 @@ class TraceMetricScores(BaseModel):
     base: Dict[str, Dict[str, MetricScores]] = Field(default_factory=dict)
 
 
+class PromptData(BaseModel):
+    alias: Optional[str] = None
+    version: Optional[str] = None
+    text_template: Optional[str] = None
+    messages_template: Optional[List[PromptMessage]] = None
+    model_settings: Optional[ModelSettings] = None
+    output_type: Optional[OutputType] = None
+    interpolation_type: Optional[PromptInterpolationType] = None
+
+
 class MetricsAverageDict:
     def __init__(self):
         self.metric_dict = {}
@@ -123,6 +142,7 @@ class TestRun(BaseModel):
     )
     identifier: Optional[str] = None
     hyperparameters: Optional[Dict[str, Any]] = Field(None)
+    prompts: Optional[List[PromptData]] = Field(None)
    test_passed: Optional[int] = Field(None, alias="testPassed")
    test_failed: Optional[int] = Field(None, alias="testFailed")
    run_duration: float = Field(0.0, alias="runDuration")
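
The two hunks above add a PromptData model and attach an optional prompts list to TestRun. Below is a minimal sketch of populating it, assuming Pydantic v2 and the module path shown in this diff; the alias, version, and template values are invented for illustration.

    from deepeval.test_run.test_run import PromptData  # path taken from this diff

    prompt = PromptData(
        alias="summarizer-system-prompt",  # hypothetical alias
        version="00.00.01",                # hypothetical version
        text_template="Summarize the following text: {text}",
    )
    # All fields are optional, so unset ones drop out with exclude_none.
    print(prompt.model_dump(exclude_none=True))
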
@@ -191,65 +211,91 @@ class TestRun(BaseModel):
         valid_scores = 0
 
         def process_metric_data(metric_data: MetricData):
+            """
+            Process and aggregate metric data for overall test metrics.
+
+            Args:
+                metric_data: The metric data to process
+            """
             nonlocal valid_scores
-            name = metric_data.name
+            metric_name = metric_data.name
             score = metric_data.score
             success = metric_data.success
-            # Initialize dict entry if needed.
-            if name not in metrics_dict:
-                metrics_dict[name] = {
+
+            if metric_name not in metrics_dict:
+                metrics_dict[metric_name] = {
                     "scores": [],
                     "passes": 0,
                     "fails": 0,
                     "errors": 0,
                 }
 
+            metric_dict = metrics_dict[metric_name]
+
             if score is None or success is None:
-                metrics_dict[name]["errors"] += 1
+                metric_dict["errors"] += 1
             else:
                 valid_scores += 1
-
-                # Append the score.
-                metrics_dict[name]["scores"].append(score)
-
-                # Increment passes or fails based on the metric_data.success flag.
+                metric_dict["scores"].append(score)
                 if success:
-                    metrics_dict[name]["passes"] += 1
+                    metric_dict["passes"] += 1
                 else:
-                    metrics_dict[name]["fails"] += 1
+                    metric_dict["fails"] += 1
 
         def process_span_metric_data(
-            metric_data: MetricData, type: span_api_type_literals, name: str
+            metric_data: MetricData,
+            span_type: span_api_type_literals,
+            span_name: str,
         ):
+            """
+            Process and aggregate metric data for a specific span.
+
+            Args:
+                metric_data: The metric data to process
+                span_type: The type of span (agent, tool, retriever, llm, base)
+                span_name: The name of the span
+            """
             metric_name = metric_data.name
             score = metric_data.score
             success = metric_data.success
 
-            # Initialize the structure if needed
-            if name not in trace_metrics_dict[type]:
-                trace_metrics_dict[type][name] = {}
+            if span_name not in trace_metrics_dict[span_type]:
+                trace_metrics_dict[span_type][span_name] = {}
 
-            if metric_name not in trace_metrics_dict[type][name]:
-                trace_metrics_dict[type][name][metric_name] = {
+            if metric_name not in trace_metrics_dict[span_type][span_name]:
+                trace_metrics_dict[span_type][span_name][metric_name] = {
                     "scores": [],
                     "passes": 0,
                     "fails": 0,
                     "errors": 0,
                 }
 
+            metric_dict = trace_metrics_dict[span_type][span_name][metric_name]
+
             if score is None or success is None:
-                trace_metrics_dict[type][name][metric_name]["errors"] += 1
+                metric_dict["errors"] += 1
             else:
-                # Append the score
-                trace_metrics_dict[type][name][metric_name]["scores"].append(
-                    score
-                )
-
-                # Increment passes or fails
+                metric_dict["scores"].append(score)
                 if success:
-                    trace_metrics_dict[type][name][metric_name]["passes"] += 1
+                    metric_dict["passes"] += 1
                 else:
-                    trace_metrics_dict[type][name][metric_name]["fails"] += 1
+                    metric_dict["fails"] += 1
+
+        def process_spans(spans, span_type: span_api_type_literals):
+            """
+            Process all metrics for a list of spans of a specific type.
+
+            Args:
+                spans: List of spans to process
+                span_type: The type of spans being processed
+            """
+            for span in spans:
+                if span.metrics_data is not None:
+                    for metric_data in span.metrics_data:
+                        process_metric_data(metric_data)
+                        process_span_metric_data(
+                            metric_data, span_type, span.name
+                        )
 
         # Process non-conversational test cases.
         for test_case in self.test_cases:
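
The hunk above mostly renames the shadowing parameters (type, name) and caches the nested bucket in a local metric_dict before updating it. Here is a standalone sketch of the same scores/passes/fails/errors bucketing, independent of deepeval's classes; the 0.5 pass threshold is arbitrary for the sketch.

    from typing import Dict, List, Optional

    def aggregate(scores: List[Optional[float]]) -> Dict[str, object]:
        bucket = {"scores": [], "passes": 0, "fails": 0, "errors": 0}
        for score in scores:
            if score is None:
                bucket["errors"] += 1  # a missing score counts as an error
                continue
            bucket["scores"].append(score)
            if score >= 0.5:
                bucket["passes"] += 1
            else:
                bucket["fails"] += 1
        return bucket

    print(aggregate([0.9, None, 0.2]))
    # {'scores': [0.9, 0.2], 'passes': 1, 'fails': 1, 'errors': 1}
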
@@ -261,45 +307,14 @@ class TestRun(BaseModel):
             if test_case.trace is None:
                 continue
 
-            for span in test_case.trace.agent_spans:
-                if span.metrics_data is not None:
-                    for metric_data in span.metrics_data:
-                        process_metric_data(metric_data)
-                        process_span_metric_data(
-                            metric_data, SpanApiType.AGENT.value, span.name
-                        )
-
-            for span in test_case.trace.tool_spans:
-                if span.metrics_data is not None:
-                    for metric_data in span.metrics_data:
-                        process_metric_data(metric_data)
-                        process_span_metric_data(
-                            metric_data, SpanApiType.TOOL.value, span.name
-                        )
-
-            for span in test_case.trace.retriever_spans:
-                if span.metrics_data is not None:
-                    for metric_data in span.metrics_data:
-                        process_metric_data(metric_data)
-                        process_span_metric_data(
-                            metric_data, SpanApiType.RETRIEVER.value, span.name
-                        )
-
-            for span in test_case.trace.llm_spans:
-                if span.metrics_data is not None:
-                    for metric_data in span.metrics_data:
-                        process_metric_data(metric_data)
-                        process_span_metric_data(
-                            metric_data, SpanApiType.LLM.value, span.name
-                        )
-
-            for span in test_case.trace.base_spans:
-                if span.metrics_data is not None:
-                    for metric_data in span.metrics_data:
-                        process_metric_data(metric_data)
-                        process_span_metric_data(
-                            metric_data, SpanApiType.BASE.value, span.name
-                        )
+            # Process all span types using the helper function
+            process_spans(test_case.trace.agent_spans, SpanApiType.AGENT.value)
+            process_spans(test_case.trace.tool_spans, SpanApiType.TOOL.value)
+            process_spans(
+                test_case.trace.retriever_spans, SpanApiType.RETRIEVER.value
+            )
+            process_spans(test_case.trace.llm_spans, SpanApiType.LLM.value)
+            process_spans(test_case.trace.base_spans, SpanApiType.BASE.value)
 
         # Process conversational test cases.
         for convo_test_case in self.conversational_test_cases:
@@ -532,105 +547,141 @@ class TestRunManager:
     def clear_test_run(self):
         self.test_run = None
 
-    def display_results_table(
-        self, test_run: TestRun, display: TestRunResultDisplay
-    ):
-        table = Table(title="Test Results")
-        table.add_column("Test case", justify="left")
-        table.add_column("Metric", justify="left")
-        table.add_column("Score", justify="left")
-        table.add_column("Status", justify="left")
-        table.add_column("Overall Success Rate", justify="left")
-
-        for index, test_case in enumerate(test_run.test_cases):
-            if test_case.metrics_data is None:
-                continue
+    @staticmethod
+    def _calculate_success_rate(pass_count: int, fail_count: int) -> str:
+        """Calculate success rate percentage or return error message."""
+        total = pass_count + fail_count
+        if total > 0:
+            return str(round((100 * pass_count) / total, 2))
+        return "Cannot display metrics for component-level evals, please run 'deepeval view' to see results on Confident AI."
+
+    @staticmethod
+    def _get_metric_status(metric_data: MetricData) -> str:
+        """Get formatted status string for a metric."""
+        if metric_data.error:
+            return "[red]ERRORED[/red]"
+        elif metric_data.success:
+            return "[green]PASSED[/green]"
+        return "[red]FAILED[/red]"
+
+    @staticmethod
+    def _format_metric_score(metric_data: MetricData) -> str:
+        """Format metric score with evaluation details."""
+        evaluation_model = metric_data.evaluation_model or "n/a"
+        metric_score = (
+            round(metric_data.score, 2)
+            if metric_data.score is not None
+            else None
+        )
 
-            if (
-                display == TestRunResultDisplay.PASSING
-                and test_case.success is False
-            ):
-                continue
-            elif display == TestRunResultDisplay.FAILING and test_case.success:
-                continue
+        return (
+            f"{metric_score} "
+            f"(threshold={metric_data.threshold}, "
+            f"evaluation model={evaluation_model}, "
+            f"reason={metric_data.reason}, "
+            f"error={metric_data.error})"
+        )
 
-            pass_count = 0
-            fail_count = 0
-            test_case_name = test_case.name
+    @staticmethod
+    def _should_skip_test_case(
+        test_case, display: TestRunResultDisplay
+    ) -> bool:
+        """Determine if test case should be skipped based on display filter."""
+        if display == TestRunResultDisplay.PASSING and not test_case.success:
+            return True
+        elif display == TestRunResultDisplay.FAILING and test_case.success:
+            return True
+        return False
+
+    @staticmethod
+    def _count_metric_results(
+        metrics_data: List[MetricData],
+    ) -> tuple[int, int]:
+        """Count passing and failing metrics."""
+        pass_count = 0
+        fail_count = 0
+        for metric_data in metrics_data:
+            if metric_data.success:
+                pass_count += 1
+            else:
+                fail_count += 1
+        return pass_count, fail_count
 
-            # TODO: recursively iterate through it to calculate pass and fail count
-            if test_case.trace:
-                pass
+    def _add_test_case_header_row(
+        self,
+        table: Table,
+        test_case_name: str,
+        pass_count: int,
+        fail_count: int,
+    ):
+        """Add test case header row with name and success rate."""
+        success_rate = self._calculate_success_rate(pass_count, fail_count)
+        table.add_row(
+            test_case_name,
+            *[""] * 3,
+            f"{success_rate}%",
+        )
 
-            for metric_data in test_case.metrics_data:
-                if metric_data.success:
-                    pass_count += 1
-                else:
-                    fail_count += 1
+    def _add_metric_rows(self, table: Table, metrics_data: List[MetricData]):
+        """Add metric detail rows to the table."""
+        for metric_data in metrics_data:
+            status = self._get_metric_status(metric_data)
+            formatted_score = self._format_metric_score(metric_data)
 
-            success_rate = (
-                round((100 * pass_count) / (pass_count + fail_count), 2)
-                if pass_count + fail_count > 0
-                else "Cannot display metrics for component-level evals, please run 'deepeval view' to see results on Confident AI."
-            )
             table.add_row(
-                test_case_name,
                 "",
+                str(metric_data.name),
+                formatted_score,
+                status,
                 "",
-                "",
-                f"{success_rate}%",
             )
 
-            for metric_data in test_case.metrics_data:
-                if metric_data.error:
-                    status = "[red]ERRORED[/red]"
-                elif metric_data.success:
-                    status = "[green]PASSED[/green]"
-                else:
-                    status = "[red]FAILED[/red]"
+    def _add_separator_row(self, table: Table):
+        """Add empty separator row between test cases."""
+        table.add_row(*[""] * len(table.columns))
 
-                evaluation_model = metric_data.evaluation_model
-                if evaluation_model is None:
-                    evaluation_model = "n/a"
+    def display_results_table(
+        self, test_run: TestRun, display: TestRunResultDisplay
+    ):
+        """Display test results in a formatted table."""
 
-                if metric_data.score is not None:
-                    metric_score = round(metric_data.score, 2)
-                else:
-                    metric_score = None
-
-                table.add_row(
-                    "",
-                    str(metric_data.name),
-                    f"{metric_score} (threshold={metric_data.threshold}, evaluation model={evaluation_model}, reason={metric_data.reason}, error={metric_data.error})",
-                    status,
-                    "",
-                )
+        table = Table(title="Test Results")
+        column_config = dict(justify="left")
+        column_names = [
+            "Test case",
+            "Metric",
+            "Score",
+            "Status",
+            "Overall Success Rate",
+        ]
 
-            if index is not len(self.test_run.test_cases) - 1:
-                table.add_row(
-                    "",
-                    "",
-                    "",
-                    "",
-                    "",
-                )
+        for name in column_names:
+            table.add_column(name, **column_config)
 
+        # Process regular test cases
+        for index, test_case in enumerate(test_run.test_cases):
+            if test_case.metrics_data is None or self._should_skip_test_case(
+                test_case, display
+            ):
+                continue
+            pass_count, fail_count = self._count_metric_results(
+                test_case.metrics_data
+            )
+            self._add_test_case_header_row(
+                table, test_case.name, pass_count, fail_count
+            )
+            self._add_metric_rows(table, test_case.metrics_data)
+
+            if index < len(test_run.test_cases) - 1:
+                self._add_separator_row(table)
+
+        # Process conversational test cases
         for index, conversational_test_case in enumerate(
             test_run.conversational_test_cases
         ):
-            if (
-                display == TestRunResultDisplay.PASSING
-                and conversational_test_case.success is False
-            ):
-                continue
-            elif (
-                display == TestRunResultDisplay.FAILING
-                and conversational_test_case.success
-            ):
+            if self._should_skip_test_case(conversational_test_case, display):
                 continue
 
-            pass_count = 0
-            fail_count = 0
             conversational_test_case_name = conversational_test_case.name
 
             if conversational_test_case.turns:
@@ -691,71 +742,26 @@ class TestRunManager:
                 console.print(
                     f"[dim]No turns recorded for {conversational_test_case_name}.[/dim]"
                 )
-
             if conversational_test_case.metrics_data is not None:
-                for metric_data in conversational_test_case.metrics_data:
-                    if metric_data.success:
-                        pass_count += 1
-                    else:
-                        fail_count += 1
-                table.add_row(
-                    conversational_test_case_name,
-                    "",
-                    "",
-                    "",
-                    f"{round((100*pass_count)/(pass_count+fail_count),2)}%",
+                pass_count, fail_count = self._count_metric_results(
+                    conversational_test_case.metrics_data
                 )
-
-            if conversational_test_case.metrics_data is not None:
-                for metric_data in conversational_test_case.metrics_data:
-                    if metric_data.error:
-                        status = "[red]ERRORED[/red]"
-                    elif metric_data.success:
-                        status = "[green]PASSED[/green]"
-                    else:
-                        status = "[red]FAILED[/red]"
-
-                    evaluation_model = metric_data.evaluation_model
-                    if evaluation_model is None:
-                        evaluation_model = "n/a"
-
-                    if metric_data.score is not None:
-                        metric_score = round(metric_data.score, 2)
-                    else:
-                        metric_score = None
-
-                    table.add_row(
-                        "",
-                        str(metric_data.name),
-                        f"{metric_score} (threshold={metric_data.threshold}, evaluation model={evaluation_model}, reason={metric_data.reason}, error={metric_data.error})",
-                        status,
-                        "",
-                    )
-
-            if index is not len(self.test_run.conversational_test_cases) - 1:
-                table.add_row(
-                    "",
-                    "",
-                    "",
-                    "",
-                    "",
+                self._add_test_case_header_row(
+                    table, conversational_test_case.name, pass_count, fail_count
                 )
-
-            if index is not len(self.test_run.test_cases) - 1:
-                table.add_row(
-                    "",
-                    "",
-                    "",
-                    "",
-                    "",
+                self._add_metric_rows(
+                    table, conversational_test_case.metrics_data
                 )
 
+            if index < len(test_run.conversational_test_cases) - 1:
+                self._add_separator_row(table)
+
+            if index < len(test_run.test_cases) - 1:
+                self._add_separator_row(table)
+
         table.add_row(
             "[bold red]Note: Use Confident AI with DeepEval to analyze failed test cases for more details[/bold red]",
-            "",
-            "",
-            "",
-            "",
+            *[""] * (len(table.columns) - 1),
        )
        print(table)
 
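
The refactor above keeps the same rich Table output but builds it through small helpers. A runnable sketch of the pattern follows, with illustrative row values (assumes the rich package, which the new imports in this diff already pull in):

    from rich.console import Console
    from rich.table import Table

    table = Table(title="Test Results")
    for name in ["Test case", "Metric", "Score", "Status", "Overall Success Rate"]:
        table.add_column(name, justify="left")

    # Header row for a test case, then one detail row per metric.
    table.add_row("my_test_case", *[""] * 3, "100.0%")
    table.add_row("", "Answer Relevancy", "0.9 (threshold=0.5)", "[green]PASSED[/green]", "")
    # Separator row, as _add_separator_row does.
    table.add_row(*[""] * len(table.columns))
    Console().print(table)
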
@@ -799,6 +805,7 @@ class TestRunManager:
         test_run.test_cases = initial_batch
 
         try:
+            test_run.prompts = None
             body = test_run.model_dump(by_alias=True, exclude_none=True)
         except AttributeError:
             # Pydantic version below 2.0
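
Setting test_run.prompts = None right before serialization keeps the prompt payload out of this request body, because exclude_none=True drops None fields. A minimal Pydantic v2 sketch with illustrative field names:

    from typing import List, Optional
    from pydantic import BaseModel

    class Run(BaseModel):
        identifier: Optional[str] = None
        prompts: Optional[List[str]] = None

    run = Run(identifier="run-1", prompts=["p1"])
    run.prompts = None
    print(run.model_dump(by_alias=True, exclude_none=True))  # {'identifier': 'run-1'}
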
@@ -947,12 +954,28 @@ class TestRunManager:
         global_test_run_cache_manager.disable_write_cache = not bool(
             get_is_running_deepeval()
         )
-
         global_test_run_cache_manager.wrap_up_cached_test_run()
 
         if display_table:
             self.display_results_table(test_run, display)
 
+        if test_run.hyperparameters is None:
+            console.print(
+                "\n[bold yellow]⚠ WARNING:[/bold yellow] No hyperparameters logged.\n"
+                "» [bold blue][link=https://deepeval.com/docs/evaluation-prompts]Log hyperparameters[/link][/bold blue] to attribute prompts and models to your test runs.\n\n"
+                + "=" * 80
+            )
+        else:
+            if not test_run.prompts:
+                console.print(
+                    "\n[bold yellow]⚠ WARNING:[/bold yellow] No prompts logged.\n"
+                    "» [bold blue][link=https://deepeval.com/docs/evaluation-prompts]Log prompts[/link][/bold blue] to evaluate and optimize your prompt templates and models.\n\n"
+                    + "=" * 80
+                )
+            else:
+                console.print("\n[bold green]✓ Prompts Logged[/bold green]\n")
+                self._render_prompts_panels(prompts=test_run.prompts)
+
         self.save_test_run_locally()
         delete_file_if_exists(self.temp_file_path)
         if is_confident() and self.disable_request is False:
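
The new warnings rely on rich console markup, including the [link=...] tag. A minimal sketch of that markup on its own (assumes rich is installed; the message text mirrors the hunk above):

    from rich.console import Console

    console = Console()
    console.print(
        "\n[bold yellow]⚠ WARNING:[/bold yellow] No prompts logged.\n"
        "» [bold blue][link=https://deepeval.com/docs/evaluation-prompts]Log prompts[/link][/bold blue] "
        "to evaluate and optimize your prompt templates and models.\n"
    )
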
@@ -967,7 +990,7 @@ class TestRunManager:
             f"» Test Results ({test_run.test_passed + test_run.test_failed} total tests):\n",
             f" » Pass Rate: {round((test_run.test_passed / (test_run.test_passed + test_run.test_failed)) * 100, 2)}% | Passed: [bold green]{test_run.test_passed}[/bold green] | Failed: [bold red]{test_run.test_failed}[/bold red]\n\n",
             "=" * 80,
-            "\n\n» What to share evals with your team, or a place for your test cases to live? ❤️ 🏡\n"
+            "\n\n» Want to share evals with your team, or a place for your test cases to live? ❤️ 🏡\n"
             " » Run [bold]'deepeval view'[/bold] to analyze and save testing results on [rgb(106,0,255)]Confident AI[/rgb(106,0,255)].\n\n",
         )
 
@@ -993,5 +1016,68 @@ class TestRunManager:
             pass
         return None
 
+    def _render_prompts_panels(self, prompts: List[PromptData]) -> None:
+
+        def format_string(
+            v, default="[dim]None[/dim]", color: Optional[str] = None
+        ):
+            formatted_string = str(v) if v not in (None, "", []) else default
+            return (
+                f"{formatted_string}"
+                if color is None or v in (None, "", [])
+                else f"[{color}]{formatted_string}[/]"
+            )
+
+        panels = []
+        for prompt in prompts:
+            lines = []
+            p_type = (
+                "messages"
+                if prompt.messages_template
+                else ("text" if prompt.text_template else "—")
+            )
+            if p_type:
+                lines.append(f"type: {format_string(p_type, color='blue')}")
+            if prompt.output_type:
+                lines.append(
+                    f"output_type: {format_string(prompt.output_type, color='blue')}"
+                )
+            if prompt.interpolation_type:
+                lines.append(
+                    f"interpolation_type: {format_string(prompt.interpolation_type, color='blue')}"
+                )
+            if prompt.model_settings:
+                ms = prompt.model_settings
+                settings_lines = [
+                    "Model Settings:",
+                    f" – provider: {format_string(ms.provider, color='green')}",
+                    f" – name: {format_string(ms.name, color='green')}",
+                    f" – temperature: {format_string(ms.temperature, color='green')}",
+                    f" – max_tokens: {format_string(ms.max_tokens, color='green')}",
+                    f" – top_p: {format_string(ms.top_p, color='green')}",
+                    f" – frequency_penalty: {format_string(ms.frequency_penalty, color='green')}",
+                    f" – presence_penalty: {format_string(ms.presence_penalty, color='green')}",
+                    f" – stop_sequence: {format_string(ms.stop_sequence, color='green')}",
+                    f" – reasoning_effort: {format_string(ms.reasoning_effort, color='green')}",
+                    f" – verbosity: {format_string(ms.verbosity, color='green')}",
+                ]
+                lines.append("")
+                lines.extend(settings_lines)
+            title = f"{format_string(prompt.alias)}"
+            if prompt.version:
+                title += f" (v{prompt.version})"
+            body = "\n".join(lines)
+            panel = Panel(
+                body,
+                title=title,
+                title_align="left",
+                expand=False,
+                padding=(1, 6, 1, 2),
+            )
+            panels.append(panel)
+
+        if panels:
+            console.print(Columns(panels, equal=False, expand=False))
+
 
 global_test_run_manager = TestRunManager()
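
The new _render_prompts_panels helper renders one rich Panel per prompt and lays them out with Columns. A minimal standalone sketch of that rendering (assumes rich; the prompt values are invented):

    from rich.columns import Columns
    from rich.console import Console
    from rich.panel import Panel

    body = (
        "type: [blue]text[/]\n"
        "\n"
        "Model Settings:\n"
        " – provider: [green]openai[/]\n"
        " – name: [green]gpt-4o[/]"
    )
    panel = Panel(
        body,
        title="summarizer-prompt (v00.00.01)",
        title_align="left",
        expand=False,
        padding=(1, 6, 1, 2),
    )
    Console().print(Columns([panel], equal=False, expand=False))
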
deepeval/tracing/__init__.py CHANGED
@@ -4,7 +4,7 @@ from .context import (
     update_retriever_span,
     update_llm_span,
 )
-from .trace_context import trace
+from .trace_context import trace, LlmSpanContext
 from .types import BaseSpan, Trace
 from .tracing import observe, trace_manager
 from .offline_evals import evaluate_thread, evaluate_trace, evaluate_span
@@ -14,6 +14,7 @@ __all__ = [
     "update_current_trace",
     "update_retriever_span",
     "update_llm_span",
+    "LlmSpanContext",
     "BaseSpan",
     "Trace",
     "observe",
deepeval/tracing/api.py CHANGED
@@ -1,6 +1,6 @@
 from enum import Enum
 from typing import Dict, List, Optional, Union, Literal, Any
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field
 
 from deepeval.test_case import ToolCall
 
@@ -27,6 +27,8 @@ class PromptApi(BaseModel):
 
 
 class MetricData(BaseModel):
+    model_config = ConfigDict(extra="ignore")
+
     name: str
     threshold: float
     success: bool
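
ConfigDict(extra="ignore") makes MetricData drop unknown keys instead of failing validation when a payload carries extra fields. A minimal Pydantic v2 sketch with an illustrative model:

    from pydantic import BaseModel, ConfigDict

    class Metric(BaseModel):
        model_config = ConfigDict(extra="ignore")
        name: str
        threshold: float

    m = Metric(name="Answer Relevancy", threshold=0.5, unexpected_field="dropped")
    print(m.model_dump())  # {'name': 'Answer Relevancy', 'threshold': 0.5}
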