deepeval 3.6.7__py3-none-any.whl → 3.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. deepeval/_version.py +1 -1
  2. deepeval/errors.py +20 -2
  3. deepeval/evaluate/execute.py +725 -217
  4. deepeval/evaluate/types.py +1 -0
  5. deepeval/evaluate/utils.py +13 -3
  6. deepeval/integrations/crewai/__init__.py +2 -1
  7. deepeval/integrations/crewai/tool.py +71 -0
  8. deepeval/integrations/llama_index/__init__.py +0 -4
  9. deepeval/integrations/llama_index/handler.py +20 -21
  10. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  11. deepeval/metrics/__init__.py +13 -0
  12. deepeval/metrics/base_metric.py +1 -0
  13. deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
  14. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  15. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
  16. deepeval/metrics/dag/schema.py +1 -1
  17. deepeval/metrics/dag/templates.py +2 -2
  18. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  19. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  20. deepeval/metrics/goal_accuracy/schema.py +17 -0
  21. deepeval/metrics/goal_accuracy/template.py +235 -0
  22. deepeval/metrics/hallucination/hallucination.py +8 -8
  23. deepeval/metrics/mcp/mcp_task_completion.py +7 -2
  24. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
  25. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
  26. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
  27. deepeval/metrics/plan_adherence/__init__.py +1 -0
  28. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  29. deepeval/metrics/plan_adherence/schema.py +11 -0
  30. deepeval/metrics/plan_adherence/template.py +170 -0
  31. deepeval/metrics/plan_quality/__init__.py +1 -0
  32. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  33. deepeval/metrics/plan_quality/schema.py +11 -0
  34. deepeval/metrics/plan_quality/template.py +101 -0
  35. deepeval/metrics/step_efficiency/__init__.py +1 -0
  36. deepeval/metrics/step_efficiency/schema.py +11 -0
  37. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  38. deepeval/metrics/step_efficiency/template.py +256 -0
  39. deepeval/metrics/task_completion/task_completion.py +1 -0
  40. deepeval/metrics/tool_correctness/schema.py +6 -0
  41. deepeval/metrics/tool_correctness/template.py +88 -0
  42. deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
  43. deepeval/metrics/tool_use/__init__.py +1 -0
  44. deepeval/metrics/tool_use/schema.py +19 -0
  45. deepeval/metrics/tool_use/template.py +220 -0
  46. deepeval/metrics/tool_use/tool_use.py +458 -0
  47. deepeval/metrics/topic_adherence/__init__.py +1 -0
  48. deepeval/metrics/topic_adherence/schema.py +16 -0
  49. deepeval/metrics/topic_adherence/template.py +162 -0
  50. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  51. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  52. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  53. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  54. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  55. deepeval/openai/extractors.py +61 -16
  56. deepeval/openai/patch.py +8 -12
  57. deepeval/openai/types.py +1 -1
  58. deepeval/openai/utils.py +108 -1
  59. deepeval/prompt/prompt.py +1 -0
  60. deepeval/prompt/utils.py +43 -14
  61. deepeval/synthesizer/synthesizer.py +11 -10
  62. deepeval/test_case/llm_test_case.py +6 -2
  63. deepeval/test_run/test_run.py +190 -207
  64. deepeval/tracing/__init__.py +2 -1
  65. deepeval/tracing/otel/exporter.py +3 -4
  66. deepeval/tracing/otel/utils.py +23 -4
  67. deepeval/tracing/trace_context.py +53 -38
  68. deepeval/tracing/tracing.py +23 -0
  69. deepeval/tracing/types.py +16 -14
  70. deepeval/utils.py +21 -0
  71. {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  72. {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/RECORD +75 -53
  73. deepeval/integrations/llama_index/agent/patched.py +0 -68
  74. deepeval/tracing/message_types/__init__.py +0 -10
  75. deepeval/tracing/message_types/base.py +0 -6
  76. deepeval/tracing/message_types/messages.py +0 -14
  77. deepeval/tracing/message_types/tools.py +0 -18
  78. {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  79. {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  80. {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
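The headline additions in 3.6.8 are six new agentic metric packages (goal accuracy, plan adherence, plan quality, step efficiency, tool use, topic adherence), a new CrewAI tool integration, and a large refactor of deepeval/test_run/test_run.py shown in the hunks below. As a rough sketch of how one of the new metric packages might plug into the existing evaluation flow — the metric class name is an assumption inferred from the module path, and the real export may differ or may require a conversational test case:

# Hedged sketch: evaluate() and LLMTestCase are existing deepeval APIs, but
# GoalAccuracyMetric is an assumed name inferred from deepeval/metrics/goal_accuracy/.
from deepeval import evaluate
from deepeval.test_case import LLMTestCase
from deepeval.metrics import GoalAccuracyMetric  # assumption: actual export may differ

test_case = LLMTestCase(
    input="Book a table for two tomorrow at 7pm.",
    actual_output="Done: table for two reserved tomorrow at 7:00 pm.",
)

evaluate(test_cases=[test_case], metrics=[GoalAccuracyMetric(threshold=0.7)])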
deepeval/test_run/test_run.py
@@ -35,12 +35,10 @@ from deepeval.constants import CONFIDENT_TEST_CASE_BATCH_SIZE, HIDDEN_DIR
 from deepeval.prompt import (
     PromptMessage,
     ModelSettings,
-    OutputType,
     PromptInterpolationType,
     OutputType,
 )
 from rich.panel import Panel
-from rich.text import Text
 from rich.columns import Columns
 
 
@@ -213,65 +211,91 @@ class TestRun(BaseModel):
         valid_scores = 0
 
         def process_metric_data(metric_data: MetricData):
+            """
+            Process and aggregate metric data for overall test metrics.
+
+            Args:
+                metric_data: The metric data to process
+            """
             nonlocal valid_scores
-            name = metric_data.name
+            metric_name = metric_data.name
             score = metric_data.score
             success = metric_data.success
-            # Initialize dict entry if needed.
-            if name not in metrics_dict:
-                metrics_dict[name] = {
+
+            if metric_name not in metrics_dict:
+                metrics_dict[metric_name] = {
                     "scores": [],
                     "passes": 0,
                     "fails": 0,
                     "errors": 0,
                 }
 
+            metric_dict = metrics_dict[metric_name]
+
             if score is None or success is None:
-                metrics_dict[name]["errors"] += 1
+                metric_dict["errors"] += 1
             else:
                 valid_scores += 1
-
-                # Append the score.
-                metrics_dict[name]["scores"].append(score)
-
-                # Increment passes or fails based on the metric_data.success flag.
+                metric_dict["scores"].append(score)
                 if success:
-                    metrics_dict[name]["passes"] += 1
+                    metric_dict["passes"] += 1
                 else:
-                    metrics_dict[name]["fails"] += 1
+                    metric_dict["fails"] += 1
 
         def process_span_metric_data(
-            metric_data: MetricData, type: span_api_type_literals, name: str
+            metric_data: MetricData,
+            span_type: span_api_type_literals,
+            span_name: str,
         ):
+            """
+            Process and aggregate metric data for a specific span.
+
+            Args:
+                metric_data: The metric data to process
+                span_type: The type of span (agent, tool, retriever, llm, base)
+                span_name: The name of the span
+            """
             metric_name = metric_data.name
             score = metric_data.score
             success = metric_data.success
 
-            # Initialize the structure if needed
-            if name not in trace_metrics_dict[type]:
-                trace_metrics_dict[type][name] = {}
+            if span_name not in trace_metrics_dict[span_type]:
+                trace_metrics_dict[span_type][span_name] = {}
 
-            if metric_name not in trace_metrics_dict[type][name]:
-                trace_metrics_dict[type][name][metric_name] = {
+            if metric_name not in trace_metrics_dict[span_type][span_name]:
+                trace_metrics_dict[span_type][span_name][metric_name] = {
                     "scores": [],
                     "passes": 0,
                     "fails": 0,
                     "errors": 0,
                 }
 
+            metric_dict = trace_metrics_dict[span_type][span_name][metric_name]
+
             if score is None or success is None:
-                trace_metrics_dict[type][name][metric_name]["errors"] += 1
+                metric_dict["errors"] += 1
             else:
-                # Append the score
-                trace_metrics_dict[type][name][metric_name]["scores"].append(
-                    score
-                )
-
-                # Increment passes or fails
+                metric_dict["scores"].append(score)
                 if success:
-                    trace_metrics_dict[type][name][metric_name]["passes"] += 1
+                    metric_dict["passes"] += 1
                 else:
-                    trace_metrics_dict[type][name][metric_name]["fails"] += 1
+                    metric_dict["fails"] += 1
+
+        def process_spans(spans, span_type: span_api_type_literals):
+            """
+            Process all metrics for a list of spans of a specific type.
+
+            Args:
+                spans: List of spans to process
+                span_type: The type of spans being processed
+            """
+            for span in spans:
+                if span.metrics_data is not None:
+                    for metric_data in span.metrics_data:
+                        process_metric_data(metric_data)
+                        process_span_metric_data(
+                            metric_data, span_type, span.name
+                        )
 
         # Process non-conversational test cases.
         for test_case in self.test_cases:
@@ -283,45 +307,14 @@ class TestRun(BaseModel):
             if test_case.trace is None:
                 continue
 
-            for span in test_case.trace.agent_spans:
-                if span.metrics_data is not None:
-                    for metric_data in span.metrics_data:
-                        process_metric_data(metric_data)
-                        process_span_metric_data(
-                            metric_data, SpanApiType.AGENT.value, span.name
-                        )
-
-            for span in test_case.trace.tool_spans:
-                if span.metrics_data is not None:
-                    for metric_data in span.metrics_data:
-                        process_metric_data(metric_data)
-                        process_span_metric_data(
-                            metric_data, SpanApiType.TOOL.value, span.name
-                        )
-
-            for span in test_case.trace.retriever_spans:
-                if span.metrics_data is not None:
-                    for metric_data in span.metrics_data:
-                        process_metric_data(metric_data)
-                        process_span_metric_data(
-                            metric_data, SpanApiType.RETRIEVER.value, span.name
-                        )
-
-            for span in test_case.trace.llm_spans:
-                if span.metrics_data is not None:
-                    for metric_data in span.metrics_data:
-                        process_metric_data(metric_data)
-                        process_span_metric_data(
-                            metric_data, SpanApiType.LLM.value, span.name
-                        )
-
-            for span in test_case.trace.base_spans:
-                if span.metrics_data is not None:
-                    for metric_data in span.metrics_data:
-                        process_metric_data(metric_data)
-                        process_span_metric_data(
-                            metric_data, SpanApiType.BASE.value, span.name
-                        )
+            # Process all span types using the helper function
+            process_spans(test_case.trace.agent_spans, SpanApiType.AGENT.value)
+            process_spans(test_case.trace.tool_spans, SpanApiType.TOOL.value)
+            process_spans(
+                test_case.trace.retriever_spans, SpanApiType.RETRIEVER.value
+            )
+            process_spans(test_case.trace.llm_spans, SpanApiType.LLM.value)
+            process_spans(test_case.trace.base_spans, SpanApiType.BASE.value)
 
         # Process conversational test cases.
         for convo_test_case in self.conversational_test_cases:
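The hunk above collapses five duplicated per-span-type loops into a single process_spans helper and caches the nested dictionary lookups. A standalone sketch of the same aggregation pattern, using simplified stand-in types (SimpleMetric and SimpleSpan are hypothetical, not deepeval classes; setdefault stands in for the explicit membership checks in the diff):

# Standalone sketch of the per-span metric aggregation pattern shown above.
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class SimpleMetric:
    name: str
    score: Optional[float]
    success: Optional[bool]

@dataclass
class SimpleSpan:
    name: str
    metrics_data: Optional[List[SimpleMetric]]

trace_metrics_dict: dict = {"agent": {}, "tool": {}, "retriever": {}, "llm": {}, "base": {}}

def process_span_metric_data(metric: SimpleMetric, span_type: str, span_name: str):
    # One bucket per (span type, span name, metric name); errors counted when score/success missing.
    bucket = trace_metrics_dict[span_type].setdefault(span_name, {}).setdefault(
        metric.name, {"scores": [], "passes": 0, "fails": 0, "errors": 0}
    )
    if metric.score is None or metric.success is None:
        bucket["errors"] += 1
    else:
        bucket["scores"].append(metric.score)
        bucket["passes" if metric.success else "fails"] += 1

def process_spans(spans: List[SimpleSpan], span_type: str):
    for span in spans:
        for metric in span.metrics_data or []:
            process_span_metric_data(metric, span_type, span.name)

process_spans([SimpleSpan("weather_tool", [SimpleMetric("Tool Correctness", 0.9, True)])], "tool")
print(trace_metrics_dict["tool"]["weather_tool"]["Tool Correctness"])
# -> {'scores': [0.9], 'passes': 1, 'fails': 0, 'errors': 0}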
@@ -554,105 +547,141 @@ class TestRunManager:
     def clear_test_run(self):
         self.test_run = None
 
-    def display_results_table(
-        self, test_run: TestRun, display: TestRunResultDisplay
-    ):
-        table = Table(title="Test Results")
-        table.add_column("Test case", justify="left")
-        table.add_column("Metric", justify="left")
-        table.add_column("Score", justify="left")
-        table.add_column("Status", justify="left")
-        table.add_column("Overall Success Rate", justify="left")
-
-        for index, test_case in enumerate(test_run.test_cases):
-            if test_case.metrics_data is None:
-                continue
+    @staticmethod
+    def _calculate_success_rate(pass_count: int, fail_count: int) -> str:
+        """Calculate success rate percentage or return error message."""
+        total = pass_count + fail_count
+        if total > 0:
+            return str(round((100 * pass_count) / total, 2))
+        return "Cannot display metrics for component-level evals, please run 'deepeval view' to see results on Confident AI."
+
+    @staticmethod
+    def _get_metric_status(metric_data: MetricData) -> str:
+        """Get formatted status string for a metric."""
+        if metric_data.error:
+            return "[red]ERRORED[/red]"
+        elif metric_data.success:
+            return "[green]PASSED[/green]"
+        return "[red]FAILED[/red]"
+
+    @staticmethod
+    def _format_metric_score(metric_data: MetricData) -> str:
+        """Format metric score with evaluation details."""
+        evaluation_model = metric_data.evaluation_model or "n/a"
+        metric_score = (
+            round(metric_data.score, 2)
+            if metric_data.score is not None
+            else None
+        )
 
-            if (
-                display == TestRunResultDisplay.PASSING
-                and test_case.success is False
-            ):
-                continue
-            elif display == TestRunResultDisplay.FAILING and test_case.success:
-                continue
+        return (
+            f"{metric_score} "
+            f"(threshold={metric_data.threshold}, "
+            f"evaluation model={evaluation_model}, "
+            f"reason={metric_data.reason}, "
+            f"error={metric_data.error})"
+        )
 
-            pass_count = 0
-            fail_count = 0
-            test_case_name = test_case.name
+    @staticmethod
+    def _should_skip_test_case(
+        test_case, display: TestRunResultDisplay
+    ) -> bool:
+        """Determine if test case should be skipped based on display filter."""
+        if display == TestRunResultDisplay.PASSING and not test_case.success:
+            return True
+        elif display == TestRunResultDisplay.FAILING and test_case.success:
+            return True
+        return False
+
+    @staticmethod
+    def _count_metric_results(
+        metrics_data: List[MetricData],
+    ) -> tuple[int, int]:
+        """Count passing and failing metrics."""
+        pass_count = 0
+        fail_count = 0
+        for metric_data in metrics_data:
+            if metric_data.success:
+                pass_count += 1
+            else:
+                fail_count += 1
+        return pass_count, fail_count
 
-            # TODO: recursively iterate through it to calculate pass and fail count
-            if test_case.trace:
-                pass
+    def _add_test_case_header_row(
+        self,
+        table: Table,
+        test_case_name: str,
+        pass_count: int,
+        fail_count: int,
+    ):
+        """Add test case header row with name and success rate."""
+        success_rate = self._calculate_success_rate(pass_count, fail_count)
+        table.add_row(
+            test_case_name,
+            *[""] * 3,
+            f"{success_rate}%",
+        )
 
-            for metric_data in test_case.metrics_data:
-                if metric_data.success:
-                    pass_count += 1
-                else:
-                    fail_count += 1
+    def _add_metric_rows(self, table: Table, metrics_data: List[MetricData]):
+        """Add metric detail rows to the table."""
+        for metric_data in metrics_data:
+            status = self._get_metric_status(metric_data)
+            formatted_score = self._format_metric_score(metric_data)
 
-            success_rate = (
-                round((100 * pass_count) / (pass_count + fail_count), 2)
-                if pass_count + fail_count > 0
-                else "Cannot display metrics for component-level evals, please run 'deepeval view' to see results on Confident AI."
-            )
             table.add_row(
-                test_case_name,
                 "",
+                str(metric_data.name),
+                formatted_score,
+                status,
                 "",
-                "",
-                f"{success_rate}%",
             )
 
-            for metric_data in test_case.metrics_data:
-                if metric_data.error:
-                    status = "[red]ERRORED[/red]"
-                elif metric_data.success:
-                    status = "[green]PASSED[/green]"
-                else:
-                    status = "[red]FAILED[/red]"
+    def _add_separator_row(self, table: Table):
+        """Add empty separator row between test cases."""
+        table.add_row(*[""] * len(table.columns))
 
-                evaluation_model = metric_data.evaluation_model
-                if evaluation_model is None:
-                    evaluation_model = "n/a"
+    def display_results_table(
+        self, test_run: TestRun, display: TestRunResultDisplay
+    ):
+        """Display test results in a formatted table."""
 
-                if metric_data.score is not None:
-                    metric_score = round(metric_data.score, 2)
-                else:
-                    metric_score = None
-
-                table.add_row(
-                    "",
-                    str(metric_data.name),
-                    f"{metric_score} (threshold={metric_data.threshold}, evaluation model={evaluation_model}, reason={metric_data.reason}, error={metric_data.error})",
-                    status,
-                    "",
-                )
+        table = Table(title="Test Results")
+        column_config = dict(justify="left")
+        column_names = [
+            "Test case",
+            "Metric",
+            "Score",
+            "Status",
+            "Overall Success Rate",
+        ]
 
-            if index is not len(self.test_run.test_cases) - 1:
-                table.add_row(
-                    "",
-                    "",
-                    "",
-                    "",
-                    "",
-                )
+        for name in column_names:
+            table.add_column(name, **column_config)
 
+        # Process regular test cases
+        for index, test_case in enumerate(test_run.test_cases):
+            if test_case.metrics_data is None or self._should_skip_test_case(
+                test_case, display
+            ):
+                continue
+            pass_count, fail_count = self._count_metric_results(
+                test_case.metrics_data
+            )
+            self._add_test_case_header_row(
+                table, test_case.name, pass_count, fail_count
+            )
+            self._add_metric_rows(table, test_case.metrics_data)
+
+            if index < len(test_run.test_cases) - 1:
+                self._add_separator_row(table)
+
+        # Process conversational test cases
         for index, conversational_test_case in enumerate(
            test_run.conversational_test_cases
         ):
-            if (
-                display == TestRunResultDisplay.PASSING
-                and conversational_test_case.success is False
-            ):
-                continue
-            elif (
-                display == TestRunResultDisplay.FAILING
-                and conversational_test_case.success
-            ):
+            if self._should_skip_test_case(conversational_test_case, display):
                 continue
 
-            pass_count = 0
-            fail_count = 0
             conversational_test_case_name = conversational_test_case.name
 
             if conversational_test_case.turns:
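The display_results_table refactor above splits the monolithic method into small helpers and builds the rich table via a column-name loop and filler cells. A minimal, self-contained illustration of that table-building pattern with rich (the sample rows below are invented):

# Minimal illustration of the table-building pattern used above, with invented sample data.
from rich.table import Table
from rich.console import Console

table = Table(title="Test Results")
for column_name in ["Test case", "Metric", "Score", "Status", "Overall Success Rate"]:
    table.add_column(column_name, justify="left")

# Header row for a test case: name, three filler cells, then the success rate.
table.add_row("test_chatbot", *[""] * 3, "100.0%")
# Detail row for one metric: filler cell, metric name, score details, status, filler cell.
table.add_row("", "Answer Relevancy", "0.92 (threshold=0.5)", "[green]PASSED[/green]", "")
# Separator row sized to the current column count.
table.add_row(*[""] * len(table.columns))

Console().print(table)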
@@ -713,71 +742,26 @@ class TestRunManager:
                 console.print(
                     f"[dim]No turns recorded for {conversational_test_case_name}.[/dim]"
                 )
-
             if conversational_test_case.metrics_data is not None:
-                for metric_data in conversational_test_case.metrics_data:
-                    if metric_data.success:
-                        pass_count += 1
-                    else:
-                        fail_count += 1
-                table.add_row(
-                    conversational_test_case_name,
-                    "",
-                    "",
-                    "",
-                    f"{round((100*pass_count)/(pass_count+fail_count),2)}%",
+                pass_count, fail_count = self._count_metric_results(
+                    conversational_test_case.metrics_data
                 )
-
-            if conversational_test_case.metrics_data is not None:
-                for metric_data in conversational_test_case.metrics_data:
-                    if metric_data.error:
-                        status = "[red]ERRORED[/red]"
-                    elif metric_data.success:
-                        status = "[green]PASSED[/green]"
-                    else:
-                        status = "[red]FAILED[/red]"
-
-                    evaluation_model = metric_data.evaluation_model
-                    if evaluation_model is None:
-                        evaluation_model = "n/a"
-
-                    if metric_data.score is not None:
-                        metric_score = round(metric_data.score, 2)
-                    else:
-                        metric_score = None
-
-                    table.add_row(
-                        "",
-                        str(metric_data.name),
-                        f"{metric_score} (threshold={metric_data.threshold}, evaluation model={evaluation_model}, reason={metric_data.reason}, error={metric_data.error})",
-                        status,
-                        "",
-                    )
-
-            if index is not len(self.test_run.conversational_test_cases) - 1:
-                table.add_row(
-                    "",
-                    "",
-                    "",
-                    "",
-                    "",
+                self._add_test_case_header_row(
+                    table, conversational_test_case.name, pass_count, fail_count
                 )
-
-            if index is not len(self.test_run.test_cases) - 1:
-                table.add_row(
-                    "",
-                    "",
-                    "",
-                    "",
-                    "",
+                self._add_metric_rows(
+                    table, conversational_test_case.metrics_data
                 )
 
+            if index < len(test_run.conversational_test_cases) - 1:
+                self._add_separator_row(table)
+
+            if index < len(test_run.test_cases) - 1:
+                self._add_separator_row(table)
+
         table.add_row(
             "[bold red]Note: Use Confident AI with DeepEval to analyze failed test cases for more details[/bold red]",
-            "",
-            "",
-            "",
-            "",
+            *[""] * (len(table.columns) - 1),
         )
         print(table)
 
@@ -970,7 +954,6 @@ class TestRunManager:
         global_test_run_cache_manager.disable_write_cache = not bool(
             get_is_running_deepeval()
         )
-
         global_test_run_cache_manager.wrap_up_cached_test_run()
 
         if display_table:
deepeval/tracing/__init__.py
@@ -4,7 +4,7 @@ from .context import (
     update_retriever_span,
     update_llm_span,
 )
-from .trace_context import trace
+from .trace_context import trace, LlmSpanContext
 from .types import BaseSpan, Trace
 from .tracing import observe, trace_manager
 from .offline_evals import evaluate_thread, evaluate_trace, evaluate_span
@@ -14,6 +14,7 @@ __all__ = [
     "update_current_trace",
     "update_retriever_span",
     "update_llm_span",
+    "LlmSpanContext",
     "BaseSpan",
     "Trace",
     "observe",
deepeval/tracing/otel/exporter.py
@@ -254,7 +254,7 @@ class ConfidentSpanExporter(SpanExporter):
 
         base_span = None
         try:
-            base_span = self.__prepare_boilerplate_base_span(span)
+            base_span = self.prepare_boilerplate_base_span(span)
         except Exception:
             pass
 
@@ -453,9 +453,8 @@ class ConfidentSpanExporter(SpanExporter):
         if span_output:
             base_span.output = span_output
 
-    def __prepare_boilerplate_base_span(
-        self, span: ReadableSpan
-    ) -> Optional[BaseSpan]:
+    @staticmethod
+    def prepare_boilerplate_base_span(span: ReadableSpan) -> Optional[BaseSpan]:
 
         ################ Get Span Type ################
         span_type = span.attributes.get("confident.span.type")
deepeval/tracing/otel/utils.py
@@ -11,7 +11,7 @@ from deepeval.tracing import trace_manager, BaseSpan
 from deepeval.tracing.utils import make_json_serializable
 
 
-GEN_AI_OPERATION_NAMES = ["chat", "generate_content", "task_completion"]
+GEN_AI_OPERATION_NAMES = ["chat", "generate_content", "text_completion"]
 
 
 def to_hex_string(id_value: int | bytes, length: int = 32) -> str:
@@ -128,6 +128,10 @@ def check_llm_input_from_gen_ai_attributes(
 
         input = system_instructions + input_messages
 
+        model_parameters = check_model_parameters(span)
+        if model_parameters:
+            input.append(model_parameters)
+
     except Exception:
         pass
     try:
@@ -413,7 +417,7 @@ def post_test_run(traces: List[Trace], test_run_id: Optional[str]):
     # return test_run_manager.post_test_run(test_run) TODO: add after test run with metric collection is implemented
 
 
-def _normalize_pydantic_ai_messages(span: ReadableSpan) -> Optional[list]:
+def normalize_pydantic_ai_messages(span: ReadableSpan) -> Optional[list]:
     try:
         raw = span.attributes.get("pydantic_ai.all_messages")
         if not raw:
@@ -438,7 +442,7 @@ def _normalize_pydantic_ai_messages(span: ReadableSpan) -> Optional[list]:
     except Exception:
         pass
 
-    return None
+    return []
 
 
 def _extract_non_thinking_part_of_last_message(message: dict) -> dict:
@@ -461,7 +465,7 @@ def check_pydantic_ai_agent_input_output(
     output_val: Optional[Any] = None
 
     # Get normalized messages once
-    normalized = _normalize_pydantic_ai_messages(span)
+    normalized = normalize_pydantic_ai_messages(span)
 
     # Input (pydantic_ai.all_messages) - slice up to and including the first 'user' message
     if normalized:
@@ -523,3 +527,18 @@ def check_pydantic_ai_trace_input_output(
     input_val, output_val = check_pydantic_ai_agent_input_output(span)
 
     return input_val, output_val
+
+
+def check_model_parameters(span: ReadableSpan) -> Optional[dict]:
+    try:
+        raw_model_parameters = span.attributes.get("model_request_parameters")
+        if raw_model_parameters and isinstance(raw_model_parameters, str):
+            model_parameters = json.loads(raw_model_parameters)
+            if isinstance(model_parameters, dict):
+                return {
+                    "role": "Model Request Parameters",
+                    "content": model_parameters,
+                }
+    except Exception:
+        pass
+    return None
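The new check_model_parameters helper defensively parses a JSON-encoded span attribute and wraps it as an extra input message. A standalone sketch of the same parse-and-guard pattern using only the standard library (the sample attributes dict stands in for span.attributes):

# Standalone sketch of the defensive JSON-attribute parsing pattern used by
# check_model_parameters; the sample attributes dict stands in for span.attributes.
import json
from typing import Optional

def parse_model_request_parameters(attributes: dict) -> Optional[dict]:
    try:
        raw = attributes.get("model_request_parameters")
        if raw and isinstance(raw, str):
            parsed = json.loads(raw)
            if isinstance(parsed, dict):
                return {"role": "Model Request Parameters", "content": parsed}
    except Exception:
        # Malformed JSON (or any other failure) is swallowed, mirroring the code above.
        pass
    return None

attributes = {"model_request_parameters": json.dumps({"temperature": 0.2, "max_tokens": 256})}
print(parse_model_request_parameters(attributes))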