deepeval 3.6.5__py3-none-any.whl → 3.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. deepeval/__init__.py +42 -10
  2. deepeval/_version.py +1 -1
  3. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  4. deepeval/cli/main.py +42 -0
  5. deepeval/confident/api.py +1 -0
  6. deepeval/config/logging.py +33 -0
  7. deepeval/config/settings.py +176 -16
  8. deepeval/constants.py +8 -1
  9. deepeval/dataset/dataset.py +2 -11
  10. deepeval/dataset/utils.py +1 -1
  11. deepeval/evaluate/evaluate.py +5 -1
  12. deepeval/evaluate/execute.py +118 -60
  13. deepeval/evaluate/utils.py +20 -116
  14. deepeval/integrations/crewai/__init__.py +6 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/wrapper.py +45 -5
  18. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  19. deepeval/metrics/api.py +281 -0
  20. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  21. deepeval/metrics/bias/bias.py +12 -3
  22. deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
  23. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  24. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  25. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  26. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  27. deepeval/metrics/conversational_dag/nodes.py +12 -4
  28. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
  29. deepeval/metrics/dag/dag.py +12 -0
  30. deepeval/metrics/dag/nodes.py +12 -4
  31. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  32. deepeval/metrics/g_eval/g_eval.py +37 -15
  33. deepeval/metrics/hallucination/hallucination.py +12 -1
  34. deepeval/metrics/indicator.py +8 -2
  35. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  36. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  37. deepeval/metrics/mcp/mcp_task_completion.py +13 -0
  38. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
  39. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
  40. deepeval/metrics/misuse/misuse.py +12 -1
  41. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  42. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  43. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  44. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  45. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  46. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
  47. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  48. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  49. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  50. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  51. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  52. deepeval/metrics/non_advice/non_advice.py +12 -0
  53. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  54. deepeval/metrics/prompt_alignment/prompt_alignment.py +53 -24
  55. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  56. deepeval/metrics/role_violation/role_violation.py +12 -0
  57. deepeval/metrics/summarization/summarization.py +12 -1
  58. deepeval/metrics/task_completion/task_completion.py +3 -0
  59. deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
  60. deepeval/metrics/toxicity/toxicity.py +12 -0
  61. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  62. deepeval/models/llms/grok_model.py +1 -1
  63. deepeval/models/llms/openai_model.py +2 -0
  64. deepeval/models/retry_policy.py +202 -11
  65. deepeval/openai/__init__.py +14 -32
  66. deepeval/openai/extractors.py +24 -34
  67. deepeval/openai/patch.py +256 -161
  68. deepeval/openai/types.py +20 -0
  69. deepeval/openai/utils.py +98 -56
  70. deepeval/prompt/__init__.py +19 -1
  71. deepeval/prompt/api.py +160 -0
  72. deepeval/prompt/prompt.py +244 -62
  73. deepeval/prompt/utils.py +144 -2
  74. deepeval/synthesizer/chunking/context_generator.py +209 -152
  75. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  76. deepeval/synthesizer/synthesizer.py +8 -5
  77. deepeval/test_case/api.py +131 -0
  78. deepeval/test_run/__init__.py +1 -0
  79. deepeval/test_run/hyperparameters.py +47 -8
  80. deepeval/test_run/test_run.py +104 -1
  81. deepeval/tracing/api.py +3 -1
  82. deepeval/tracing/message_types/__init__.py +10 -0
  83. deepeval/tracing/message_types/base.py +6 -0
  84. deepeval/tracing/message_types/messages.py +14 -0
  85. deepeval/tracing/message_types/tools.py +18 -0
  86. deepeval/tracing/otel/exporter.py +0 -6
  87. deepeval/tracing/otel/utils.py +58 -8
  88. deepeval/tracing/trace_context.py +73 -4
  89. deepeval/tracing/trace_test_manager.py +19 -0
  90. deepeval/tracing/tracing.py +52 -4
  91. deepeval/tracing/types.py +16 -0
  92. deepeval/tracing/utils.py +8 -0
  93. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
  94. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/RECORD +97 -87
  95. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
  96. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
  97. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
deepeval/test_case/api.py ADDED
@@ -0,0 +1,131 @@
+ from typing import Union, Optional
+ import os
+
+ from deepeval.test_run.api import (
+     LLMApiTestCase,
+     ConversationalApiTestCase,
+     TurnApi,
+     TraceApi,
+ )
+ from deepeval.test_case import (
+     LLMTestCase,
+     ConversationalTestCase,
+     MLLMTestCase,
+     Turn,
+ )
+ from deepeval.constants import PYTEST_RUN_TEST_NAME
+
+
+ def create_api_turn(turn: Turn, index: int) -> TurnApi:
+     return TurnApi(
+         role=turn.role,
+         content=turn.content,
+         user_id=turn.user_id,
+         retrievalContext=turn.retrieval_context,
+         toolsCalled=turn.tools_called,
+         additionalMetadata=turn.additional_metadata,
+         order=index,
+     )
+
+
+ def create_api_test_case(
+     test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
+     trace: Optional[TraceApi] = None,
+     index: Optional[int] = None,
+ ) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
+     if isinstance(test_case, ConversationalTestCase):
+         order = (
+             test_case._dataset_rank
+             if test_case._dataset_rank is not None
+             else index
+         )
+         if test_case.name:
+             name = test_case.name
+         else:
+             name = os.getenv(
+                 PYTEST_RUN_TEST_NAME, f"conversational_test_case_{order}"
+             )
+
+         api_test_case = ConversationalApiTestCase(
+             name=name,
+             success=True,
+             metricsData=[],
+             runDuration=0,
+             evaluationCost=None,
+             order=order,
+             scenario=test_case.scenario,
+             expectedOutcome=test_case.expected_outcome,
+             userDescription=test_case.user_description,
+             context=test_case.context,
+             tags=test_case.tags,
+             comments=test_case.comments,
+             additionalMetadata=test_case.additional_metadata,
+         )
+         api_test_case.turns = [
+             create_api_turn(
+                 turn=turn,
+                 index=index,
+             )
+             for index, turn in enumerate(test_case.turns)
+         ]
+
+         return api_test_case
+     else:
+         order = (
+             test_case._dataset_rank
+             if test_case._dataset_rank is not None
+             else index
+         )
+
+         success = True
+         if test_case.name is not None:
+             name = test_case.name
+         else:
+             name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
+         metrics_data = []
+
+         if isinstance(test_case, LLMTestCase):
+             api_test_case = LLMApiTestCase(
+                 name=name,
+                 input=test_case.input,
+                 actualOutput=test_case.actual_output,
+                 expectedOutput=test_case.expected_output,
+                 context=test_case.context,
+                 retrievalContext=test_case.retrieval_context,
+                 toolsCalled=test_case.tools_called,
+                 expectedTools=test_case.expected_tools,
+                 tokenCost=test_case.token_cost,
+                 completionTime=test_case.completion_time,
+                 tags=test_case.tags,
+                 success=success,
+                 metricsData=metrics_data,
+                 runDuration=None,
+                 evaluationCost=None,
+                 order=order,
+                 additionalMetadata=test_case.additional_metadata,
+                 comments=test_case.comments,
+                 trace=trace,
+             )
+         elif isinstance(test_case, MLLMTestCase):
+             api_test_case = LLMApiTestCase(
+                 name=name,
+                 input="",
+                 multimodalInput=test_case.input,
+                 multimodalActualOutput=test_case.actual_output,
+                 multimodalExpectedOutput=test_case.expected_output,
+                 multimodalRetrievalContext=test_case.retrieval_context,
+                 multimodalContext=test_case.context,
+                 toolsCalled=test_case.tools_called,
+                 expectedTools=test_case.expected_tools,
+                 tokenCost=test_case.token_cost,
+                 completionTime=test_case.completion_time,
+                 success=success,
+                 metricsData=metrics_data,
+                 runDuration=None,
+                 evaluationCost=None,
+                 order=order,
+                 additionalMetadata=test_case.additional_metadata,
+                 comments=test_case.comments,
+             )
+         # llm_test_case_lookup_map[instance_id] = api_test_case
+         return api_test_case
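
The hunk above introduces `deepeval/test_case/api.py`, which now owns the test-case-to-API-payload conversion previously imported from `deepeval.evaluate.utils` (see the corresponding import change in `deepeval/tracing/otel/utils.py` below). A rough usage sketch, not taken from the package, with illustrative field values:

```python
from deepeval.test_case import LLMTestCase
from deepeval.test_case.api import create_api_test_case

# Illustrative test case; any LLMTestCase works the same way.
test_case = LLMTestCase(
    input="What is DeepEval?",
    actual_output="DeepEval is an open-source LLM evaluation framework.",
)

# Returns an LLMApiTestCase; `index` is the fallback ordering used when the
# test case carries no dataset rank, and the name falls back to
# PYTEST_RUN_TEST_NAME or f"test_case_{order}".
api_test_case = create_api_test_case(test_case, trace=None, index=0)
print(api_test_case.name)
```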
deepeval/test_run/__init__.py CHANGED
@@ -8,6 +8,7 @@ from .test_run import (
      LLMApiTestCase,
      ConversationalApiTestCase,
      TestRunManager,
+     PromptData,
  )
  
  from .hooks import on_test_run_end, invoke_test_run_end_hook
deepeval/test_run/hyperparameters.py CHANGED
@@ -1,13 +1,15 @@
- from typing import Union, Dict
-
+ from typing import Union, Dict, Optional, List
  from deepeval.test_run import global_test_run_manager
  from deepeval.prompt import Prompt
  from deepeval.prompt.api import PromptApi
  from deepeval.test_run.test_run import TEMP_FILE_PATH
+ from deepeval.confident.api import is_confident
+ from deepeval.test_run.test_run import PromptData
  
  
  def process_hyperparameters(
-     hyperparameters,
+     hyperparameters: Optional[Dict] = None,
+     verbose: bool = True,
  ) -> Union[Dict[str, Union[str, int, float, PromptApi]], None]:
      if hyperparameters is None:
          return None
@@ -16,6 +18,7 @@ def process_hyperparameters(
          raise TypeError("Hyperparameters must be a dictionary or None")
  
      processed_hyperparameters = {}
+     prompts_version_id_map = {}
  
      for key, value in hyperparameters.items():
          if not isinstance(key, str):
@@ -30,14 +33,21 @@
              )
  
          if isinstance(value, Prompt):
-             if value._prompt_version_id is not None and value._type is not None:
+             prompt_key = f"{value.alias}_{value.version}"
+             if value._prompt_version_id is not None and value.type is not None:
                  processed_hyperparameters[key] = PromptApi(
                      id=value._prompt_version_id,
-                     type=value._type,
+                     type=value.type,
                  )
-             else:
-                 raise ValueError(
-                     f"Cannot log Prompt where template was not pulled from Confident AI. Please import your prompt on Confident AI to continue."
+             elif is_confident():
+                 if prompt_key not in prompts_version_id_map:
+                     value.push(_verbose=verbose)
+                     prompts_version_id_map[prompt_key] = (
+                         value._prompt_version_id
+                     )
+                 processed_hyperparameters[key] = PromptApi(
+                     id=prompts_version_id_map[prompt_key],
+                     type=value.type,
                  )
          else:
              processed_hyperparameters[key] = str(value)
@@ -64,3 +74,32 @@ def log_hyperparameters(func):
  
      # Return the wrapper function to be used as the decorator
      return wrapper
+
+
+ def process_prompts(
+     hyperparameters: Dict[str, Union[str, int, float, Prompt]],
+ ) -> List[PromptData]:
+     prompts = []
+     if not hyperparameters:
+         return prompts
+     seen_prompts = set()
+     prompt_objects = [
+         value for value in hyperparameters.values() if isinstance(value, Prompt)
+     ]
+     for prompt in prompt_objects:
+         prompt_version = prompt.version if is_confident() else None
+         prompt_key = f"{prompt.alias}_{prompt_version}"
+         if prompt_key in seen_prompts:
+             continue
+         seen_prompts.add(prompt_key)
+         prompt_data = PromptData(
+             alias=prompt.alias,
+             version=prompt_version,
+             text_template=prompt.text_template,
+             messages_template=prompt.messages_template,
+             model_settings=prompt.model_settings,
+             output_type=prompt.output_type,
+             interpolation_type=prompt.interpolation_type,
+         )
+         prompts.append(prompt_data)
+     return prompts
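
With these changes, `process_hyperparameters` pushes a `Prompt` that was never pulled from Confident AI (deduplicating by alias and version) instead of raising, and the new `process_prompts` turns `Prompt` hyperparameters into `PromptData` records for the test run. A hedged sketch of calling the two helpers directly; the alias and hyperparameter values are placeholders and this has not been executed against 3.6.7:

```python
from deepeval.prompt import Prompt
from deepeval.test_run.hyperparameters import (
    process_hyperparameters,
    process_prompts,
)

# Placeholder prompt; in practice it is usually pulled from Confident AI so
# that a version id is already attached.
prompt = Prompt(alias="summarizer")

hyperparameters = {"model": "gpt-4o", "temperature": 0.7, "prompt": prompt}

# Prompt values become PromptApi entries (pushed first when logged in to
# Confident AI); all other values are stringified.
processed = process_hyperparameters(hyperparameters, verbose=False)

# One PromptData record per unique (alias, version) pair.
prompt_data = process_prompts(hyperparameters)
```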
deepeval/test_run/test_run.py CHANGED
@@ -32,6 +32,17 @@ from deepeval.utils import (
  )
  from deepeval.test_run.cache import global_test_run_cache_manager
  from deepeval.constants import CONFIDENT_TEST_CASE_BATCH_SIZE, HIDDEN_DIR
+ from deepeval.prompt import (
+     PromptMessage,
+     ModelSettings,
+     OutputType,
+     PromptInterpolationType,
+     OutputType,
+ )
+ from rich.panel import Panel
+ from rich.text import Text
+ from rich.columns import Columns
+
  
  TEMP_FILE_PATH = f"{HIDDEN_DIR}/.temp_test_run_data.json"
  LATEST_TEST_RUN_FILE_PATH = f"{HIDDEN_DIR}/.latest_test_run.json"
@@ -71,6 +82,16 @@ class TraceMetricScores(BaseModel):
      base: Dict[str, Dict[str, MetricScores]] = Field(default_factory=dict)
  
  
+ class PromptData(BaseModel):
+     alias: Optional[str] = None
+     version: Optional[str] = None
+     text_template: Optional[str] = None
+     messages_template: Optional[List[PromptMessage]] = None
+     model_settings: Optional[ModelSettings] = None
+     output_type: Optional[OutputType] = None
+     interpolation_type: Optional[PromptInterpolationType] = None
+
+
  class MetricsAverageDict:
      def __init__(self):
          self.metric_dict = {}
@@ -123,6 +144,7 @@ class TestRun(BaseModel):
      )
      identifier: Optional[str] = None
      hyperparameters: Optional[Dict[str, Any]] = Field(None)
+     prompts: Optional[List[PromptData]] = Field(None)
      test_passed: Optional[int] = Field(None, alias="testPassed")
      test_failed: Optional[int] = Field(None, alias="testFailed")
      run_duration: float = Field(0.0, alias="runDuration")
@@ -799,6 +821,7 @@ class TestRunManager:
              test_run.test_cases = initial_batch
  
          try:
+             test_run.prompts = None
              body = test_run.model_dump(by_alias=True, exclude_none=True)
          except AttributeError:
              # Pydantic version below 2.0
@@ -953,6 +976,23 @@ class TestRunManager:
          if display_table:
              self.display_results_table(test_run, display)
  
+         if test_run.hyperparameters is None:
+             console.print(
+                 "\n[bold yellow]⚠ WARNING:[/bold yellow] No hyperparameters logged.\n"
+                 "» [bold blue][link=https://deepeval.com/docs/evaluation-prompts]Log hyperparameters[/link][/bold blue] to attribute prompts and models to your test runs.\n\n"
+                 + "=" * 80
+             )
+         else:
+             if not test_run.prompts:
+                 console.print(
+                     "\n[bold yellow]⚠ WARNING:[/bold yellow] No prompts logged.\n"
+                     "» [bold blue][link=https://deepeval.com/docs/evaluation-prompts]Log prompts[/link][/bold blue] to evaluate and optimize your prompt templates and models.\n\n"
+                     + "=" * 80
+                 )
+             else:
+                 console.print("\n[bold green]✓ Prompts Logged[/bold green]\n")
+                 self._render_prompts_panels(prompts=test_run.prompts)
+
          self.save_test_run_locally()
          delete_file_if_exists(self.temp_file_path)
          if is_confident() and self.disable_request is False:
@@ -967,7 +1007,7 @@
              f"» Test Results ({test_run.test_passed + test_run.test_failed} total tests):\n",
              f" » Pass Rate: {round((test_run.test_passed / (test_run.test_passed + test_run.test_failed)) * 100, 2)}% | Passed: [bold green]{test_run.test_passed}[/bold green] | Failed: [bold red]{test_run.test_failed}[/bold red]\n\n",
              "=" * 80,
-             "\n\n» What to share evals with your team, or a place for your test cases to live? ❤️ 🏡\n"
+             "\n\n» Want to share evals with your team, or a place for your test cases to live? ❤️ 🏡\n"
              " » Run [bold]'deepeval view'[/bold] to analyze and save testing results on [rgb(106,0,255)]Confident AI[/rgb(106,0,255)].\n\n",
          )
  
@@ -993,5 +1033,68 @@
              pass
          return None
  
+     def _render_prompts_panels(self, prompts: List[PromptData]) -> None:
+
+         def format_string(
+             v, default="[dim]None[/dim]", color: Optional[str] = None
+         ):
+             formatted_string = str(v) if v not in (None, "", []) else default
+             return (
+                 f"{formatted_string}"
+                 if color is None or v in (None, "", [])
+                 else f"[{color}]{formatted_string}[/]"
+             )
+
+         panels = []
+         for prompt in prompts:
+             lines = []
+             p_type = (
+                 "messages"
+                 if prompt.messages_template
+                 else ("text" if prompt.text_template else "—")
+             )
+             if p_type:
+                 lines.append(f"type: {format_string(p_type, color='blue')}")
+             if prompt.output_type:
+                 lines.append(
+                     f"output_type: {format_string(prompt.output_type, color='blue')}"
+                 )
+             if prompt.interpolation_type:
+                 lines.append(
+                     f"interpolation_type: {format_string(prompt.interpolation_type, color='blue')}"
+                 )
+             if prompt.model_settings:
+                 ms = prompt.model_settings
+                 settings_lines = [
+                     "Model Settings:",
+                     f" – provider: {format_string(ms.provider, color='green')}",
+                     f" – name: {format_string(ms.name, color='green')}",
+                     f" – temperature: {format_string(ms.temperature, color='green')}",
+                     f" – max_tokens: {format_string(ms.max_tokens, color='green')}",
+                     f" – top_p: {format_string(ms.top_p, color='green')}",
+                     f" – frequency_penalty: {format_string(ms.frequency_penalty, color='green')}",
+                     f" – presence_penalty: {format_string(ms.presence_penalty, color='green')}",
+                     f" – stop_sequence: {format_string(ms.stop_sequence, color='green')}",
+                     f" – reasoning_effort: {format_string(ms.reasoning_effort, color='green')}",
+                     f" – verbosity: {format_string(ms.verbosity, color='green')}",
+                 ]
+                 lines.append("")
+                 lines.extend(settings_lines)
+             title = f"{format_string(prompt.alias)}"
+             if prompt.version:
+                 title += f" (v{prompt.version})"
+             body = "\n".join(lines)
+             panel = Panel(
+                 body,
+                 title=title,
+                 title_align="left",
+                 expand=False,
+                 padding=(1, 6, 1, 2),
+             )
+             panels.append(panel)
+
+         if panels:
+             console.print(Columns(panels, equal=False, expand=False))
+
  
  global_test_run_manager = TestRunManager()
deepeval/tracing/api.py CHANGED
@@ -1,6 +1,6 @@
  from enum import Enum
  from typing import Dict, List, Optional, Union, Literal, Any
- from pydantic import BaseModel, Field
+ from pydantic import BaseModel, ConfigDict, Field
  
  from deepeval.test_case import ToolCall
  
@@ -27,6 +27,8 @@ class PromptApi(BaseModel):
  
  
  class MetricData(BaseModel):
+     model_config = ConfigDict(extra="ignore")
+
      name: str
      threshold: float
      success: bool
deepeval/tracing/message_types/__init__.py ADDED
@@ -0,0 +1,10 @@
+ from .messages import TextMessage, ToolCallMessage
+ from .tools import BaseTool, ToolSchema, ToolOutput
+
+ __all__ = [
+     "BaseTool",
+     "TextMessage",
+     "ToolCallMessage",
+     "ToolSchema",
+     "ToolOutput",
+ ]
deepeval/tracing/message_types/base.py ADDED
@@ -0,0 +1,6 @@
+ from typing import Literal
+ from pydantic import BaseModel
+
+
+ class BaseMessage(BaseModel):
+     role: Literal["user", "assistant"]
deepeval/tracing/message_types/messages.py ADDED
@@ -0,0 +1,14 @@
+ from typing import Literal, Dict, Any
+ from .base import BaseMessage
+
+
+ class TextMessage(BaseMessage):
+     type: Literal["text", "thinking"]
+     content: str
+
+
+ class ToolCallMessage(BaseMessage):
+     """This is a message for tool calls in response.choices[0].message.tool_calls"""
+
+     name: str
+     args: Dict[str, Any]
deepeval/tracing/message_types/tools.py ADDED
@@ -0,0 +1,18 @@
+ from typing import Any, Optional, Dict
+ from pydantic import BaseModel
+
+
+ class BaseTool(BaseModel):
+     name: str
+     description: Optional[str] = None
+
+
+ class ToolSchema(BaseTool):
+     parameters: Dict[str, Any]
+     is_called: Optional[bool] = False
+
+
+ class ToolOutput(BaseTool):
+     """Output of the tool function"""
+
+     output: Any
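
The three new `message_types` modules give the tracing layer small pydantic models for normalized chat messages and tool activity. Since the models are fully shown above, here is a hand-built illustration (names and values are hypothetical):

```python
from deepeval.tracing.message_types import (
    TextMessage,
    ToolCallMessage,
    ToolSchema,
    ToolOutput,
)

# A plain assistant text part.
message = TextMessage(role="assistant", type="text", content="Hello!")

# A tool call as it appears in response.choices[0].message.tool_calls.
call = ToolCallMessage(role="assistant", name="get_weather", args={"city": "Paris"})

# The tool's schema (is_called marks whether the model invoked it) and its output.
schema = ToolSchema(name="get_weather", parameters={"type": "object"}, is_called=True)
result = ToolOutput(name="get_weather", output={"temp_c": 21})
```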
deepeval/tracing/otel/exporter.py CHANGED
@@ -90,12 +90,6 @@ class ConfidentSpanExporter(SpanExporter):
          api_key: Optional[str] = None,  # dynamic api key,
          _test_run_id: Optional[str] = None,
      ) -> SpanExportResult:
-         # build forest of spans
-         # for span in spans:
-         #     print("--------------------------------")
-         #     print(span.to_json())
-         #     print("--------------------------------")
-         # return SpanExportResult.SUCCESS
  
          ################ Build Forest of Spans ################
          forest = self._build_span_forest(spans)
deepeval/tracing/otel/utils.py CHANGED
@@ -3,7 +3,7 @@ import json
  from typing import List, Optional, Tuple, Any
  from opentelemetry.sdk.trace.export import ReadableSpan
  
- from deepeval.evaluate.utils import create_api_test_case
+ from deepeval.test_case.api import create_api_test_case
  from deepeval.test_run.api import LLMApiTestCase
  from deepeval.test_run.test_run import global_test_run_manager
  from deepeval.tracing.types import Trace, LLMTestCase, ToolCall
@@ -109,8 +109,24 @@ def check_llm_input_from_gen_ai_attributes(
      input = None
      output = None
      try:
-         input = json.loads(span.attributes.get("gen_ai.input.messages"))
-         input = _flatten_input(input)
+         # check for system instructions
+         system_instructions = []
+         system_instructions_raw = span.attributes.get(
+             "gen_ai.system_instructions"
+         )
+         if system_instructions_raw and isinstance(system_instructions_raw, str):
+             system_instructions_json = json.loads(system_instructions_raw)
+             system_instructions = _flatten_system_instructions(
+                 system_instructions_json
+             )
+
+         input_messages = []
+         input_messages_raw = span.attributes.get("gen_ai.input.messages")
+         if input_messages_raw and isinstance(input_messages_raw, str):
+             input_messages_json = json.loads(input_messages_raw)
+             input_messages = _flatten_input(input_messages_json)
+
+         input = system_instructions + input_messages
  
      except Exception:
          pass
@@ -137,6 +153,20 @@
      return input, output
  
  
+ def _flatten_system_instructions(system_instructions: list) -> list:
+     if isinstance(system_instructions, list):
+         for system_instruction in system_instructions:
+             if isinstance(system_instruction, dict):
+                 role = system_instruction.get("role")
+                 if not role:
+                     system_instruction["role"] = "System Instruction"
+         return _flatten_input(system_instructions)
+     elif isinstance(system_instructions, str):
+         return [{"role": "System Instruction", "content": system_instructions}]
+
+     return []
+
+
  def _flatten_input(input: list) -> list:
      if input and isinstance(input, list):
          try:
@@ -411,10 +441,23 @@ def _normalize_pydantic_ai_messages(span: ReadableSpan) -> Optional[list]:
      return None
  
  
+ def _extract_non_thinking_part_of_last_message(message: dict) -> dict:
+
+     if isinstance(message, dict) and message.get("role") == "assistant":
+         parts = message.get("parts")
+         if parts:
+             # Iterate from the last part
+             for part in reversed(parts):
+                 if isinstance(part, dict) and part.get("type") == "text":
+                     # Return a modified message with only the text content
+                     return {"role": "assistant", "content": part.get("content")}
+     return None
+
+
  def check_pydantic_ai_agent_input_output(
      span: ReadableSpan,
  ) -> Tuple[Optional[Any], Optional[Any]]:
-     input_val: Optional[Any] = None
+     input_val: list = []
      output_val: Optional[Any] = None
  
      # Get normalized messages once
@@ -445,14 +488,21 @@
          if span.attributes.get("confident.span.type") == "agent":
              output_val = span.attributes.get("final_result")
              if not output_val and normalized:
-                 # Extract the last message if no final_result is available
-                 output_val = normalized[-1]
+                 output_val = _extract_non_thinking_part_of_last_message(
+                     normalized[-1]
+                 )
      except Exception:
          pass
  
+     system_instructions = []
+     system_instruction_raw = span.attributes.get("gen_ai.system_instructions")
+     if system_instruction_raw and isinstance(system_instruction_raw, str):
+         system_instructions = _flatten_system_instructions(
+             json.loads(system_instruction_raw)
+         )
+
      input_val = _flatten_input(input_val)
-     output_val = _flatten_input(output_val)
-     return input_val, output_val
+     return system_instructions + input_val, output_val
  
  
  def check_tool_output(span: ReadableSpan):
deepeval/tracing/trace_context.py CHANGED
@@ -1,14 +1,83 @@
- from .context import current_trace_context
- from .tracing import trace_manager
+ from typing import Optional, List, Dict, Any
+ from contextvars import ContextVar
  from contextlib import contextmanager
+ from dataclasses import dataclass
+
+ from .tracing import trace_manager
+ from .context import current_trace_context, update_current_trace
+ from deepeval.prompt import Prompt
+ from deepeval.metrics import BaseMetric
+ from deepeval.test_case.llm_test_case import ToolCall
+
+
+ @dataclass
+ class LlmContext:
+     prompt: Optional[Prompt] = None
+     metrics: Optional[List[BaseMetric]] = None
+     metric_collection: Optional[str] = None
+     expected_output: Optional[str] = None
+     expected_tools: Optional[List[ToolCall]] = None
+     context: Optional[List[str]] = None
+     retrieval_context: Optional[List[str]] = None
+
+
+ current_llm_context: ContextVar[Optional[LlmContext]] = ContextVar(
+     "current_llm_context", default=LlmContext()
+ )
  
  
  @contextmanager
- def trace():
+ def trace(
+     prompt: Optional[Prompt] = None,
+     llm_metrics: Optional[List[BaseMetric]] = None,
+     llm_metric_collection: Optional[str] = None,
+     name: Optional[str] = None,
+     tags: Optional[List[str]] = None,
+     metadata: Optional[Dict[str, Any]] = None,
+     user_id: Optional[str] = None,
+     thread_id: Optional[str] = None,
+     expected_output: Optional[str] = None,
+     expected_tools: Optional[List[ToolCall]] = None,
+     context: Optional[List[str]] = None,
+     retrieval_context: Optional[List[str]] = None,
+     trace_metric_collection: Optional[str] = None,
+     trace_metrics: Optional[List[BaseMetric]] = None,
+ ):
      current_trace = current_trace_context.get()
  
      if not current_trace:
          current_trace = trace_manager.start_new_trace()
-         current_trace_context.set(current_trace)
+
+     if trace_metrics:
+         current_trace.metrics = trace_metrics
+
+     if trace_metric_collection:
+         current_trace.metric_collection = trace_metric_collection
+
+     current_trace_context.set(current_trace)
+
+     current_llm_context.set(
+         LlmContext(
+             prompt=prompt,
+             metrics=llm_metrics,
+             metric_collection=llm_metric_collection,
+             expected_output=expected_output,
+             expected_tools=expected_tools,
+             context=context,
+             retrieval_context=retrieval_context,
+         )
+     )
+
+     # set the current trace attributes
+     if name:
+         update_current_trace(name=name)
+     if tags:
+         update_current_trace(tags=tags)
+     if metadata:
+         update_current_trace(metadata=metadata)
+     if user_id:
+         update_current_trace(user_id=user_id)
+     if thread_id:
+         update_current_trace(thread_id=thread_id)
  
      yield current_trace
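
The expanded `trace()` context manager now accepts trace-level attributes (name, tags, metadata, user/thread ids, trace metrics) plus an LLM-level context (prompt, metrics, expected output, retrieval context) in one call. A usage sketch assuming the context manager is imported from `deepeval.tracing.trace_context`; the attribute values are illustrative:

```python
from deepeval.tracing.trace_context import trace

# Placeholder values; metrics and prompts can be passed the same way via
# llm_metrics=..., prompt=..., or trace_metrics=....
with trace(
    name="chat-request",
    tags=["demo"],
    user_id="user-123",
    thread_id="thread-123",
    expected_output="A short answer about DeepEval.",
    retrieval_context=["DeepEval is an LLM evaluation framework."],
) as current_trace:
    # Spans created inside this block attach to `current_trace`, and the
    # LlmContext set above is available to instrumented LLM calls.
    ...
```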
deepeval/tracing/trace_test_manager.py ADDED
@@ -0,0 +1,19 @@
+ from typing import Optional, Dict, Any
+ import asyncio
+ from time import monotonic
+
+
+ class TraceTestingManager:
+     test_name: Optional[str] = None
+     test_dict: Optional[Dict[str, Any]] = None
+
+     async def wait_for_test_dict(
+         self, timeout: float = 10.0, poll_interval: float = 0.05
+     ) -> Dict[str, Any]:
+         deadline = monotonic() + timeout
+         while self.test_dict is None and monotonic() < deadline:
+             await asyncio.sleep(poll_interval)
+         return self.test_dict or {}
+
+
+ trace_testing_manager = TraceTestingManager()
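
`trace_testing_manager` is a small polling helper for tests that need to wait until some other task has populated `test_dict`. A minimal, runnable sketch of the waiting pattern:

```python
import asyncio

from deepeval.tracing.trace_test_manager import trace_testing_manager


async def wait_for_result():
    # Another task (e.g. the exporter under test) is expected to set
    # trace_testing_manager.test_dict; after the timeout an empty dict is
    # returned instead of blocking forever.
    return await trace_testing_manager.wait_for_test_dict(timeout=1.0)


print(asyncio.run(wait_for_result()))  # {} if nothing populated it in time
```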