deepeval 3.6.6__py3-none-any.whl → 3.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/evaluate/evaluate.py +5 -1
  10. deepeval/evaluate/execute.py +97 -42
  11. deepeval/evaluate/utils.py +20 -116
  12. deepeval/integrations/crewai/__init__.py +6 -1
  13. deepeval/integrations/crewai/handler.py +1 -1
  14. deepeval/integrations/crewai/subs.py +51 -0
  15. deepeval/integrations/crewai/wrapper.py +45 -5
  16. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  17. deepeval/metrics/api.py +281 -0
  18. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  19. deepeval/metrics/bias/bias.py +12 -3
  20. deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
  21. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  22. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  23. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  24. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  25. deepeval/metrics/conversational_dag/nodes.py +12 -4
  26. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
  27. deepeval/metrics/dag/dag.py +12 -0
  28. deepeval/metrics/dag/nodes.py +12 -4
  29. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  30. deepeval/metrics/g_eval/g_eval.py +11 -0
  31. deepeval/metrics/hallucination/hallucination.py +12 -1
  32. deepeval/metrics/indicator.py +8 -2
  33. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  34. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  35. deepeval/metrics/mcp/mcp_task_completion.py +13 -0
  36. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
  37. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
  38. deepeval/metrics/misuse/misuse.py +12 -1
  39. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  40. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  41. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  42. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  43. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  44. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
  45. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  46. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  47. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  48. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  49. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  50. deepeval/metrics/non_advice/non_advice.py +12 -0
  51. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  52. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  53. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  54. deepeval/metrics/role_violation/role_violation.py +12 -0
  55. deepeval/metrics/summarization/summarization.py +12 -1
  56. deepeval/metrics/task_completion/task_completion.py +3 -0
  57. deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
  58. deepeval/metrics/toxicity/toxicity.py +12 -0
  59. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  60. deepeval/models/llms/grok_model.py +1 -1
  61. deepeval/models/llms/openai_model.py +2 -0
  62. deepeval/openai/__init__.py +14 -32
  63. deepeval/openai/extractors.py +24 -34
  64. deepeval/openai/patch.py +256 -161
  65. deepeval/openai/types.py +20 -0
  66. deepeval/openai/utils.py +98 -56
  67. deepeval/prompt/__init__.py +19 -1
  68. deepeval/prompt/api.py +160 -0
  69. deepeval/prompt/prompt.py +244 -62
  70. deepeval/prompt/utils.py +144 -2
  71. deepeval/synthesizer/chunking/context_generator.py +209 -152
  72. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  73. deepeval/synthesizer/synthesizer.py +8 -5
  74. deepeval/test_case/api.py +131 -0
  75. deepeval/test_run/__init__.py +1 -0
  76. deepeval/test_run/hyperparameters.py +47 -8
  77. deepeval/test_run/test_run.py +104 -1
  78. deepeval/tracing/api.py +3 -1
  79. deepeval/tracing/message_types/__init__.py +10 -0
  80. deepeval/tracing/message_types/base.py +6 -0
  81. deepeval/tracing/message_types/messages.py +14 -0
  82. deepeval/tracing/message_types/tools.py +18 -0
  83. deepeval/tracing/otel/utils.py +1 -1
  84. deepeval/tracing/trace_context.py +73 -4
  85. deepeval/tracing/tracing.py +51 -3
  86. deepeval/tracing/types.py +16 -0
  87. deepeval/tracing/utils.py +8 -0
  88. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
  89. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/RECORD +92 -84
  90. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
  91. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
  92. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
deepeval/test_case/api.py ADDED
@@ -0,0 +1,131 @@
+ from typing import Union, Optional
+ import os
+
+ from deepeval.test_run.api import (
+     LLMApiTestCase,
+     ConversationalApiTestCase,
+     TurnApi,
+     TraceApi,
+ )
+ from deepeval.test_case import (
+     LLMTestCase,
+     ConversationalTestCase,
+     MLLMTestCase,
+     Turn,
+ )
+ from deepeval.constants import PYTEST_RUN_TEST_NAME
+
+
+ def create_api_turn(turn: Turn, index: int) -> TurnApi:
+     return TurnApi(
+         role=turn.role,
+         content=turn.content,
+         user_id=turn.user_id,
+         retrievalContext=turn.retrieval_context,
+         toolsCalled=turn.tools_called,
+         additionalMetadata=turn.additional_metadata,
+         order=index,
+     )
+
+
+ def create_api_test_case(
+     test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
+     trace: Optional[TraceApi] = None,
+     index: Optional[int] = None,
+ ) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
+     if isinstance(test_case, ConversationalTestCase):
+         order = (
+             test_case._dataset_rank
+             if test_case._dataset_rank is not None
+             else index
+         )
+         if test_case.name:
+             name = test_case.name
+         else:
+             name = os.getenv(
+                 PYTEST_RUN_TEST_NAME, f"conversational_test_case_{order}"
+             )
+
+         api_test_case = ConversationalApiTestCase(
+             name=name,
+             success=True,
+             metricsData=[],
+             runDuration=0,
+             evaluationCost=None,
+             order=order,
+             scenario=test_case.scenario,
+             expectedOutcome=test_case.expected_outcome,
+             userDescription=test_case.user_description,
+             context=test_case.context,
+             tags=test_case.tags,
+             comments=test_case.comments,
+             additionalMetadata=test_case.additional_metadata,
+         )
+         api_test_case.turns = [
+             create_api_turn(
+                 turn=turn,
+                 index=index,
+             )
+             for index, turn in enumerate(test_case.turns)
+         ]
+
+         return api_test_case
+     else:
+         order = (
+             test_case._dataset_rank
+             if test_case._dataset_rank is not None
+             else index
+         )
+
+         success = True
+         if test_case.name is not None:
+             name = test_case.name
+         else:
+             name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
+         metrics_data = []
+
+         if isinstance(test_case, LLMTestCase):
+             api_test_case = LLMApiTestCase(
+                 name=name,
+                 input=test_case.input,
+                 actualOutput=test_case.actual_output,
+                 expectedOutput=test_case.expected_output,
+                 context=test_case.context,
+                 retrievalContext=test_case.retrieval_context,
+                 toolsCalled=test_case.tools_called,
+                 expectedTools=test_case.expected_tools,
+                 tokenCost=test_case.token_cost,
+                 completionTime=test_case.completion_time,
+                 tags=test_case.tags,
+                 success=success,
+                 metricsData=metrics_data,
+                 runDuration=None,
+                 evaluationCost=None,
+                 order=order,
+                 additionalMetadata=test_case.additional_metadata,
+                 comments=test_case.comments,
+                 trace=trace,
+             )
+         elif isinstance(test_case, MLLMTestCase):
+             api_test_case = LLMApiTestCase(
+                 name=name,
+                 input="",
+                 multimodalInput=test_case.input,
+                 multimodalActualOutput=test_case.actual_output,
+                 multimodalExpectedOutput=test_case.expected_output,
+                 multimodalRetrievalContext=test_case.retrieval_context,
+                 multimodalContext=test_case.context,
+                 toolsCalled=test_case.tools_called,
+                 expectedTools=test_case.expected_tools,
+                 tokenCost=test_case.token_cost,
+                 completionTime=test_case.completion_time,
+                 success=success,
+                 metricsData=metrics_data,
+                 runDuration=None,
+                 evaluationCost=None,
+                 order=order,
+                 additionalMetadata=test_case.additional_metadata,
+                 comments=test_case.comments,
+             )
+         # llm_test_case_lookup_map[instance_id] = api_test_case
+         return api_test_case
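The new deepeval/test_case/api.py module moves create_api_test_case out of deepeval/evaluate/utils.py (the otel utils hunk further down updates its import accordingly). A minimal usage sketch, assuming an LLMTestCase built with the usual fields; the values are illustrative only:

from deepeval.test_case import LLMTestCase
from deepeval.test_case.api import create_api_test_case

# Hypothetical test case; any LLMTestCase works here.
test_case = LLMTestCase(
    input="What is DeepEval?",
    actual_output="An open-source LLM evaluation framework.",
)

# Converts the test case into the serializable LLMApiTestCase payload.
api_test_case = create_api_test_case(test_case=test_case, index=0)
print(api_test_case.name)  # falls back to "test_case_0" when no name is set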
deepeval/test_run/__init__.py CHANGED
@@ -8,6 +8,7 @@ from .test_run import (
      LLMApiTestCase,
      ConversationalApiTestCase,
      TestRunManager,
+     PromptData,
  )

  from .hooks import on_test_run_end, invoke_test_run_end_hook
deepeval/test_run/hyperparameters.py CHANGED
@@ -1,13 +1,15 @@
- from typing import Union, Dict
-
+ from typing import Union, Dict, Optional, List
  from deepeval.test_run import global_test_run_manager
  from deepeval.prompt import Prompt
  from deepeval.prompt.api import PromptApi
  from deepeval.test_run.test_run import TEMP_FILE_PATH
+ from deepeval.confident.api import is_confident
+ from deepeval.test_run.test_run import PromptData


  def process_hyperparameters(
-     hyperparameters,
+     hyperparameters: Optional[Dict] = None,
+     verbose: bool = True,
  ) -> Union[Dict[str, Union[str, int, float, PromptApi]], None]:
      if hyperparameters is None:
          return None
@@ -16,6 +18,7 @@ def process_hyperparameters(
          raise TypeError("Hyperparameters must be a dictionary or None")

      processed_hyperparameters = {}
+     prompts_version_id_map = {}

      for key, value in hyperparameters.items():
          if not isinstance(key, str):
@@ -30,14 +33,21 @@
              )

          if isinstance(value, Prompt):
-             if value._prompt_version_id is not None and value._type is not None:
+             prompt_key = f"{value.alias}_{value.version}"
+             if value._prompt_version_id is not None and value.type is not None:
                  processed_hyperparameters[key] = PromptApi(
                      id=value._prompt_version_id,
-                     type=value._type,
+                     type=value.type,
                  )
-             else:
-                 raise ValueError(
-                     f"Cannot log Prompt where template was not pulled from Confident AI. Please import your prompt on Confident AI to continue."
+             elif is_confident():
+                 if prompt_key not in prompts_version_id_map:
+                     value.push(_verbose=verbose)
+                     prompts_version_id_map[prompt_key] = (
+                         value._prompt_version_id
+                     )
+                 processed_hyperparameters[key] = PromptApi(
+                     id=prompts_version_id_map[prompt_key],
+                     type=value.type,
                  )
          else:
              processed_hyperparameters[key] = str(value)
@@ -64,3 +74,32 @@ def log_hyperparameters(func):

      # Return the wrapper function to be used as the decorator
      return wrapper
+
+
+ def process_prompts(
+     hyperparameters: Dict[str, Union[str, int, float, Prompt]],
+ ) -> List[PromptData]:
+     prompts = []
+     if not hyperparameters:
+         return prompts
+     seen_prompts = set()
+     prompt_objects = [
+         value for value in hyperparameters.values() if isinstance(value, Prompt)
+     ]
+     for prompt in prompt_objects:
+         prompt_version = prompt.version if is_confident() else None
+         prompt_key = f"{prompt.alias}_{prompt_version}"
+         if prompt_key in seen_prompts:
+             continue
+         seen_prompts.add(prompt_key)
+         prompt_data = PromptData(
+             alias=prompt.alias,
+             version=prompt_version,
+             text_template=prompt.text_template,
+             messages_template=prompt.messages_template,
+             model_settings=prompt.model_settings,
+             output_type=prompt.output_type,
+             interpolation_type=prompt.interpolation_type,
+         )
+         prompts.append(prompt_data)
+     return prompts
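process_prompts collects each unique Prompt found in the logged hyperparameters into PromptData entries, deduplicated by alias and version. A hedged sketch of how it might be called, assuming the hunk above belongs to deepeval/test_run/hyperparameters.py and that Prompt accepts an alias argument:

from deepeval.prompt import Prompt
from deepeval.test_run.hyperparameters import process_prompts

hyperparameters = {
    "model": "gpt-4o",  # plain values are stringified by process_hyperparameters
    "system prompt": Prompt(alias="system-prompt"),  # hypothetical prompt
}

# One PromptData per unique alias/version pair; version is only populated
# when Confident AI is configured (is_confident() is True).
prompt_data = process_prompts(hyperparameters)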
deepeval/test_run/test_run.py CHANGED
@@ -32,6 +32,17 @@ from deepeval.utils import (
  )
  from deepeval.test_run.cache import global_test_run_cache_manager
  from deepeval.constants import CONFIDENT_TEST_CASE_BATCH_SIZE, HIDDEN_DIR
+ from deepeval.prompt import (
+     PromptMessage,
+     ModelSettings,
+     OutputType,
+     PromptInterpolationType,
+     OutputType,
+ )
+ from rich.panel import Panel
+ from rich.text import Text
+ from rich.columns import Columns
+

  TEMP_FILE_PATH = f"{HIDDEN_DIR}/.temp_test_run_data.json"
  LATEST_TEST_RUN_FILE_PATH = f"{HIDDEN_DIR}/.latest_test_run.json"
@@ -71,6 +82,16 @@ class TraceMetricScores(BaseModel):
      base: Dict[str, Dict[str, MetricScores]] = Field(default_factory=dict)


+ class PromptData(BaseModel):
+     alias: Optional[str] = None
+     version: Optional[str] = None
+     text_template: Optional[str] = None
+     messages_template: Optional[List[PromptMessage]] = None
+     model_settings: Optional[ModelSettings] = None
+     output_type: Optional[OutputType] = None
+     interpolation_type: Optional[PromptInterpolationType] = None
+
+
  class MetricsAverageDict:
      def __init__(self):
          self.metric_dict = {}
@@ -123,6 +144,7 @@ class TestRun(BaseModel):
      )
      identifier: Optional[str] = None
      hyperparameters: Optional[Dict[str, Any]] = Field(None)
+     prompts: Optional[List[PromptData]] = Field(None)
      test_passed: Optional[int] = Field(None, alias="testPassed")
      test_failed: Optional[int] = Field(None, alias="testFailed")
      run_duration: float = Field(0.0, alias="runDuration")
@@ -799,6 +821,7 @@ class TestRunManager:
          test_run.test_cases = initial_batch

          try:
+             test_run.prompts = None
              body = test_run.model_dump(by_alias=True, exclude_none=True)
          except AttributeError:
              # Pydantic version below 2.0
@@ -953,6 +976,23 @@ class TestRunManager:
          if display_table:
              self.display_results_table(test_run, display)

+         if test_run.hyperparameters is None:
+             console.print(
+                 "\n[bold yellow]⚠ WARNING:[/bold yellow] No hyperparameters logged.\n"
+                 "» [bold blue][link=https://deepeval.com/docs/evaluation-prompts]Log hyperparameters[/link][/bold blue] to attribute prompts and models to your test runs.\n\n"
+                 + "=" * 80
+             )
+         else:
+             if not test_run.prompts:
+                 console.print(
+                     "\n[bold yellow]⚠ WARNING:[/bold yellow] No prompts logged.\n"
+                     "» [bold blue][link=https://deepeval.com/docs/evaluation-prompts]Log prompts[/link][/bold blue] to evaluate and optimize your prompt templates and models.\n\n"
+                     + "=" * 80
+                 )
+             else:
+                 console.print("\n[bold green]✓ Prompts Logged[/bold green]\n")
+                 self._render_prompts_panels(prompts=test_run.prompts)
+
          self.save_test_run_locally()
          delete_file_if_exists(self.temp_file_path)
          if is_confident() and self.disable_request is False:
@@ -967,7 +1007,7 @@
              f"» Test Results ({test_run.test_passed + test_run.test_failed} total tests):\n",
              f" » Pass Rate: {round((test_run.test_passed / (test_run.test_passed + test_run.test_failed)) * 100, 2)}% | Passed: [bold green]{test_run.test_passed}[/bold green] | Failed: [bold red]{test_run.test_failed}[/bold red]\n\n",
              "=" * 80,
-             "\n\n» What to share evals with your team, or a place for your test cases to live? ❤️ 🏡\n"
+             "\n\n» Want to share evals with your team, or a place for your test cases to live? ❤️ 🏡\n"
              " » Run [bold]'deepeval view'[/bold] to analyze and save testing results on [rgb(106,0,255)]Confident AI[/rgb(106,0,255)].\n\n",
          )

@@ -993,5 +1033,68 @@ class TestRunManager:
                  pass
          return None

+     def _render_prompts_panels(self, prompts: List[PromptData]) -> None:
+
+         def format_string(
+             v, default="[dim]None[/dim]", color: Optional[str] = None
+         ):
+             formatted_string = str(v) if v not in (None, "", []) else default
+             return (
+                 f"{formatted_string}"
+                 if color is None or v in (None, "", [])
+                 else f"[{color}]{formatted_string}[/]"
+             )
+
+         panels = []
+         for prompt in prompts:
+             lines = []
+             p_type = (
+                 "messages"
+                 if prompt.messages_template
+                 else ("text" if prompt.text_template else "—")
+             )
+             if p_type:
+                 lines.append(f"type: {format_string(p_type, color='blue')}")
+             if prompt.output_type:
+                 lines.append(
+                     f"output_type: {format_string(prompt.output_type, color='blue')}"
+                 )
+             if prompt.interpolation_type:
+                 lines.append(
+                     f"interpolation_type: {format_string(prompt.interpolation_type, color='blue')}"
+                 )
+             if prompt.model_settings:
+                 ms = prompt.model_settings
+                 settings_lines = [
+                     "Model Settings:",
+                     f" – provider: {format_string(ms.provider, color='green')}",
+                     f" – name: {format_string(ms.name, color='green')}",
+                     f" – temperature: {format_string(ms.temperature, color='green')}",
+                     f" – max_tokens: {format_string(ms.max_tokens, color='green')}",
+                     f" – top_p: {format_string(ms.top_p, color='green')}",
+                     f" – frequency_penalty: {format_string(ms.frequency_penalty, color='green')}",
+                     f" – presence_penalty: {format_string(ms.presence_penalty, color='green')}",
+                     f" – stop_sequence: {format_string(ms.stop_sequence, color='green')}",
+                     f" – reasoning_effort: {format_string(ms.reasoning_effort, color='green')}",
+                     f" – verbosity: {format_string(ms.verbosity, color='green')}",
+                 ]
+                 lines.append("")
+                 lines.extend(settings_lines)
+             title = f"{format_string(prompt.alias)}"
+             if prompt.version:
+                 title += f" (v{prompt.version})"
+             body = "\n".join(lines)
+             panel = Panel(
+                 body,
+                 title=title,
+                 title_align="left",
+                 expand=False,
+                 padding=(1, 6, 1, 2),
+             )
+             panels.append(panel)
+
+         if panels:
+             console.print(Columns(panels, equal=False, expand=False))
+

  global_test_run_manager = TestRunManager()
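For reference, PromptData is a plain Pydantic model, so the panels above can also be fed instances built directly. A hypothetical example (all values made up), relying on the re-export added to deepeval/test_run/__init__.py in this release:

from deepeval.test_run import PromptData

prompt_data = PromptData(
    alias="system-prompt",      # hypothetical alias
    version="00.00.01",         # hypothetical version string
    text_template="You are a helpful assistant.",
)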
deepeval/tracing/api.py CHANGED
@@ -1,6 +1,6 @@
  from enum import Enum
  from typing import Dict, List, Optional, Union, Literal, Any
- from pydantic import BaseModel, Field
+ from pydantic import BaseModel, ConfigDict, Field

  from deepeval.test_case import ToolCall

@@ -27,6 +27,8 @@ class PromptApi(BaseModel):


  class MetricData(BaseModel):
+     model_config = ConfigDict(extra="ignore")
+
      name: str
      threshold: float
      success: bool
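MetricData now ignores unrecognized keys instead of failing validation. A generic, standalone illustration of the extra="ignore" setting (not deepeval's class, which has more fields than this hunk shows):

from pydantic import BaseModel, ConfigDict

class Example(BaseModel):
    model_config = ConfigDict(extra="ignore")
    name: str

# The unknown field is silently dropped rather than raising a ValidationError.
Example(name="Answer Relevancy", server_only_field=123)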
deepeval/tracing/message_types/__init__.py ADDED
@@ -0,0 +1,10 @@
+ from .messages import TextMessage, ToolCallMessage
+ from .tools import BaseTool, ToolSchema, ToolOutput
+
+ __all__ = [
+     "BaseTool",
+     "TextMessage",
+     "ToolCallMessage",
+     "ToolSchema",
+     "ToolOutput",
+ ]
deepeval/tracing/message_types/base.py ADDED
@@ -0,0 +1,6 @@
+ from typing import Literal
+ from pydantic import BaseModel
+
+
+ class BaseMessage(BaseModel):
+     role: Literal["user", "assistant"]
deepeval/tracing/message_types/messages.py ADDED
@@ -0,0 +1,14 @@
+ from typing import Literal, Dict, Any
+ from .base import BaseMessage
+
+
+ class TextMessage(BaseMessage):
+     type: Literal["text", "thinking"]
+     content: str
+
+
+ class ToolCallMessage(BaseMessage):
+     """This is a message for tool calls in response.choices[0].message.tool_calls"""
+
+     name: str
+     args: Dict[str, Any]
deepeval/tracing/message_types/tools.py ADDED
@@ -0,0 +1,18 @@
+ from typing import Any, Optional, Dict
+ from pydantic import BaseModel
+
+
+ class BaseTool(BaseModel):
+     name: str
+     description: Optional[str] = None
+
+
+ class ToolSchema(BaseTool):
+     parameters: Dict[str, Any]
+     is_called: Optional[bool] = False
+
+
+ class ToolOutput(BaseTool):
+     """Output of the tool function"""
+
+     output: Any
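These models give LlmSpan inputs and outputs a typed shape (see the deepeval/tracing/types.py hunks at the end of this diff). A hedged construction sketch with made-up values:

from deepeval.tracing.message_types import (
    TextMessage,
    ToolCallMessage,
    ToolSchema,
    ToolOutput,
)

user_msg = TextMessage(role="user", type="text", content="What's the weather in Paris?")
tool_call = ToolCallMessage(role="assistant", name="get_weather", args={"city": "Paris"})
tool_schema = ToolSchema(name="get_weather", parameters={"type": "object"}, is_called=True)
tool_output = ToolOutput(name="get_weather", output={"temperature_c": 21})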
deepeval/tracing/otel/utils.py CHANGED
@@ -3,7 +3,7 @@ import json
  from typing import List, Optional, Tuple, Any
  from opentelemetry.sdk.trace.export import ReadableSpan

- from deepeval.evaluate.utils import create_api_test_case
+ from deepeval.test_case.api import create_api_test_case
  from deepeval.test_run.api import LLMApiTestCase
  from deepeval.test_run.test_run import global_test_run_manager
  from deepeval.tracing.types import Trace, LLMTestCase, ToolCall
deepeval/tracing/trace_context.py CHANGED
@@ -1,14 +1,83 @@
- from .context import current_trace_context
- from .tracing import trace_manager
+ from typing import Optional, List, Dict, Any
+ from contextvars import ContextVar
  from contextlib import contextmanager
+ from dataclasses import dataclass
+
+ from .tracing import trace_manager
+ from .context import current_trace_context, update_current_trace
+ from deepeval.prompt import Prompt
+ from deepeval.metrics import BaseMetric
+ from deepeval.test_case.llm_test_case import ToolCall
+
+
+ @dataclass
+ class LlmContext:
+     prompt: Optional[Prompt] = None
+     metrics: Optional[List[BaseMetric]] = None
+     metric_collection: Optional[str] = None
+     expected_output: Optional[str] = None
+     expected_tools: Optional[List[ToolCall]] = None
+     context: Optional[List[str]] = None
+     retrieval_context: Optional[List[str]] = None
+
+
+ current_llm_context: ContextVar[Optional[LlmContext]] = ContextVar(
+     "current_llm_context", default=LlmContext()
+ )


  @contextmanager
- def trace():
+ def trace(
+     prompt: Optional[Prompt] = None,
+     llm_metrics: Optional[List[BaseMetric]] = None,
+     llm_metric_collection: Optional[str] = None,
+     name: Optional[str] = None,
+     tags: Optional[List[str]] = None,
+     metadata: Optional[Dict[str, Any]] = None,
+     user_id: Optional[str] = None,
+     thread_id: Optional[str] = None,
+     expected_output: Optional[str] = None,
+     expected_tools: Optional[List[ToolCall]] = None,
+     context: Optional[List[str]] = None,
+     retrieval_context: Optional[List[str]] = None,
+     trace_metric_collection: Optional[str] = None,
+     trace_metrics: Optional[List[BaseMetric]] = None,
+ ):
      current_trace = current_trace_context.get()

      if not current_trace:
          current_trace = trace_manager.start_new_trace()
-         current_trace_context.set(current_trace)
+
+     if trace_metrics:
+         current_trace.metrics = trace_metrics
+
+     if trace_metric_collection:
+         current_trace.metric_collection = trace_metric_collection
+
+     current_trace_context.set(current_trace)
+
+     current_llm_context.set(
+         LlmContext(
+             prompt=prompt,
+             metrics=llm_metrics,
+             metric_collection=llm_metric_collection,
+             expected_output=expected_output,
+             expected_tools=expected_tools,
+             context=context,
+             retrieval_context=retrieval_context,
+         )
+     )
+
+     # set the current trace attributes
+     if name:
+         update_current_trace(name=name)
+     if tags:
+         update_current_trace(tags=tags)
+     if metadata:
+         update_current_trace(metadata=metadata)
+     if user_id:
+         update_current_trace(user_id=user_id)
+     if thread_id:
+         update_current_trace(thread_id=thread_id)

      yield current_trace
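The expanded trace() signature lets callers attach trace-level attributes and per-LLM-span evaluation context in one place. A hedged usage sketch; the metric and values are illustrative, and the import path assumes the hunk above is deepeval/tracing/trace_context.py:

from deepeval.tracing.trace_context import trace
from deepeval.metrics import AnswerRelevancyMetric

with trace(
    name="chat-request",          # hypothetical trace name
    tags=["production"],
    thread_id="thread-123",
    llm_metrics=[AnswerRelevancyMetric()],
    retrieval_context=["DeepEval is an LLM evaluation framework."],
) as current_trace:
    ...  # run the instrumented LLM application here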
deepeval/tracing/tracing.py CHANGED
@@ -1,5 +1,14 @@
- import os
- from typing import Any, Dict, List, Literal, Optional, Set, Union, Callable
+ from typing import (
+     TYPE_CHECKING,
+     Any,
+     Callable,
+     Dict,
+     List,
+     Literal,
+     Optional,
+     Set,
+     Union,
+ )
  from time import perf_counter
  import threading
  import functools
@@ -20,6 +29,7 @@ from deepeval.constants import (
  )
  from deepeval.confident.api import Api, Endpoints, HttpMethods, is_confident
  from deepeval.metrics import BaseMetric
+ from deepeval.test_case.llm_test_case import ToolCall
  from deepeval.tracing.api import (
      BaseApiSpan,
      SpanApiType,
@@ -41,6 +51,7 @@ from deepeval.tracing.types import (
  )
  from deepeval.tracing.utils import (
      Environment,
+     prepare_tool_call_input_parameters,
      replace_self_with_class_name,
      make_json_serializable,
      perf_counter_to_datetime,
@@ -55,6 +66,10 @@ from deepeval.tracing.types import TestCaseMetricPair
  from deepeval.tracing.api import PromptApi
  from deepeval.tracing.trace_test_manager import trace_testing_manager

+
+ if TYPE_CHECKING:
+     from deepeval.dataset.golden import Golden
+
  EVAL_DUMMY_SPAN_NAME = "evals_iterator"


@@ -65,6 +80,10 @@ class TraceManager:
          self.active_spans: Dict[str, BaseSpan] = (
              {}
          )  # Map of span_uuid to BaseSpan
+         # Map each trace created during evaluation_loop to the Golden that was active
+         # when it was started. This lets us evaluate traces against the correct golden
+         # since we cannot rely on positional indexing as the order is not guaranteed.
+         self.trace_uuid_to_golden: Dict[str, Golden] = {}

          settings = get_settings()
          # Initialize queue and worker thread for trace posting
@@ -86,7 +105,7 @@
          )
          validate_environment(self.environment)

-         self.sampling_rate = settings.CONFIDENT_SAMPLE_RATE
+         self.sampling_rate = settings.CONFIDENT_TRACE_SAMPLE_RATE
          validate_sampling_rate(self.sampling_rate)
          self.openai_client = None
          self.tracing_enabled = True
@@ -166,6 +185,19 @@
          self.traces.append(new_trace)
          if self.evaluation_loop:
              self.traces_to_evaluate_order.append(trace_uuid)
+             # Associate the current Golden with this trace so we can
+             # later evaluate traces against the correct golden, even if more traces
+             # are created than goldens or the order interleaves.
+             try:
+                 from deepeval.contextvars import get_current_golden
+
+                 current_golden = get_current_golden()
+                 if current_golden is not None:
+                     self.trace_uuid_to_golden[trace_uuid] = current_golden
+             except Exception:
+                 # not much we can do, but if the golden is not there during evaluation
+                 # we will write out a verbose debug log
+                 pass
          return new_trace

      def end_trace(self, trace_uuid: str):
@@ -861,6 +893,22 @@ class Observer:
          ):
              current_span.prompt = self.prompt

+         if not current_span.tools_called:
+             # check any tool span children
+             for child in current_span.children:
+                 if isinstance(child, ToolSpan):
+                     current_span.tools_called = current_span.tools_called or []
+                     current_span.tools_called.append(
+                         ToolCall(
+                             name=child.name,
+                             description=child.description,
+                             input_parameters=prepare_tool_call_input_parameters(
+                                 child.input
+                             ),
+                             output=child.output,
+                         )
+                     )
+
          trace_manager.remove_span(self.uuid)
          if current_span.parent_uuid:
              parent_span = trace_manager.get_span_by_uuid(
deepeval/tracing/types.py CHANGED
@@ -3,6 +3,12 @@ from dataclasses import dataclass, field
  from pydantic import BaseModel, Field
  from typing import Any, Dict, List, Optional, Union
  from rich.progress import Progress
+ from deepeval.tracing.message_types import (
+     ToolSchema,
+     ToolOutput,
+     TextMessage,
+     ToolCallMessage,
+ )

  from deepeval.prompt.prompt import Prompt
  from deepeval.test_case.llm_test_case import ToolCall
@@ -88,6 +94,12 @@ class AgentSpan(BaseSpan):


  class LlmSpan(BaseSpan):
+     input: Optional[
+         Union[Any, List[Union[TextMessage, ToolCallMessage, ToolOutput]]]
+     ] = None
+     output: Optional[Union[Any, List[Union[TextMessage, ToolCallMessage]]]] = (
+         None
+     )
      model: Optional[str] = None
      prompt: Optional[Prompt] = None
      input_token_count: Optional[float] = Field(
@@ -106,6 +118,10 @@
          None, serialization_alias="tokenTimes"
      )

+     # input_tools: Optional[List[ToolSchema]] = Field(None, serialization_alias="inputTools")
+     # invocation_params: Optional[Dict[str, Any]] = Field(None, serialization_alias="invocationParams")
+     # output_metadata: Optional[Dict[str, Any]] = Field(None, serialization_alias="outputMetadata")
+
      # for serializing `prompt`
      model_config = {"arbitrary_types_allowed": True}