deepeval 3.6.5__py3-none-any.whl → 3.6.7__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. deepeval/__init__.py +42 -10
  2. deepeval/_version.py +1 -1
  3. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  4. deepeval/cli/main.py +42 -0
  5. deepeval/confident/api.py +1 -0
  6. deepeval/config/logging.py +33 -0
  7. deepeval/config/settings.py +176 -16
  8. deepeval/constants.py +8 -1
  9. deepeval/dataset/dataset.py +2 -11
  10. deepeval/dataset/utils.py +1 -1
  11. deepeval/evaluate/evaluate.py +5 -1
  12. deepeval/evaluate/execute.py +118 -60
  13. deepeval/evaluate/utils.py +20 -116
  14. deepeval/integrations/crewai/__init__.py +6 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/wrapper.py +45 -5
  18. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  19. deepeval/metrics/api.py +281 -0
  20. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  21. deepeval/metrics/bias/bias.py +12 -3
  22. deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
  23. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  24. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  25. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  26. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  27. deepeval/metrics/conversational_dag/nodes.py +12 -4
  28. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
  29. deepeval/metrics/dag/dag.py +12 -0
  30. deepeval/metrics/dag/nodes.py +12 -4
  31. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  32. deepeval/metrics/g_eval/g_eval.py +37 -15
  33. deepeval/metrics/hallucination/hallucination.py +12 -1
  34. deepeval/metrics/indicator.py +8 -2
  35. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  36. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  37. deepeval/metrics/mcp/mcp_task_completion.py +13 -0
  38. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
  39. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
  40. deepeval/metrics/misuse/misuse.py +12 -1
  41. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  42. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  43. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  44. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  45. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  46. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
  47. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  48. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  49. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  50. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  51. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  52. deepeval/metrics/non_advice/non_advice.py +12 -0
  53. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  54. deepeval/metrics/prompt_alignment/prompt_alignment.py +53 -24
  55. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  56. deepeval/metrics/role_violation/role_violation.py +12 -0
  57. deepeval/metrics/summarization/summarization.py +12 -1
  58. deepeval/metrics/task_completion/task_completion.py +3 -0
  59. deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
  60. deepeval/metrics/toxicity/toxicity.py +12 -0
  61. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  62. deepeval/models/llms/grok_model.py +1 -1
  63. deepeval/models/llms/openai_model.py +2 -0
  64. deepeval/models/retry_policy.py +202 -11
  65. deepeval/openai/__init__.py +14 -32
  66. deepeval/openai/extractors.py +24 -34
  67. deepeval/openai/patch.py +256 -161
  68. deepeval/openai/types.py +20 -0
  69. deepeval/openai/utils.py +98 -56
  70. deepeval/prompt/__init__.py +19 -1
  71. deepeval/prompt/api.py +160 -0
  72. deepeval/prompt/prompt.py +244 -62
  73. deepeval/prompt/utils.py +144 -2
  74. deepeval/synthesizer/chunking/context_generator.py +209 -152
  75. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  76. deepeval/synthesizer/synthesizer.py +8 -5
  77. deepeval/test_case/api.py +131 -0
  78. deepeval/test_run/__init__.py +1 -0
  79. deepeval/test_run/hyperparameters.py +47 -8
  80. deepeval/test_run/test_run.py +104 -1
  81. deepeval/tracing/api.py +3 -1
  82. deepeval/tracing/message_types/__init__.py +10 -0
  83. deepeval/tracing/message_types/base.py +6 -0
  84. deepeval/tracing/message_types/messages.py +14 -0
  85. deepeval/tracing/message_types/tools.py +18 -0
  86. deepeval/tracing/otel/exporter.py +0 -6
  87. deepeval/tracing/otel/utils.py +58 -8
  88. deepeval/tracing/trace_context.py +73 -4
  89. deepeval/tracing/trace_test_manager.py +19 -0
  90. deepeval/tracing/tracing.py +52 -4
  91. deepeval/tracing/types.py +16 -0
  92. deepeval/tracing/utils.py +8 -0
  93. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
  94. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/RECORD +97 -87
  95. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
  96. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
  97. {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
deepeval/openai/extractors.py CHANGED
@@ -1,29 +1,15 @@
1
+ import json
1
2
  from openai.types.chat import ChatCompletion, ParsedChatCompletion
2
- from typing import Optional, Union, List, Dict
3
+ from typing import Any, Union, Dict
3
4
  from openai.types.responses import Response
4
- from pydantic import BaseModel
5
- import json
6
5
 
7
6
  from deepeval.test_case.llm_test_case import ToolCall
8
-
9
-
10
- class InputParameters(BaseModel):
11
- model: Optional[str] = None
12
- input: Optional[str] = None
13
- instructions: Optional[str] = None
14
- messages: Optional[List[Dict]] = None
15
- tool_descriptions: Optional[Dict[str, str]] = None
16
-
17
-
18
- class OutputParameters(BaseModel):
19
- output: Optional[str] = None
20
- prompt_tokens: Optional[int] = None
21
- completion_tokens: Optional[int] = None
22
- tools_called: Optional[List[ToolCall]] = None
7
+ from deepeval.openai.utils import stringify_multimodal_content
8
+ from deepeval.openai.types import InputParameters, OutputParameters
23
9
 
24
10
 
25
11
  def extract_input_parameters(
26
- is_completion: bool, kwargs: Dict
12
+ is_completion: bool, kwargs: Dict[str, Any]
27
13
  ) -> InputParameters:
28
14
  if is_completion:
29
15
  return extract_input_parameters_from_completion(kwargs)
@@ -31,9 +17,11 @@ def extract_input_parameters(
31
17
  return extract_input_parameters_from_response(kwargs)
32
18
 
33
19
 
34
- def extract_input_parameters_from_completion(kwargs: Dict) -> InputParameters:
20
+ def extract_input_parameters_from_completion(
21
+ kwargs: Dict[str, Any],
22
+ ) -> InputParameters:
35
23
  model = kwargs.get("model")
36
- messages = kwargs.get("messages")
24
+ messages = kwargs.get("messages") or []
37
25
  tools = kwargs.get("tools")
38
26
  tool_descriptions_map = (
39
27
  {
@@ -45,7 +33,7 @@ def extract_input_parameters_from_completion(kwargs: Dict) -> InputParameters:
45
33
  )
46
34
 
47
35
  # extract first user input from messages
48
- input = ""
36
+ input_arg = ""
49
37
  user_messages = []
50
38
  for message in messages:
51
39
  role = message["role"]
@@ -53,20 +41,22 @@ def extract_input_parameters_from_completion(kwargs: Dict) -> InputParameters:
53
41
  if role == "user":
54
42
  user_messages.append(content)
55
43
  if len(user_messages) > 0:
56
- input = user_messages[0]
44
+ input_arg = user_messages[0]
57
45
 
58
46
  return InputParameters(
59
47
  model=model,
60
- input=input,
48
+ input=stringify_multimodal_content(input_arg),
61
49
  messages=messages,
62
50
  tools=tools,
63
51
  tool_descriptions=tool_descriptions_map,
64
52
  )
65
53
 
66
54
 
67
- def extract_input_parameters_from_response(kwargs: Dict) -> InputParameters:
55
+ def extract_input_parameters_from_response(
56
+ kwargs: Dict[str, Any],
57
+ ) -> InputParameters:
68
58
  model = kwargs.get("model")
69
- input = kwargs.get("input")
59
+ input_payload = kwargs.get("input")
70
60
  instructions = kwargs.get("instructions")
71
61
  tools = kwargs.get("tools")
72
62
  tool_descriptions = (
@@ -74,9 +64,11 @@ def extract_input_parameters_from_response(kwargs: Dict) -> InputParameters:
74
64
  if tools is not None
75
65
  else None
76
66
  )
67
+ messages = input_payload if isinstance(input_payload, list) else None
77
68
  return InputParameters(
78
69
  model=model,
79
- input=input,
70
+ input=stringify_multimodal_content(input_payload),
71
+ messages=messages,
80
72
  instructions=instructions,
81
73
  tools=tools,
82
74
  tool_descriptions=tool_descriptions,
@@ -102,7 +94,7 @@ def extract_output_parameters_from_completion(
102
94
  completion: Union[ChatCompletion, ParsedChatCompletion],
103
95
  input_parameters: InputParameters,
104
96
  ) -> OutputParameters:
105
- output = str(completion.choices[0].message.content)
97
+ output = str(completion.choices[0].message.content or "")
106
98
  prompt_tokens = completion.usage.prompt_tokens
107
99
  completion_tokens = completion.usage.completion_tokens
108
100
 
@@ -112,13 +104,12 @@ def extract_output_parameters_from_completion(
112
104
  if openai_tool_calls is not None:
113
105
  tools_called = []
114
106
  for tool_call in openai_tool_calls:
107
+ tool_descriptions = input_parameters.tool_descriptions or {}
115
108
  tools_called.append(
116
109
  ToolCall(
117
110
  name=tool_call.function.name,
118
111
  input_parameters=json.loads(tool_call.function.arguments),
119
- description=input_parameters.tool_descriptions.get(
120
- tool_call.function.name
121
- ),
112
+ description=tool_descriptions.get(tool_call.function.name),
122
113
  )
123
114
  )
124
115
 
@@ -145,13 +136,12 @@ def extract_output_parameters_from_response(
145
136
  for tool_call in openai_raw_output:
146
137
  if tool_call.type != "function_call":
147
138
  continue
139
+ tool_descriptions = input_parameters.tool_descriptions or {}
148
140
  tools_called.append(
149
141
  ToolCall(
150
142
  name=tool_call.name,
151
143
  input_parameters=json.loads(tool_call.arguments),
152
- description=input_parameters.tool_descriptions.get(
153
- tool_call.name
154
- ),
144
+ description=tool_descriptions.get(tool_call.name),
155
145
  )
156
146
  )
157
147
 
deepeval/openai/patch.py CHANGED
@@ -1,204 +1,299 @@
1
- from typing import Callable, List, Optional
1
+ from typing import Callable, List
2
2
  from functools import wraps
3
3
 
4
- from deepeval.openai.utils import (
5
- get_attr_path,
6
- set_attr_path,
7
- add_test_case,
8
- create_child_tool_spans,
9
- )
4
+
10
5
  from deepeval.openai.extractors import (
11
6
  extract_output_parameters,
12
7
  extract_input_parameters,
13
8
  InputParameters,
14
- ToolCall,
9
+ OutputParameters,
10
+ )
11
+ from deepeval.test_case.llm_test_case import ToolCall
12
+ from deepeval.tracing.context import (
13
+ current_trace_context,
14
+ update_current_span,
15
+ update_llm_span,
15
16
  )
16
- from deepeval.tracing.context import update_current_span, update_llm_span
17
- from deepeval.tracing import trace_manager, observe
18
- from deepeval.metrics.base_metric import BaseMetric
19
- from deepeval.test_case import LLMTestCase
17
+ from deepeval.tracing import observe
18
+ from deepeval.tracing.trace_context import current_llm_context
19
+ from deepeval.openai.utils import create_child_tool_spans
20
+
21
+ # Store original methods for safety and potential unpatching
22
+ _ORIGINAL_METHODS = {}
23
+ _OPENAI_PATCHED = False
24
+
20
25
 
26
+ def patch_openai_classes():
27
+ """Monkey patch OpenAI resource classes directly."""
28
+ global _OPENAI_PATCHED
21
29
 
22
- def patch_openai(openai_module):
23
- if getattr(openai_module, "_deepeval_patched", False):
30
+ # Single guard - if already patched, return immediately
31
+ if _OPENAI_PATCHED:
24
32
  return
25
33
 
26
- openai_module._deepeval_patched = True
27
- openai_class = getattr(openai_module, "OpenAI", None)
28
- async_openai_class = getattr(openai_module, "AsyncOpenAI", None)
29
-
30
- if openai_class:
31
- patch_openai_client(openai_class, is_async=False)
32
- if async_openai_class:
33
- patch_openai_client(async_openai_class, is_async=True)
34
-
35
-
36
- def patch_openai_client(openai_class, is_async: bool):
37
- original_init = openai_class.__init__
38
-
39
- @wraps(original_init)
40
- def new_init(self, *args, **kwargs):
41
- original_init(self, *args, **kwargs)
42
- method_paths = {
43
- # path → is_completion_method
44
- "chat.completions.create": True,
45
- "beta.chat.completions.parse": True,
46
- "responses.create": False,
47
- }
48
- for path, is_completion in method_paths.items():
49
- method = get_attr_path(self, path)
50
- if not callable(method):
51
- continue
52
- if is_async:
53
- patched_method = patch_async_openai_client_method(
54
- orig_method=method,
55
- is_completion_method=is_completion,
56
- )
57
- else:
58
- patched_method = patch_sync_openai_client_method(
59
- orig_method=method,
60
- is_completion_method=is_completion,
61
- )
62
- set_attr_path(self, path, patched_method)
63
-
64
- openai_class.__init__ = new_init
65
-
66
-
67
- def patch_async_openai_client_method(
34
+ try:
35
+ from openai.resources.chat.completions import (
36
+ Completions,
37
+ AsyncCompletions,
38
+ )
39
+
40
+ # Store original methods before patching
41
+ if hasattr(Completions, "create"):
42
+ _ORIGINAL_METHODS["Completions.create"] = Completions.create
43
+ Completions.create = _create_sync_wrapper(
44
+ Completions.create, is_completion_method=True
45
+ )
46
+
47
+ if hasattr(Completions, "parse"):
48
+ _ORIGINAL_METHODS["Completions.parse"] = Completions.parse
49
+ Completions.parse = _create_sync_wrapper(
50
+ Completions.parse, is_completion_method=True
51
+ )
52
+
53
+ if hasattr(AsyncCompletions, "create"):
54
+ _ORIGINAL_METHODS["AsyncCompletions.create"] = (
55
+ AsyncCompletions.create
56
+ )
57
+ AsyncCompletions.create = _create_async_wrapper(
58
+ AsyncCompletions.create, is_completion_method=True
59
+ )
60
+
61
+ if hasattr(AsyncCompletions, "parse"):
62
+ _ORIGINAL_METHODS["AsyncCompletions.parse"] = AsyncCompletions.parse
63
+ AsyncCompletions.parse = _create_async_wrapper(
64
+ AsyncCompletions.parse, is_completion_method=True
65
+ )
66
+
67
+ except ImportError:
68
+ pass
69
+
70
+ try:
71
+ from openai.resources.responses import Responses, AsyncResponses
72
+
73
+ if hasattr(Responses, "create"):
74
+ _ORIGINAL_METHODS["Responses.create"] = Responses.create
75
+ Responses.create = _create_sync_wrapper(
76
+ Responses.create, is_completion_method=False
77
+ )
78
+
79
+ if hasattr(AsyncResponses, "create"):
80
+ _ORIGINAL_METHODS["AsyncResponses.create"] = AsyncResponses.create
81
+ AsyncResponses.create = _create_async_wrapper(
82
+ AsyncResponses.create, is_completion_method=False
83
+ )
84
+
85
+ except ImportError:
86
+ pass
87
+
88
+ # Set flag at the END after successful patching
89
+ _OPENAI_PATCHED = True
90
+
91
+
92
+ def _create_sync_wrapper(original_method, is_completion_method: bool):
93
+ """Create a wrapper for sync methods - called ONCE during patching."""
94
+
95
+ @wraps(original_method)
96
+ def method_wrapper(self, *args, **kwargs):
97
+ bound_method = original_method.__get__(self, type(self))
98
+ patched = _patch_sync_openai_client_method(
99
+ orig_method=bound_method, is_completion_method=is_completion_method
100
+ )
101
+ return patched(*args, **kwargs)
102
+
103
+ return method_wrapper
104
+
105
+
106
+ def _create_async_wrapper(original_method, is_completion_method: bool):
107
+ """Create a wrapper for async methods - called ONCE during patching."""
108
+
109
+ @wraps(original_method)
110
+ async def method_wrapper(self, *args, **kwargs):
111
+ bound_method = original_method.__get__(self, type(self))
112
+ patched = _patch_async_openai_client_method(
113
+ orig_method=bound_method, is_completion_method=is_completion_method
114
+ )
115
+ return await patched(*args, **kwargs)
116
+
117
+ return method_wrapper
118
+
119
+
120
+ def _patch_async_openai_client_method(
68
121
  orig_method: Callable,
69
122
  is_completion_method: bool = False,
70
123
  ):
71
124
  @wraps(orig_method)
72
- async def patched_async_openai_method(
73
- metrics: Optional[List[BaseMetric]] = None,
74
- context: Optional[List[str]] = None,
75
- retrieval_context: Optional[List[str]] = None,
76
- expected_output: Optional[str] = None,
77
- expected_tools: Optional[List[ToolCall]] = None,
78
- *args,
79
- **kwargs
80
- ):
125
+ async def patched_async_openai_method(*args, **kwargs):
81
126
  input_parameters: InputParameters = extract_input_parameters(
82
127
  is_completion_method, kwargs
83
128
  )
84
- is_traced = len(trace_manager.traces) > 0
85
-
86
- if is_traced:
87
-
88
- @observe(type="llm", model=input_parameters.model, metrics=metrics)
89
- async def llm_generation(*args, **kwargs):
90
- response = await orig_method(*args, **kwargs)
91
- output_parameters = extract_output_parameters(
92
- is_completion_method, response, input_parameters
93
- )
94
- update_current_span(
95
- input=input_parameters.input
96
- or input_parameters.messages
97
- or "NA",
98
- output=output_parameters.output or "NA",
99
- expected_output=expected_output,
100
- retrieval_context=retrieval_context,
101
- context=context,
102
- tools_called=output_parameters.tools_called,
103
- expected_tools=expected_tools,
104
- )
105
- update_llm_span(
106
- input_token_count=output_parameters.prompt_tokens,
107
- output_token_count=output_parameters.completion_tokens,
108
- )
109
- create_child_tool_spans(output_parameters)
110
- return response
111
-
112
- return await llm_generation(*args, **kwargs)
113
- else:
129
+
130
+ llm_context = current_llm_context.get()
131
+
132
+ @observe(
133
+ type="llm",
134
+ model=input_parameters.model,
135
+ metrics=llm_context.metrics,
136
+ metric_collection=llm_context.metric_collection,
137
+ )
138
+ async def llm_generation(*args, **kwargs):
114
139
  response = await orig_method(*args, **kwargs)
115
140
  output_parameters = extract_output_parameters(
116
141
  is_completion_method, response, input_parameters
117
142
  )
118
- test_case = LLMTestCase(
119
- input=input_parameters.input,
120
- actual_output=output_parameters.output,
121
- expected_output=expected_output,
122
- retrieval_context=retrieval_context,
123
- context=context,
124
- tools_called=output_parameters.tools_called,
125
- expected_tools=expected_tools,
126
- )
127
- add_test_case(
128
- test_case=test_case,
129
- metrics=metrics,
130
- input_parameters=input_parameters,
143
+ _update_all_attributes(
144
+ input_parameters,
145
+ output_parameters,
146
+ llm_context.expected_tools,
147
+ llm_context.expected_output,
148
+ llm_context.context,
149
+ llm_context.retrieval_context,
131
150
  )
151
+
132
152
  return response
133
153
 
154
+ return await llm_generation(*args, **kwargs)
155
+
134
156
  return patched_async_openai_method
135
157
 
136
158
 
137
- def patch_sync_openai_client_method(
159
+ def _patch_sync_openai_client_method(
138
160
  orig_method: Callable,
139
161
  is_completion_method: bool = False,
140
162
  ):
141
163
  @wraps(orig_method)
142
- def patched_sync_openai_method(
143
- metrics: Optional[List[BaseMetric]] = None,
144
- context: Optional[List[str]] = None,
145
- retrieval_context: Optional[List[str]] = None,
146
- expected_output: Optional[str] = None,
147
- expected_tools: Optional[List[ToolCall]] = None,
148
- *args,
149
- **kwargs
150
- ):
164
+ def patched_sync_openai_method(*args, **kwargs):
151
165
  input_parameters: InputParameters = extract_input_parameters(
152
166
  is_completion_method, kwargs
153
167
  )
154
- is_traced = len(trace_manager.traces) > 0
155
-
156
- if is_traced:
157
-
158
- @observe(type="llm", model=input_parameters.model, metrics=metrics)
159
- def llm_generation(*args, **kwargs):
160
- response = orig_method(*args, **kwargs)
161
- output_parameters = extract_output_parameters(
162
- is_completion_method, response, input_parameters
163
- )
164
- update_current_span(
165
- input=input_parameters.input
166
- or input_parameters.messages
167
- or "NA",
168
- output=output_parameters.output or "NA",
169
- expected_output=expected_output,
170
- retrieval_context=retrieval_context,
171
- context=context,
172
- tools_called=output_parameters.tools_called,
173
- expected_tools=expected_tools,
174
- )
175
- update_llm_span(
176
- input_token_count=output_parameters.prompt_tokens,
177
- output_token_count=output_parameters.completion_tokens,
178
- )
179
- create_child_tool_spans(output_parameters)
180
- return response
181
-
182
- return llm_generation(*args, **kwargs)
183
- else:
168
+
169
+ llm_context = current_llm_context.get()
170
+
171
+ @observe(
172
+ type="llm",
173
+ model=input_parameters.model,
174
+ metrics=llm_context.metrics,
175
+ metric_collection=llm_context.metric_collection,
176
+ )
177
+ def llm_generation(*args, **kwargs):
184
178
  response = orig_method(*args, **kwargs)
185
179
  output_parameters = extract_output_parameters(
186
180
  is_completion_method, response, input_parameters
187
181
  )
188
- test_case = LLMTestCase(
189
- input=input_parameters.input,
190
- actual_output=output_parameters.output,
191
- expected_output=expected_output,
192
- retrieval_context=retrieval_context,
193
- context=context,
194
- tools_called=output_parameters.tools_called,
195
- expected_tools=expected_tools,
196
- )
197
- add_test_case(
198
- test_case=test_case,
199
- metrics=metrics,
200
- input_parameters=input_parameters,
182
+ _update_all_attributes(
183
+ input_parameters,
184
+ output_parameters,
185
+ llm_context.expected_tools,
186
+ llm_context.expected_output,
187
+ llm_context.context,
188
+ llm_context.retrieval_context,
201
189
  )
190
+
202
191
  return response
203
192
 
193
+ return llm_generation(*args, **kwargs)
194
+
204
195
  return patched_sync_openai_method
196
+
197
+
198
+ def _update_all_attributes(
199
+ input_parameters: InputParameters,
200
+ output_parameters: OutputParameters,
201
+ expected_tools: List[ToolCall],
202
+ expected_output: str,
203
+ context: List[str],
204
+ retrieval_context: List[str],
205
+ ):
206
+ """Update span and trace attributes with input/output parameters."""
207
+ update_current_span(
208
+ input=input_parameters.input or input_parameters.messages or "NA",
209
+ output=output_parameters.output or "NA",
210
+ tools_called=output_parameters.tools_called,
211
+ # attributes to be added
212
+ expected_output=expected_output,
213
+ expected_tools=expected_tools,
214
+ context=context,
215
+ retrieval_context=retrieval_context,
216
+ )
217
+
218
+ llm_context = current_llm_context.get()
219
+
220
+ update_llm_span(
221
+ input_token_count=output_parameters.prompt_tokens,
222
+ output_token_count=output_parameters.completion_tokens,
223
+ prompt=llm_context.prompt,
224
+ )
225
+
226
+ if output_parameters.tools_called:
227
+ create_child_tool_spans(output_parameters)
228
+
229
+ __update_input_and_output_of_current_trace(
230
+ input_parameters, output_parameters
231
+ )
232
+
233
+
234
+ def __update_input_and_output_of_current_trace(
235
+ input_parameters: InputParameters, output_parameters: OutputParameters
236
+ ):
237
+
238
+ current_trace = current_trace_context.get()
239
+ if current_trace:
240
+ if current_trace.input is None:
241
+ current_trace.input = (
242
+ input_parameters.input or input_parameters.messages
243
+ )
244
+
245
+ if current_trace.output is None:
246
+ current_trace.output = output_parameters.output
247
+
248
+ return
249
+
250
+
251
+ def unpatch_openai_classes():
252
+ """Restore OpenAI resource classes to their original state."""
253
+ global _OPENAI_PATCHED
254
+
255
+ # If not patched, nothing to do
256
+ if not _OPENAI_PATCHED:
257
+ return
258
+
259
+ try:
260
+ from openai.resources.chat.completions import (
261
+ Completions,
262
+ AsyncCompletions,
263
+ )
264
+
265
+ # Restore original methods for Completions
266
+ if "Completions.create" in _ORIGINAL_METHODS:
267
+ Completions.create = _ORIGINAL_METHODS["Completions.create"]
268
+
269
+ if "Completions.parse" in _ORIGINAL_METHODS:
270
+ Completions.parse = _ORIGINAL_METHODS["Completions.parse"]
271
+
272
+ # Restore original methods for AsyncCompletions
273
+ if "AsyncCompletions.create" in _ORIGINAL_METHODS:
274
+ AsyncCompletions.create = _ORIGINAL_METHODS[
275
+ "AsyncCompletions.create"
276
+ ]
277
+
278
+ if "AsyncCompletions.parse" in _ORIGINAL_METHODS:
279
+ AsyncCompletions.parse = _ORIGINAL_METHODS["AsyncCompletions.parse"]
280
+
281
+ except ImportError:
282
+ pass
283
+
284
+ try:
285
+ from openai.resources.responses import Responses, AsyncResponses
286
+
287
+ # Restore original methods for Responses
288
+ if "Responses.create" in _ORIGINAL_METHODS:
289
+ Responses.create = _ORIGINAL_METHODS["Responses.create"]
290
+
291
+ # Restore original methods for AsyncResponses
292
+ if "AsyncResponses.create" in _ORIGINAL_METHODS:
293
+ AsyncResponses.create = _ORIGINAL_METHODS["AsyncResponses.create"]
294
+
295
+ except ImportError:
296
+ pass
297
+
298
+ # Reset the patched flag
299
+ _OPENAI_PATCHED = False
deepeval/openai/types.py ADDED
@@ -0,0 +1,20 @@
1
+ from typing import Any, Optional, List, Dict
2
+ from pydantic import BaseModel
3
+
4
+ from deepeval.test_case.llm_test_case import ToolCall
5
+
6
+
7
+ class InputParameters(BaseModel):
8
+ model: Optional[str] = None
9
+ input: Optional[str] = None
10
+ tools: Optional[List[Dict[str, Any]]] = None
11
+ instructions: Optional[str] = None
12
+ messages: Optional[List[Dict[str, Any]]] = None
13
+ tool_descriptions: Optional[Dict[str, str]] = None
14
+
15
+
16
+ class OutputParameters(BaseModel):
17
+ output: Optional[str] = None
18
+ prompt_tokens: Optional[int] = None
19
+ completion_tokens: Optional[int] = None
20
+ tools_called: Optional[List[ToolCall]] = None