deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/settings.py +22 -4
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/errors.py +20 -2
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +811 -248
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +33 -119
- deepeval/integrations/crewai/__init__.py +7 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +11 -0
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +20 -9
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +20 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +4 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +85 -50
- deepeval/openai/patch.py +258 -167
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +205 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +245 -62
- deepeval/prompt/utils.py +186 -15
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +19 -15
- deepeval/test_case/api.py +131 -0
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +292 -206
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +24 -5
- deepeval/tracing/trace_context.py +89 -5
- deepeval/tracing/tracing.py +74 -3
- deepeval/tracing/types.py +20 -2
- deepeval/tracing/utils.py +8 -0
- deepeval/utils.py +21 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/openai/patch.py
CHANGED
|
@@ -1,204 +1,295 @@
|
|
|
1
|
-
from typing import Callable, List
|
|
1
|
+
from typing import Callable, List
|
|
2
2
|
from functools import wraps
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
get_attr_path,
|
|
6
|
-
set_attr_path,
|
|
7
|
-
add_test_case,
|
|
8
|
-
create_child_tool_spans,
|
|
9
|
-
)
|
|
4
|
+
|
|
10
5
|
from deepeval.openai.extractors import (
|
|
11
|
-
|
|
12
|
-
|
|
6
|
+
safe_extract_output_parameters,
|
|
7
|
+
safe_extract_input_parameters,
|
|
13
8
|
InputParameters,
|
|
14
|
-
|
|
9
|
+
OutputParameters,
|
|
10
|
+
)
|
|
11
|
+
from deepeval.test_case.llm_test_case import ToolCall
|
|
12
|
+
from deepeval.tracing.context import (
|
|
13
|
+
current_trace_context,
|
|
14
|
+
update_current_span,
|
|
15
|
+
update_llm_span,
|
|
15
16
|
)
|
|
16
|
-
from deepeval.tracing
|
|
17
|
-
from deepeval.tracing import
|
|
18
|
-
|
|
19
|
-
|
|
17
|
+
from deepeval.tracing import observe
|
|
18
|
+
from deepeval.tracing.trace_context import current_llm_context
|
|
19
|
+
|
|
20
|
+
# Store original methods for safety and potential unpatching
|
|
21
|
+
_ORIGINAL_METHODS = {}
|
|
22
|
+
_OPENAI_PATCHED = False
|
|
23
|
+
|
|
20
24
|
|
|
25
|
+
def patch_openai_classes():
|
|
26
|
+
"""Monkey patch OpenAI resource classes directly."""
|
|
27
|
+
global _OPENAI_PATCHED
|
|
21
28
|
|
|
22
|
-
|
|
23
|
-
if
|
|
29
|
+
# Single guard - if already patched, return immediately
|
|
30
|
+
if _OPENAI_PATCHED:
|
|
24
31
|
return
|
|
25
32
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
"
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
33
|
+
try:
|
|
34
|
+
from openai.resources.chat.completions import (
|
|
35
|
+
Completions,
|
|
36
|
+
AsyncCompletions,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
# Store original methods before patching
|
|
40
|
+
if hasattr(Completions, "create"):
|
|
41
|
+
_ORIGINAL_METHODS["Completions.create"] = Completions.create
|
|
42
|
+
Completions.create = _create_sync_wrapper(
|
|
43
|
+
Completions.create, is_completion_method=True
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
if hasattr(Completions, "parse"):
|
|
47
|
+
_ORIGINAL_METHODS["Completions.parse"] = Completions.parse
|
|
48
|
+
Completions.parse = _create_sync_wrapper(
|
|
49
|
+
Completions.parse, is_completion_method=True
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
if hasattr(AsyncCompletions, "create"):
|
|
53
|
+
_ORIGINAL_METHODS["AsyncCompletions.create"] = (
|
|
54
|
+
AsyncCompletions.create
|
|
55
|
+
)
|
|
56
|
+
AsyncCompletions.create = _create_async_wrapper(
|
|
57
|
+
AsyncCompletions.create, is_completion_method=True
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
if hasattr(AsyncCompletions, "parse"):
|
|
61
|
+
_ORIGINAL_METHODS["AsyncCompletions.parse"] = AsyncCompletions.parse
|
|
62
|
+
AsyncCompletions.parse = _create_async_wrapper(
|
|
63
|
+
AsyncCompletions.parse, is_completion_method=True
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
except ImportError:
|
|
67
|
+
pass
|
|
68
|
+
|
|
69
|
+
try:
|
|
70
|
+
from openai.resources.responses import Responses, AsyncResponses
|
|
71
|
+
|
|
72
|
+
if hasattr(Responses, "create"):
|
|
73
|
+
_ORIGINAL_METHODS["Responses.create"] = Responses.create
|
|
74
|
+
Responses.create = _create_sync_wrapper(
|
|
75
|
+
Responses.create, is_completion_method=False
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
if hasattr(AsyncResponses, "create"):
|
|
79
|
+
_ORIGINAL_METHODS["AsyncResponses.create"] = AsyncResponses.create
|
|
80
|
+
AsyncResponses.create = _create_async_wrapper(
|
|
81
|
+
AsyncResponses.create, is_completion_method=False
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
except ImportError:
|
|
85
|
+
pass
|
|
86
|
+
|
|
87
|
+
# Set flag at the END after successful patching
|
|
88
|
+
_OPENAI_PATCHED = True
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _create_sync_wrapper(original_method, is_completion_method: bool):
|
|
92
|
+
"""Create a wrapper for sync methods - called ONCE during patching."""
|
|
93
|
+
|
|
94
|
+
@wraps(original_method)
|
|
95
|
+
def method_wrapper(self, *args, **kwargs):
|
|
96
|
+
bound_method = original_method.__get__(self, type(self))
|
|
97
|
+
patched = _patch_sync_openai_client_method(
|
|
98
|
+
orig_method=bound_method, is_completion_method=is_completion_method
|
|
99
|
+
)
|
|
100
|
+
return patched(*args, **kwargs)
|
|
101
|
+
|
|
102
|
+
return method_wrapper
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _create_async_wrapper(original_method, is_completion_method: bool):
|
|
106
|
+
"""Create a wrapper for async methods - called ONCE during patching."""
|
|
107
|
+
|
|
108
|
+
@wraps(original_method)
|
|
109
|
+
async def method_wrapper(self, *args, **kwargs):
|
|
110
|
+
bound_method = original_method.__get__(self, type(self))
|
|
111
|
+
patched = _patch_async_openai_client_method(
|
|
112
|
+
orig_method=bound_method, is_completion_method=is_completion_method
|
|
113
|
+
)
|
|
114
|
+
return await patched(*args, **kwargs)
|
|
115
|
+
|
|
116
|
+
return method_wrapper
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _patch_async_openai_client_method(
|
|
68
120
|
orig_method: Callable,
|
|
69
121
|
is_completion_method: bool = False,
|
|
70
122
|
):
|
|
71
123
|
@wraps(orig_method)
|
|
72
|
-
async def patched_async_openai_method(
|
|
73
|
-
|
|
74
|
-
context: Optional[List[str]] = None,
|
|
75
|
-
retrieval_context: Optional[List[str]] = None,
|
|
76
|
-
expected_output: Optional[str] = None,
|
|
77
|
-
expected_tools: Optional[List[ToolCall]] = None,
|
|
78
|
-
*args,
|
|
79
|
-
**kwargs
|
|
80
|
-
):
|
|
81
|
-
input_parameters: InputParameters = extract_input_parameters(
|
|
124
|
+
async def patched_async_openai_method(*args, **kwargs):
|
|
125
|
+
input_parameters: InputParameters = safe_extract_input_parameters(
|
|
82
126
|
is_completion_method, kwargs
|
|
83
127
|
)
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
update_current_span(
|
|
95
|
-
input=input_parameters.input
|
|
96
|
-
or input_parameters.messages
|
|
97
|
-
or "NA",
|
|
98
|
-
output=output_parameters.output or "NA",
|
|
99
|
-
expected_output=expected_output,
|
|
100
|
-
retrieval_context=retrieval_context,
|
|
101
|
-
context=context,
|
|
102
|
-
tools_called=output_parameters.tools_called,
|
|
103
|
-
expected_tools=expected_tools,
|
|
104
|
-
)
|
|
105
|
-
update_llm_span(
|
|
106
|
-
input_token_count=output_parameters.prompt_tokens,
|
|
107
|
-
output_token_count=output_parameters.completion_tokens,
|
|
108
|
-
)
|
|
109
|
-
create_child_tool_spans(output_parameters)
|
|
110
|
-
return response
|
|
111
|
-
|
|
112
|
-
return await llm_generation(*args, **kwargs)
|
|
113
|
-
else:
|
|
128
|
+
|
|
129
|
+
llm_context = current_llm_context.get()
|
|
130
|
+
|
|
131
|
+
@observe(
|
|
132
|
+
type="llm",
|
|
133
|
+
model=input_parameters.model,
|
|
134
|
+
metrics=llm_context.metrics,
|
|
135
|
+
metric_collection=llm_context.metric_collection,
|
|
136
|
+
)
|
|
137
|
+
async def llm_generation(*args, **kwargs):
|
|
114
138
|
response = await orig_method(*args, **kwargs)
|
|
115
|
-
output_parameters =
|
|
139
|
+
output_parameters = safe_extract_output_parameters(
|
|
116
140
|
is_completion_method, response, input_parameters
|
|
117
141
|
)
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
context
|
|
124
|
-
|
|
125
|
-
expected_tools=expected_tools,
|
|
126
|
-
)
|
|
127
|
-
add_test_case(
|
|
128
|
-
test_case=test_case,
|
|
129
|
-
metrics=metrics,
|
|
130
|
-
input_parameters=input_parameters,
|
|
142
|
+
_update_all_attributes(
|
|
143
|
+
input_parameters,
|
|
144
|
+
output_parameters,
|
|
145
|
+
llm_context.expected_tools,
|
|
146
|
+
llm_context.expected_output,
|
|
147
|
+
llm_context.context,
|
|
148
|
+
llm_context.retrieval_context,
|
|
131
149
|
)
|
|
150
|
+
|
|
132
151
|
return response
|
|
133
152
|
|
|
153
|
+
return await llm_generation(*args, **kwargs)
|
|
154
|
+
|
|
134
155
|
return patched_async_openai_method
|
|
135
156
|
|
|
136
157
|
|
|
137
|
-
def
|
|
158
|
+
def _patch_sync_openai_client_method(
|
|
138
159
|
orig_method: Callable,
|
|
139
160
|
is_completion_method: bool = False,
|
|
140
161
|
):
|
|
141
162
|
@wraps(orig_method)
|
|
142
|
-
def patched_sync_openai_method(
|
|
143
|
-
|
|
144
|
-
context: Optional[List[str]] = None,
|
|
145
|
-
retrieval_context: Optional[List[str]] = None,
|
|
146
|
-
expected_output: Optional[str] = None,
|
|
147
|
-
expected_tools: Optional[List[ToolCall]] = None,
|
|
148
|
-
*args,
|
|
149
|
-
**kwargs
|
|
150
|
-
):
|
|
151
|
-
input_parameters: InputParameters = extract_input_parameters(
|
|
163
|
+
def patched_sync_openai_method(*args, **kwargs):
|
|
164
|
+
input_parameters: InputParameters = safe_extract_input_parameters(
|
|
152
165
|
is_completion_method, kwargs
|
|
153
166
|
)
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
update_current_span(
|
|
165
|
-
input=input_parameters.input
|
|
166
|
-
or input_parameters.messages
|
|
167
|
-
or "NA",
|
|
168
|
-
output=output_parameters.output or "NA",
|
|
169
|
-
expected_output=expected_output,
|
|
170
|
-
retrieval_context=retrieval_context,
|
|
171
|
-
context=context,
|
|
172
|
-
tools_called=output_parameters.tools_called,
|
|
173
|
-
expected_tools=expected_tools,
|
|
174
|
-
)
|
|
175
|
-
update_llm_span(
|
|
176
|
-
input_token_count=output_parameters.prompt_tokens,
|
|
177
|
-
output_token_count=output_parameters.completion_tokens,
|
|
178
|
-
)
|
|
179
|
-
create_child_tool_spans(output_parameters)
|
|
180
|
-
return response
|
|
181
|
-
|
|
182
|
-
return llm_generation(*args, **kwargs)
|
|
183
|
-
else:
|
|
167
|
+
|
|
168
|
+
llm_context = current_llm_context.get()
|
|
169
|
+
|
|
170
|
+
@observe(
|
|
171
|
+
type="llm",
|
|
172
|
+
model=input_parameters.model,
|
|
173
|
+
metrics=llm_context.metrics,
|
|
174
|
+
metric_collection=llm_context.metric_collection,
|
|
175
|
+
)
|
|
176
|
+
def llm_generation(*args, **kwargs):
|
|
184
177
|
response = orig_method(*args, **kwargs)
|
|
185
|
-
output_parameters =
|
|
178
|
+
output_parameters = safe_extract_output_parameters(
|
|
186
179
|
is_completion_method, response, input_parameters
|
|
187
180
|
)
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
context
|
|
194
|
-
|
|
195
|
-
expected_tools=expected_tools,
|
|
196
|
-
)
|
|
197
|
-
add_test_case(
|
|
198
|
-
test_case=test_case,
|
|
199
|
-
metrics=metrics,
|
|
200
|
-
input_parameters=input_parameters,
|
|
181
|
+
_update_all_attributes(
|
|
182
|
+
input_parameters,
|
|
183
|
+
output_parameters,
|
|
184
|
+
llm_context.expected_tools,
|
|
185
|
+
llm_context.expected_output,
|
|
186
|
+
llm_context.context,
|
|
187
|
+
llm_context.retrieval_context,
|
|
201
188
|
)
|
|
189
|
+
|
|
202
190
|
return response
|
|
203
191
|
|
|
192
|
+
return llm_generation(*args, **kwargs)
|
|
193
|
+
|
|
204
194
|
return patched_sync_openai_method
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _update_all_attributes(
|
|
198
|
+
input_parameters: InputParameters,
|
|
199
|
+
output_parameters: OutputParameters,
|
|
200
|
+
expected_tools: List[ToolCall],
|
|
201
|
+
expected_output: str,
|
|
202
|
+
context: List[str],
|
|
203
|
+
retrieval_context: List[str],
|
|
204
|
+
):
|
|
205
|
+
"""Update span and trace attributes with input/output parameters."""
|
|
206
|
+
update_current_span(
|
|
207
|
+
input=input_parameters.messages,
|
|
208
|
+
output=output_parameters.output or output_parameters.tools_called,
|
|
209
|
+
tools_called=output_parameters.tools_called,
|
|
210
|
+
# attributes to be added
|
|
211
|
+
expected_output=expected_output,
|
|
212
|
+
expected_tools=expected_tools,
|
|
213
|
+
context=context,
|
|
214
|
+
retrieval_context=retrieval_context,
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
llm_context = current_llm_context.get()
|
|
218
|
+
|
|
219
|
+
update_llm_span(
|
|
220
|
+
input_token_count=output_parameters.prompt_tokens,
|
|
221
|
+
output_token_count=output_parameters.completion_tokens,
|
|
222
|
+
prompt=llm_context.prompt,
|
|
223
|
+
)
|
|
224
|
+
|
|
225
|
+
__update_input_and_output_of_current_trace(
|
|
226
|
+
input_parameters, output_parameters
|
|
227
|
+
)
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def __update_input_and_output_of_current_trace(
|
|
231
|
+
input_parameters: InputParameters, output_parameters: OutputParameters
|
|
232
|
+
):
|
|
233
|
+
|
|
234
|
+
current_trace = current_trace_context.get()
|
|
235
|
+
if current_trace:
|
|
236
|
+
if current_trace.input is None:
|
|
237
|
+
current_trace.input = (
|
|
238
|
+
input_parameters.input or input_parameters.messages
|
|
239
|
+
)
|
|
240
|
+
|
|
241
|
+
if current_trace.output is None:
|
|
242
|
+
current_trace.output = output_parameters.output
|
|
243
|
+
|
|
244
|
+
return
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def unpatch_openai_classes():
|
|
248
|
+
"""Restore OpenAI resource classes to their original state."""
|
|
249
|
+
global _OPENAI_PATCHED
|
|
250
|
+
|
|
251
|
+
# If not patched, nothing to do
|
|
252
|
+
if not _OPENAI_PATCHED:
|
|
253
|
+
return
|
|
254
|
+
|
|
255
|
+
try:
|
|
256
|
+
from openai.resources.chat.completions import (
|
|
257
|
+
Completions,
|
|
258
|
+
AsyncCompletions,
|
|
259
|
+
)
|
|
260
|
+
|
|
261
|
+
# Restore original methods for Completions
|
|
262
|
+
if "Completions.create" in _ORIGINAL_METHODS:
|
|
263
|
+
Completions.create = _ORIGINAL_METHODS["Completions.create"]
|
|
264
|
+
|
|
265
|
+
if "Completions.parse" in _ORIGINAL_METHODS:
|
|
266
|
+
Completions.parse = _ORIGINAL_METHODS["Completions.parse"]
|
|
267
|
+
|
|
268
|
+
# Restore original methods for AsyncCompletions
|
|
269
|
+
if "AsyncCompletions.create" in _ORIGINAL_METHODS:
|
|
270
|
+
AsyncCompletions.create = _ORIGINAL_METHODS[
|
|
271
|
+
"AsyncCompletions.create"
|
|
272
|
+
]
|
|
273
|
+
|
|
274
|
+
if "AsyncCompletions.parse" in _ORIGINAL_METHODS:
|
|
275
|
+
AsyncCompletions.parse = _ORIGINAL_METHODS["AsyncCompletions.parse"]
|
|
276
|
+
|
|
277
|
+
except ImportError:
|
|
278
|
+
pass
|
|
279
|
+
|
|
280
|
+
try:
|
|
281
|
+
from openai.resources.responses import Responses, AsyncResponses
|
|
282
|
+
|
|
283
|
+
# Restore original methods for Responses
|
|
284
|
+
if "Responses.create" in _ORIGINAL_METHODS:
|
|
285
|
+
Responses.create = _ORIGINAL_METHODS["Responses.create"]
|
|
286
|
+
|
|
287
|
+
# Restore original methods for AsyncResponses
|
|
288
|
+
if "AsyncResponses.create" in _ORIGINAL_METHODS:
|
|
289
|
+
AsyncResponses.create = _ORIGINAL_METHODS["AsyncResponses.create"]
|
|
290
|
+
|
|
291
|
+
except ImportError:
|
|
292
|
+
pass
|
|
293
|
+
|
|
294
|
+
# Reset the patched flag
|
|
295
|
+
_OPENAI_PATCHED = False
|
deepeval/openai/types.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from typing import Any, Optional, List, Dict
|
|
2
|
+
from pydantic import BaseModel
|
|
3
|
+
|
|
4
|
+
from deepeval.test_case.llm_test_case import ToolCall
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class InputParameters(BaseModel):
|
|
8
|
+
model: Optional[str] = None
|
|
9
|
+
input: Optional[str] = None
|
|
10
|
+
tools: Optional[List[Dict[str, Any]]] = None
|
|
11
|
+
instructions: Optional[str] = None
|
|
12
|
+
messages: Optional[List[Dict[str, Any]]] = None
|
|
13
|
+
tool_descriptions: Optional[Dict[str, str]] = None
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class OutputParameters(BaseModel):
|
|
17
|
+
output: Optional[Any] = None
|
|
18
|
+
prompt_tokens: Optional[int] = None
|
|
19
|
+
completion_tokens: Optional[int] = None
|
|
20
|
+
tools_called: Optional[List[ToolCall]] = None
|