deepeval 3.6.5__py3-none-any.whl → 3.6.7__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- deepeval/__init__.py +42 -10
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/logging.py +33 -0
- deepeval/config/settings.py +176 -16
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +118 -60
- deepeval/evaluate/utils.py +20 -116
- deepeval/integrations/crewai/__init__.py +6 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +37 -15
- deepeval/metrics/hallucination/hallucination.py +12 -1
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +13 -0
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/prompt_alignment/prompt_alignment.py +53 -24
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +3 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/models/retry_policy.py +202 -11
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +24 -34
- deepeval/openai/patch.py +256 -161
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +98 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +244 -62
- deepeval/prompt/utils.py +144 -2
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +8 -5
- deepeval/test_case/api.py +131 -0
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +104 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/message_types/__init__.py +10 -0
- deepeval/tracing/message_types/base.py +6 -0
- deepeval/tracing/message_types/messages.py +14 -0
- deepeval/tracing/message_types/tools.py +18 -0
- deepeval/tracing/otel/exporter.py +0 -6
- deepeval/tracing/otel/utils.py +58 -8
- deepeval/tracing/trace_context.py +73 -4
- deepeval/tracing/trace_test_manager.py +19 -0
- deepeval/tracing/tracing.py +52 -4
- deepeval/tracing/types.py +16 -0
- deepeval/tracing/utils.py +8 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/RECORD +97 -87
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
deepeval/openai/extractors.py
CHANGED

```diff
@@ -1,29 +1,15 @@
+import json
 from openai.types.chat import ChatCompletion, ParsedChatCompletion
-from typing import …
+from typing import Any, Union, Dict
 from openai.types.responses import Response
-from pydantic import BaseModel
-import json
 
 from deepeval.test_case.llm_test_case import ToolCall
-
-
-class InputParameters(BaseModel):
-    model: Optional[str] = None
-    input: Optional[str] = None
-    instructions: Optional[str] = None
-    messages: Optional[List[Dict]] = None
-    tool_descriptions: Optional[Dict[str, str]] = None
-
-
-class OutputParameters(BaseModel):
-    output: Optional[str] = None
-    prompt_tokens: Optional[int] = None
-    completion_tokens: Optional[int] = None
-    tools_called: Optional[List[ToolCall]] = None
+from deepeval.openai.utils import stringify_multimodal_content
+from deepeval.openai.types import InputParameters, OutputParameters
 
 
 def extract_input_parameters(
-    is_completion: bool, kwargs: Dict
+    is_completion: bool, kwargs: Dict[str, Any]
 ) -> InputParameters:
     if is_completion:
         return extract_input_parameters_from_completion(kwargs)
@@ -31,9 +17,11 @@ def extract_input_parameters(
         return extract_input_parameters_from_response(kwargs)
 
 
-def extract_input_parameters_from_completion(kwargs: Dict) -> InputParameters:
+def extract_input_parameters_from_completion(
+    kwargs: Dict[str, Any],
+) -> InputParameters:
     model = kwargs.get("model")
-    messages = kwargs.get("messages")
+    messages = kwargs.get("messages") or []
     tools = kwargs.get("tools")
     tool_descriptions_map = (
         {
@@ -45,7 +33,7 @@ def extract_input_parameters_from_completion(kwargs: Dict) -> InputParameters:
     )
 
     # extract first user input from messages
-    …
+    input_arg = ""
     user_messages = []
     for message in messages:
         role = message["role"]
@@ -53,20 +41,22 @@ def extract_input_parameters_from_completion(kwargs: Dict) -> InputParameters:
         if role == "user":
            user_messages.append(content)
     if len(user_messages) > 0:
-        …
+        input_arg = user_messages[0]
 
     return InputParameters(
         model=model,
-        input=…
+        input=stringify_multimodal_content(input_arg),
         messages=messages,
         tools=tools,
         tool_descriptions=tool_descriptions_map,
     )
 
 
-def extract_input_parameters_from_response(kwargs: Dict) -> InputParameters:
+def extract_input_parameters_from_response(
+    kwargs: Dict[str, Any],
+) -> InputParameters:
     model = kwargs.get("model")
-    …
+    input_payload = kwargs.get("input")
     instructions = kwargs.get("instructions")
     tools = kwargs.get("tools")
     tool_descriptions = (
@@ -74,9 +64,11 @@ def extract_input_parameters_from_response(kwargs: Dict) -> InputParameters:
         if tools is not None
         else None
     )
+    messages = input_payload if isinstance(input_payload, list) else None
     return InputParameters(
         model=model,
-        input=…
+        input=stringify_multimodal_content(input_payload),
+        messages=messages,
         instructions=instructions,
         tools=tools,
         tool_descriptions=tool_descriptions,
@@ -102,7 +94,7 @@ def extract_output_parameters_from_completion(
     completion: Union[ChatCompletion, ParsedChatCompletion],
     input_parameters: InputParameters,
 ) -> OutputParameters:
-    output = str(completion.choices[0].message.content)
+    output = str(completion.choices[0].message.content or "")
     prompt_tokens = completion.usage.prompt_tokens
     completion_tokens = completion.usage.completion_tokens
 
@@ -112,13 +104,12 @@ def extract_output_parameters_from_completion(
     if openai_tool_calls is not None:
         tools_called = []
         for tool_call in openai_tool_calls:
+            tool_descriptions = input_parameters.tool_descriptions or {}
             tools_called.append(
                 ToolCall(
                     name=tool_call.function.name,
                     input_parameters=json.loads(tool_call.function.arguments),
-                    description=…
-                        tool_call.function.name
-                    ),
+                    description=tool_descriptions.get(tool_call.function.name),
                 )
             )
 
@@ -145,13 +136,12 @@ def extract_output_parameters_from_response(
     for tool_call in openai_raw_output:
         if tool_call.type != "function_call":
             continue
+        tool_descriptions = input_parameters.tool_descriptions or {}
         tools_called.append(
             ToolCall(
                 name=tool_call.name,
                 input_parameters=json.loads(tool_call.arguments),
-                description=…
-                    tool_call.name
-                ),
+                description=tool_descriptions.get(tool_call.name),
             )
         )
 
```
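The extractor refactor moves the `InputParameters`/`OutputParameters` models into the new `deepeval/openai/types.py` (shown further below) and normalizes message content through `stringify_multimodal_content` before storing it. A minimal sketch of the resulting extraction flow for a chat-completions payload; this assumes `stringify_multimodal_content` passes plain string content through unchanged, since `deepeval/openai/utils.py` is only partially visible in this diff:

```python
from deepeval.openai.extractors import extract_input_parameters

# kwargs as the patched Completions.create would receive them
kwargs = {
    "model": "gpt-4o-mini",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Summarize this diff."},
    ],
}

# is_completion=True routes to extract_input_parameters_from_completion,
# which now defaults missing messages to [] and takes the first user
# message as the span input.
params = extract_input_parameters(True, kwargs)
print(params.model)  # "gpt-4o-mini"
print(params.input)  # "Summarize this diff." (assuming string passthrough)
```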
deepeval/openai/patch.py
CHANGED

```diff
@@ -1,204 +1,299 @@
-from typing import Callable, List…
+from typing import Callable, List
 from functools import wraps
 
-from deepeval.openai.utils import (
-    get_attr_path,
-    set_attr_path,
-    add_test_case,
-    create_child_tool_spans,
-)
+
 from deepeval.openai.extractors import (
     extract_output_parameters,
     extract_input_parameters,
     InputParameters,
-    …
+    OutputParameters,
+)
+from deepeval.test_case.llm_test_case import ToolCall
+from deepeval.tracing.context import (
+    current_trace_context,
+    update_current_span,
+    update_llm_span,
 )
-from deepeval.tracing…
-from deepeval.tracing import …
-from deepeval.…
-…
+from deepeval.tracing import observe
+from deepeval.tracing.trace_context import current_llm_context
+from deepeval.openai.utils import create_child_tool_spans
+
+# Store original methods for safety and potential unpatching
+_ORIGINAL_METHODS = {}
+_OPENAI_PATCHED = False
+
 
+def patch_openai_classes():
+    """Monkey patch OpenAI resource classes directly."""
+    global _OPENAI_PATCHED
 
-…
-    if …
+    # Single guard - if already patched, return immediately
+    if _OPENAI_PATCHED:
         return
 
-    …
+    try:
+        from openai.resources.chat.completions import (
+            Completions,
+            AsyncCompletions,
+        )
+
+        # Store original methods before patching
+        if hasattr(Completions, "create"):
+            _ORIGINAL_METHODS["Completions.create"] = Completions.create
+            Completions.create = _create_sync_wrapper(
+                Completions.create, is_completion_method=True
+            )
+
+        if hasattr(Completions, "parse"):
+            _ORIGINAL_METHODS["Completions.parse"] = Completions.parse
+            Completions.parse = _create_sync_wrapper(
+                Completions.parse, is_completion_method=True
+            )
+
+        if hasattr(AsyncCompletions, "create"):
+            _ORIGINAL_METHODS["AsyncCompletions.create"] = (
+                AsyncCompletions.create
+            )
+            AsyncCompletions.create = _create_async_wrapper(
+                AsyncCompletions.create, is_completion_method=True
+            )
+
+        if hasattr(AsyncCompletions, "parse"):
+            _ORIGINAL_METHODS["AsyncCompletions.parse"] = AsyncCompletions.parse
+            AsyncCompletions.parse = _create_async_wrapper(
+                AsyncCompletions.parse, is_completion_method=True
+            )
+
+    except ImportError:
+        pass
+
+    try:
+        from openai.resources.responses import Responses, AsyncResponses
+
+        if hasattr(Responses, "create"):
+            _ORIGINAL_METHODS["Responses.create"] = Responses.create
+            Responses.create = _create_sync_wrapper(
+                Responses.create, is_completion_method=False
+            )
+
+        if hasattr(AsyncResponses, "create"):
+            _ORIGINAL_METHODS["AsyncResponses.create"] = AsyncResponses.create
+            AsyncResponses.create = _create_async_wrapper(
+                AsyncResponses.create, is_completion_method=False
+            )
+
+    except ImportError:
+        pass
+
+    # Set flag at the END after successful patching
+    _OPENAI_PATCHED = True
+
+
+def _create_sync_wrapper(original_method, is_completion_method: bool):
+    """Create a wrapper for sync methods - called ONCE during patching."""
+
+    @wraps(original_method)
+    def method_wrapper(self, *args, **kwargs):
+        bound_method = original_method.__get__(self, type(self))
+        patched = _patch_sync_openai_client_method(
+            orig_method=bound_method, is_completion_method=is_completion_method
+        )
+        return patched(*args, **kwargs)
+
+    return method_wrapper
+
+
+def _create_async_wrapper(original_method, is_completion_method: bool):
+    """Create a wrapper for async methods - called ONCE during patching."""
+
+    @wraps(original_method)
+    async def method_wrapper(self, *args, **kwargs):
+        bound_method = original_method.__get__(self, type(self))
+        patched = _patch_async_openai_client_method(
+            orig_method=bound_method, is_completion_method=is_completion_method
+        )
+        return await patched(*args, **kwargs)
+
+    return method_wrapper
+
+
+def _patch_async_openai_client_method(
     orig_method: Callable,
     is_completion_method: bool = False,
 ):
     @wraps(orig_method)
-    async def patched_async_openai_method(
-        metrics: Optional[List[BaseMetric]] = None,
-        context: Optional[List[str]] = None,
-        retrieval_context: Optional[List[str]] = None,
-        expected_output: Optional[str] = None,
-        expected_tools: Optional[List[ToolCall]] = None,
-        *args,
-        **kwargs
-    ):
+    async def patched_async_openai_method(*args, **kwargs):
         input_parameters: InputParameters = extract_input_parameters(
             is_completion_method, kwargs
         )
-        …
-            update_current_span(
-                input=input_parameters.input
-                or input_parameters.messages
-                or "NA",
-                output=output_parameters.output or "NA",
-                expected_output=expected_output,
-                retrieval_context=retrieval_context,
-                context=context,
-                tools_called=output_parameters.tools_called,
-                expected_tools=expected_tools,
-            )
-            update_llm_span(
-                input_token_count=output_parameters.prompt_tokens,
-                output_token_count=output_parameters.completion_tokens,
-            )
-            create_child_tool_spans(output_parameters)
-            return response
-
-        return await llm_generation(*args, **kwargs)
-    else:
+
+        llm_context = current_llm_context.get()
+
+        @observe(
+            type="llm",
+            model=input_parameters.model,
+            metrics=llm_context.metrics,
+            metric_collection=llm_context.metric_collection,
+        )
+        async def llm_generation(*args, **kwargs):
             response = await orig_method(*args, **kwargs)
             output_parameters = extract_output_parameters(
                 is_completion_method, response, input_parameters
             )
-            …
-                context…
-            …
-                expected_tools=expected_tools,
-            )
-            add_test_case(
-                test_case=test_case,
-                metrics=metrics,
-                input_parameters=input_parameters,
+            _update_all_attributes(
+                input_parameters,
+                output_parameters,
+                llm_context.expected_tools,
+                llm_context.expected_output,
+                llm_context.context,
+                llm_context.retrieval_context,
             )
+
             return response
 
+        return await llm_generation(*args, **kwargs)
+
     return patched_async_openai_method
 
 
-def …
+def _patch_sync_openai_client_method(
     orig_method: Callable,
     is_completion_method: bool = False,
 ):
     @wraps(orig_method)
-    def patched_sync_openai_method(
-        metrics: Optional[List[BaseMetric]] = None,
-        context: Optional[List[str]] = None,
-        retrieval_context: Optional[List[str]] = None,
-        expected_output: Optional[str] = None,
-        expected_tools: Optional[List[ToolCall]] = None,
-        *args,
-        **kwargs
-    ):
+    def patched_sync_openai_method(*args, **kwargs):
         input_parameters: InputParameters = extract_input_parameters(
             is_completion_method, kwargs
         )
-        …
-            update_current_span(
-                input=input_parameters.input
-                or input_parameters.messages
-                or "NA",
-                output=output_parameters.output or "NA",
-                expected_output=expected_output,
-                retrieval_context=retrieval_context,
-                context=context,
-                tools_called=output_parameters.tools_called,
-                expected_tools=expected_tools,
-            )
-            update_llm_span(
-                input_token_count=output_parameters.prompt_tokens,
-                output_token_count=output_parameters.completion_tokens,
-            )
-            create_child_tool_spans(output_parameters)
-            return response
-
-        return llm_generation(*args, **kwargs)
-    else:
+
+        llm_context = current_llm_context.get()
+
+        @observe(
+            type="llm",
+            model=input_parameters.model,
+            metrics=llm_context.metrics,
+            metric_collection=llm_context.metric_collection,
+        )
+        def llm_generation(*args, **kwargs):
             response = orig_method(*args, **kwargs)
             output_parameters = extract_output_parameters(
                 is_completion_method, response, input_parameters
             )
-            …
-                context…
-            …
-                expected_tools=expected_tools,
-            )
-            add_test_case(
-                test_case=test_case,
-                metrics=metrics,
-                input_parameters=input_parameters,
+            _update_all_attributes(
+                input_parameters,
+                output_parameters,
+                llm_context.expected_tools,
+                llm_context.expected_output,
+                llm_context.context,
+                llm_context.retrieval_context,
             )
+
             return response
 
+        return llm_generation(*args, **kwargs)
+
     return patched_sync_openai_method
+
+
+def _update_all_attributes(
+    input_parameters: InputParameters,
+    output_parameters: OutputParameters,
+    expected_tools: List[ToolCall],
+    expected_output: str,
+    context: List[str],
+    retrieval_context: List[str],
+):
+    """Update span and trace attributes with input/output parameters."""
+    update_current_span(
+        input=input_parameters.input or input_parameters.messages or "NA",
+        output=output_parameters.output or "NA",
+        tools_called=output_parameters.tools_called,
+        # attributes to be added
+        expected_output=expected_output,
+        expected_tools=expected_tools,
+        context=context,
+        retrieval_context=retrieval_context,
+    )
+
+    llm_context = current_llm_context.get()
+
+    update_llm_span(
+        input_token_count=output_parameters.prompt_tokens,
+        output_token_count=output_parameters.completion_tokens,
+        prompt=llm_context.prompt,
+    )
+
+    if output_parameters.tools_called:
+        create_child_tool_spans(output_parameters)
+
+    __update_input_and_output_of_current_trace(
+        input_parameters, output_parameters
+    )
+
+
+def __update_input_and_output_of_current_trace(
+    input_parameters: InputParameters, output_parameters: OutputParameters
+):
+    current_trace = current_trace_context.get()
+    if current_trace:
+        if current_trace.input is None:
+            current_trace.input = (
+                input_parameters.input or input_parameters.messages
+            )
+
+        if current_trace.output is None:
+            current_trace.output = output_parameters.output
+
+    return
+
+
+def unpatch_openai_classes():
+    """Restore OpenAI resource classes to their original state."""
+    global _OPENAI_PATCHED
+
+    # If not patched, nothing to do
+    if not _OPENAI_PATCHED:
+        return
+
+    try:
+        from openai.resources.chat.completions import (
+            Completions,
+            AsyncCompletions,
+        )
+
+        # Restore original methods for Completions
+        if "Completions.create" in _ORIGINAL_METHODS:
+            Completions.create = _ORIGINAL_METHODS["Completions.create"]
+
+        if "Completions.parse" in _ORIGINAL_METHODS:
+            Completions.parse = _ORIGINAL_METHODS["Completions.parse"]
+
+        # Restore original methods for AsyncCompletions
+        if "AsyncCompletions.create" in _ORIGINAL_METHODS:
+            AsyncCompletions.create = _ORIGINAL_METHODS[
+                "AsyncCompletions.create"
+            ]
+
+        if "AsyncCompletions.parse" in _ORIGINAL_METHODS:
+            AsyncCompletions.parse = _ORIGINAL_METHODS["AsyncCompletions.parse"]
+
+    except ImportError:
+        pass
+
+    try:
+        from openai.resources.responses import Responses, AsyncResponses
+
+        # Restore original methods for Responses
+        if "Responses.create" in _ORIGINAL_METHODS:
+            Responses.create = _ORIGINAL_METHODS["Responses.create"]
+
+        # Restore original methods for AsyncResponses
+        if "AsyncResponses.create" in _ORIGINAL_METHODS:
+            AsyncResponses.create = _ORIGINAL_METHODS["AsyncResponses.create"]
+
+    except ImportError:
+        pass
+
+    # Reset the patched flag
+    _OPENAI_PATCHED = False
```
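The new wrapper factories patch at the class level, so `original_method` is the plain (unbound) function stored on the class; `original_method.__get__(self, type(self))` rebinds it to the calling instance before delegating. A standalone sketch of that descriptor-binding pattern with a toy class (all names here are illustrative, not part of deepeval or the OpenAI SDK):

```python
from functools import wraps

class Client:
    def create(self, prompt: str) -> str:
        return f"echo: {prompt}"

_ORIGINALS = {"Client.create": Client.create}  # saved for unpatching

def make_wrapper(original_method):
    @wraps(original_method)
    def method_wrapper(self, *args, **kwargs):
        # Rebind the unbound class attribute to this instance, the same
        # way _create_sync_wrapper does before delegating.
        bound = original_method.__get__(self, type(self))
        print("span opened")   # deepeval opens an @observe llm span here
        result = bound(*args, **kwargs)
        print("span closed")   # ...and records output/token attributes here
        return result
    return method_wrapper

Client.create = make_wrapper(Client.create)
print(Client().create("hi"))                 # wrapped call
Client.create = _ORIGINALS["Client.create"]  # unpatch restores the original
```

Because `_OPENAI_PATCHED` is only set after both `try` blocks complete, and `unpatch_openai_classes` restores from `_ORIGINAL_METHODS`, the patching is both idempotent and reversible.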
deepeval/openai/types.py
ADDED

```diff
@@ -0,0 +1,20 @@
+from typing import Any, Optional, List, Dict
+from pydantic import BaseModel
+
+from deepeval.test_case.llm_test_case import ToolCall
+
+
+class InputParameters(BaseModel):
+    model: Optional[str] = None
+    input: Optional[str] = None
+    tools: Optional[List[Dict[str, Any]]] = None
+    instructions: Optional[str] = None
+    messages: Optional[List[Dict[str, Any]]] = None
+    tool_descriptions: Optional[Dict[str, str]] = None
+
+
+class OutputParameters(BaseModel):
+    output: Optional[str] = None
+    prompt_tokens: Optional[int] = None
+    completion_tokens: Optional[int] = None
+    tools_called: Optional[List[ToolCall]] = None
```