deepeval 3.6.7__py3-none-any.whl → 3.6.9__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +104 -36
- deepeval/config/utils.py +5 -0
- deepeval/dataset/dataset.py +162 -30
- deepeval/dataset/utils.py +41 -13
- deepeval/errors.py +20 -2
- deepeval/evaluate/execute.py +1662 -688
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +13 -3
- deepeval/integrations/crewai/__init__.py +2 -1
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +8 -8
- deepeval/metrics/indicator.py +21 -1
- deepeval/metrics/mcp/mcp_task_completion.py +7 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/task_completion/task_completion.py +1 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/amazon_bedrock_model.py +20 -17
- deepeval/models/llms/openai_model.py +10 -1
- deepeval/models/retry_policy.py +103 -20
- deepeval/openai/extractors.py +61 -16
- deepeval/openai/patch.py +8 -12
- deepeval/openai/types.py +1 -1
- deepeval/openai/utils.py +108 -1
- deepeval/prompt/prompt.py +1 -0
- deepeval/prompt/utils.py +43 -14
- deepeval/simulator/conversation_simulator.py +25 -18
- deepeval/synthesizer/chunking/context_generator.py +9 -1
- deepeval/synthesizer/synthesizer.py +11 -10
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/test_run.py +190 -207
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +23 -4
- deepeval/tracing/trace_context.py +53 -38
- deepeval/tracing/tracing.py +23 -0
- deepeval/tracing/types.py +16 -14
- deepeval/utils.py +21 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/METADATA +1 -1
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/RECORD +85 -63
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- deepeval/tracing/message_types/__init__.py +0 -10
- deepeval/tracing/message_types/base.py +0 -6
- deepeval/tracing/message_types/messages.py +0 -14
- deepeval/tracing/message_types/tools.py +0 -18
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/WHEEL +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/entry_points.txt +0 -0
deepeval/openai/extractors.py
CHANGED

```diff
@@ -4,17 +4,26 @@ from typing import Any, Union, Dict
 from openai.types.responses import Response

 from deepeval.test_case.llm_test_case import ToolCall
-from deepeval.openai.utils import
+from deepeval.openai.utils import (
+    render_response_input,
+    stringify_multimodal_content,
+    render_messages,
+)
 from deepeval.openai.types import InputParameters, OutputParameters
+from deepeval.tracing.types import Message


-
+# guarding against errors to be compatible with legacy APIs
+def safe_extract_input_parameters(
     is_completion: bool, kwargs: Dict[str, Any]
 ) -> InputParameters:
-
-
-
-
+    try:
+        if is_completion:
+            return extract_input_parameters_from_completion(kwargs)
+        else:
+            return extract_input_parameters_from_response(kwargs)
+    except:
+        return InputParameters(model="NA")


 def extract_input_parameters_from_completion(
@@ -43,6 +52,9 @@ def extract_input_parameters_from_completion(
     if len(user_messages) > 0:
         input_arg = user_messages[0]

+    # render messages
+    messages = render_messages(messages)
+
     return InputParameters(
         model=model,
         input=stringify_multimodal_content(input_arg),
@@ -64,7 +76,24 @@ def extract_input_parameters_from_response(
         if tools is not None
         else None
     )
-    messages =
+    messages = []
+    if isinstance(input_payload, list):
+        messages = render_response_input(input_payload)
+    elif isinstance(input_payload, str):
+        messages = [
+            {
+                "role": "user",
+                "content": input_payload,
+            }
+        ]
+    if instructions:
+        messages.insert(
+            0,
+            {
+                "role": "system",
+                "content": instructions,
+            },
+        )
     return InputParameters(
         model=model,
         input=stringify_multimodal_content(input_payload),
@@ -75,19 +104,24 @@ def extract_input_parameters_from_response(
     )


-def
+def safe_extract_output_parameters(
     is_completion: bool,
     response: Union[ChatCompletion, ParsedChatCompletion, Response],
     input_parameters: InputParameters,
 ) -> OutputParameters:
-
-
-
-
-
-
-
-
+
+    # guarding against errors to be compatible with legacy APIs
+    try:
+        if is_completion:
+            return extract_output_parameters_from_completion(
+                response, input_parameters
+            )
+        else:
+            return extract_output_parameters_from_response(
+                response, input_parameters
+            )
+    except:
+        return OutputParameters()


 def extract_output_parameters_from_completion(
@@ -113,6 +147,12 @@ def extract_output_parameters_from_completion(
         )
     )

+    if not output and tools_called:
+        tool_calls = []
+        for tool_call in tools_called:
+            tool_calls.append(tool_call)
+        output = tool_calls
+
     return OutputParameters(
         output=output,
         prompt_tokens=prompt_tokens,
@@ -144,6 +184,11 @@ def extract_output_parameters_from_response(
                 description=tool_descriptions.get(tool_call.name),
             )
         )
+    if not output and tools_called:
+        tool_calls = []
+        for tool_call in tools_called:
+            tool_calls.append(tool_call)
+        output = tool_calls

     return OutputParameters(
         output=output,
```
deepeval/openai/patch.py
CHANGED

```diff
@@ -3,8 +3,8 @@ from functools import wraps


 from deepeval.openai.extractors import (
-
-
+    safe_extract_output_parameters,
+    safe_extract_input_parameters,
     InputParameters,
     OutputParameters,
 )
@@ -16,7 +16,6 @@ from deepeval.tracing.context import (
 )
 from deepeval.tracing import observe
 from deepeval.tracing.trace_context import current_llm_context
-from deepeval.openai.utils import create_child_tool_spans

 # Store original methods for safety and potential unpatching
 _ORIGINAL_METHODS = {}
@@ -123,7 +122,7 @@ def _patch_async_openai_client_method(
 ):
     @wraps(orig_method)
     async def patched_async_openai_method(*args, **kwargs):
-        input_parameters: InputParameters =
+        input_parameters: InputParameters = safe_extract_input_parameters(
             is_completion_method, kwargs
         )
@@ -137,7 +136,7 @@ def _patch_async_openai_client_method(
         )
         async def llm_generation(*args, **kwargs):
             response = await orig_method(*args, **kwargs)
-            output_parameters =
+            output_parameters = safe_extract_output_parameters(
                 is_completion_method, response, input_parameters
             )
             _update_all_attributes(
@@ -162,7 +161,7 @@ def _patch_sync_openai_client_method(
 ):
     @wraps(orig_method)
     def patched_sync_openai_method(*args, **kwargs):
-        input_parameters: InputParameters =
+        input_parameters: InputParameters = safe_extract_input_parameters(
            is_completion_method, kwargs
         )
@@ -176,7 +175,7 @@ def _patch_sync_openai_client_method(
         )
         def llm_generation(*args, **kwargs):
             response = orig_method(*args, **kwargs)
-            output_parameters =
+            output_parameters = safe_extract_output_parameters(
                 is_completion_method, response, input_parameters
             )
             _update_all_attributes(
@@ -205,8 +204,8 @@ def _update_all_attributes(
 ):
     """Update span and trace attributes with input/output parameters."""
     update_current_span(
-        input=input_parameters.
-        output=output_parameters.output or
+        input=input_parameters.messages,
+        output=output_parameters.output or output_parameters.tools_called,
         tools_called=output_parameters.tools_called,
         # attributes to be added
         expected_output=expected_output,
@@ -223,9 +222,6 @@ def _update_all_attributes(
         prompt=llm_context.prompt,
     )

-    if output_parameters.tools_called:
-        create_child_tool_spans(output_parameters)
-
     __update_input_and_output_of_current_trace(
         input_parameters, output_parameters
     )
```
deepeval/openai/types.py
CHANGED

```diff
@@ -14,7 +14,7 @@ class InputParameters(BaseModel):


 class OutputParameters(BaseModel):
-    output: Optional[
+    output: Optional[Any] = None
     prompt_tokens: Optional[int] = None
     completion_tokens: Optional[int] = None
     tools_called: Optional[List[ToolCall]] = None
```
deepeval/openai/utils.py
CHANGED

```diff
@@ -1,6 +1,10 @@
 import json
 import uuid
-from typing import Any, List, Optional
+from typing import Any, Dict, List, Optional, Iterable
+
+from openai.types.chat.chat_completion_message_param import (
+    ChatCompletionMessageParam,
+)

 from deepeval.tracing.types import ToolSpan, TraceSpanStatus
 from deepeval.tracing.context import current_span_context
@@ -126,3 +130,106 @@ def stringify_multimodal_content(content: Any) -> str:

     # unknown dicts and types returned as shortened JSON
     return _compact_dump(content)
+
+
+def render_messages(
+    messages: Iterable[ChatCompletionMessageParam],
+) -> List[Dict[str, Any]]:
+
+    messages_list = []
+
+    for message in messages:
+        role = message.get("role")
+        content = message.get("content")
+        if role == "assistant" and message.get("tool_calls"):
+            tool_calls = message.get("tool_calls")
+            if isinstance(tool_calls, list):
+                for tool_call in tool_calls:
+                    # Extract type - either "function" or "custom"
+                    tool_type = tool_call.get("type", "function")
+
+                    # Extract name and arguments based on type
+                    if tool_type == "function":
+                        function_data = tool_call.get("function", {})
+                        name = function_data.get("name", "")
+                        arguments = function_data.get("arguments", "")
+                    elif tool_type == "custom":
+                        custom_data = tool_call.get("custom", {})
+                        name = custom_data.get("name", "")
+                        arguments = custom_data.get("input", "")
+                    else:
+                        name = ""
+                        arguments = ""
+
+                    messages_list.append(
+                        {
+                            "id": tool_call.get("id", ""),
+                            "call_id": tool_call.get(
+                                "id", ""
+                            ),  # OpenAI uses 'id', not 'call_id'
+                            "name": name,
+                            "type": tool_type,
+                            "arguments": json.loads(arguments),
+                        }
+                    )
+
+        elif role == "tool":
+            messages_list.append(
+                {
+                    "call_id": message.get("tool_call_id", ""),
+                    "type": role,  # "tool"
+                    "output": message.get("content", {}),
+                }
+            )
+        else:
+            messages_list.append(
+                {
+                    "role": role,
+                    "content": content,
+                }
+            )
+
+    return messages_list
+
+
+def render_response_input(input: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+
+    messages_list = []
+
+    for item in input:
+        type = item.get("type")
+        role = item.get("role")
+
+        if type == "message":
+            messages_list.append(
+                {
+                    "role": role,
+                    "content": item.get("content"),
+                }
+            )
+        else:
+            messages_list.append(item)
+
+    return messages_list
+
+
+def _render_content(content: Dict[str, Any], indent: int = 0) -> str:
+    """
+    Renders a dictionary as a formatted string with indentation for nested structures.
+    """
+    if not content:
+        return ""
+
+    lines = []
+    prefix = " " * indent
+
+    for key, value in content.items():
+        if isinstance(value, dict):
+            lines.append(f"{prefix}{key}:")
+            lines.append(_render_content(value, indent + 1))
+        elif isinstance(value, list):
+            lines.append(f"{prefix}{key}: {_compact_dump(value)}")
+        else:
+            lines.append(f"{prefix}{key}: {value}")
+
+    return "\n".join(lines)
```
deepeval/prompt/prompt.py
CHANGED

```diff
@@ -202,6 +202,7 @@ class Prompt:
                 "Unable to interpolate empty prompt template. Please pull a prompt from Confident AI or set template manually to continue."
             )

+            print("@@@@@")
             return interpolate_text(interpolation_type, text_template, **kwargs)

         elif prompt_type == PromptType.LIST:
```
deepeval/prompt/utils.py
CHANGED

```diff
@@ -1,7 +1,7 @@
 import re
 import uuid
 from jinja2 import Template
-from typing import Any, Dict, Type, Optional, List
+from typing import Any, Dict, Type, Optional, List, Match
 from pydantic import BaseModel, create_model

 from deepeval.prompt.api import (
@@ -16,36 +16,65 @@ from deepeval.prompt.api import (
 ###################################


-def interpolate_mustache(text: str, **kwargs) -> str:
+def interpolate_mustache(text: str, **kwargs: Any) -> str:
     """Interpolate using Mustache format: {{variable}}"""
-    formatted_template = re.sub(r"\{\{(\w+)\}\}", r"{\1}", text)
-    return formatted_template.format(**kwargs)
+
+    def replace_match(match: Match[str]) -> str:
+        var_name = match.group(1)
+        if var_name in kwargs:
+            return str(kwargs[var_name])
+        # Raise error for missing variables to maintain consistency
+        raise KeyError(f"Missing variable in template: {var_name}")
+
+    return re.sub(r"\{\{([a-zA-Z_][a-zA-Z0-9_]*)\}\}", replace_match, text)


-
+def interpolate_mustache_with_space(text: str, **kwargs: Any) -> str:
     """Interpolate using Mustache with space format: {{ variable }}"""
-
-
+
+    def replace_match(match: Match[str]) -> str:
+        var_name = match.group(1)
+        if var_name in kwargs:
+            return str(kwargs[var_name])
+        # Raise error for missing variables to maintain consistency
+        raise KeyError(f"Missing variable in template: {var_name}")
+
+    return re.sub(r"\{\{ ([a-zA-Z_][a-zA-Z0-9_]*) \}\}", replace_match, text)


-def interpolate_fstring(text: str, **kwargs) -> str:
+def interpolate_fstring(text: str, **kwargs: Any) -> str:
     """Interpolate using F-string format: {variable}"""
-    return text.format(**kwargs)
+
+    def replace_match(match: Match[str]) -> str:
+        var_name = match.group(1)
+        if var_name in kwargs:
+            return str(kwargs[var_name])
+        # Raise error for missing variables to maintain consistency
+        raise KeyError(f"Missing variable in template: {var_name}")

-
+    return re.sub(r"\{([a-zA-Z_][a-zA-Z0-9_]*)\}", replace_match, text)
+
+
+def interpolate_dollar_brackets(text: str, **kwargs: Any) -> str:
     """Interpolate using Dollar Brackets format: ${variable}"""
-
-
+
+    def replace_match(match: Match[str]) -> str:
+        var_name = match.group(1)
+        if var_name in kwargs:
+            return str(kwargs[var_name])
+        # Raise error for missing variables to maintain consistency
+        raise KeyError(f"Missing variable in template: {var_name}")
+
+    return re.sub(r"\$\{([a-zA-Z_][a-zA-Z0-9_]*)\}", replace_match, text)


-def interpolate_jinja(text: str, **kwargs) -> str:
+def interpolate_jinja(text: str, **kwargs: Any) -> str:
     template = Template(text)
     return template.render(**kwargs)


 def interpolate_text(
-    interpolation_type: PromptInterpolationType, text: str, **kwargs
+    interpolation_type: PromptInterpolationType, text: str, **kwargs: Any
 ) -> str:
     """Apply the appropriate interpolation method based on the type"""
     if interpolation_type == PromptInterpolationType.MUSTACHE:
```
deepeval/simulator/conversation_simulator.py
CHANGED

```diff
@@ -35,7 +35,6 @@ class ConversationSimulator:
         self,
         model_callback: Callable[[str], str],
         simulator_model: Optional[Union[str, DeepEvalBaseLLM]] = None,
-        opening_message: Optional[str] = None,
         max_concurrent: int = 5,
         async_mode: bool = True,
         language: str = "English",
@@ -45,7 +44,6 @@ class ConversationSimulator:
         self.is_callback_async = inspect.iscoroutinefunction(
             self.model_callback
         )
-        self.opening_message = opening_message
         self.semaphore = asyncio.Semaphore(max_concurrent)
         self.async_mode = async_mode
         self.language = language
@@ -68,6 +66,9 @@ class ConversationSimulator:
         self,
         conversational_goldens: List[ConversationalGolden],
         max_user_simulations: int = 10,
+        on_simulation_complete: Optional[
+            Callable[[ConversationalTestCase, int], None]
+        ] = None,
     ) -> List[ConversationalTestCase]:
         self.simulation_cost = 0 if self.using_native_model else None
@@ -87,6 +88,7 @@ class ConversationSimulator:
                 self._a_simulate(
                     conversational_goldens=conversational_goldens,
                     max_user_simulations=max_user_simulations,
+                    on_simulation_complete=on_simulation_complete,
                     progress=progress,
                     pbar_id=pbar_id,
                 )
@@ -103,6 +105,7 @@ class ConversationSimulator:
                     index=conversation_index,
                     progress=progress,
                     pbar_id=pbar_id,
+                    on_simulation_complete=on_simulation_complete,
                 )
             )
             conversational_test_cases.append(conversational_test_case)
@@ -115,6 +118,9 @@ class ConversationSimulator:
         self,
         conversational_goldens: List[ConversationalGolden],
         max_user_simulations: int,
+        on_simulation_complete: Optional[
+            Callable[[ConversationalTestCase, int], None]
+        ] = None,
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
     ) -> List[ConversationalTestCase]:
@@ -131,6 +137,7 @@ class ConversationSimulator:
                 index=conversation_index,
                 progress=progress,
                 pbar_id=pbar_id,
+                on_simulation_complete=on_simulation_complete,
             )

         tasks = [
@@ -150,6 +157,9 @@ class ConversationSimulator:
         index: int,
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
+        on_simulation_complete: Optional[
+            Callable[[ConversationalTestCase, int], None]
+        ] = None,
     ) -> ConversationalTestCase:
         simulation_counter = 0
         if max_user_simulations <= 0:
@@ -166,8 +176,6 @@ class ConversationSimulator:
         user_input = None
         thread_id = str(uuid.uuid4())
         turns: List[Turn] = []
-        if self.opening_message and golden.turns is None:
-            turns.append(Turn(role="assistant", content=self.opening_message))

         if golden.turns is not None:
             turns.extend(golden.turns)
@@ -187,11 +195,7 @@ class ConversationSimulator:
             if simulation_counter >= max_user_simulations:
                 update_pbar(progress, pbar_max_user_simluations_id)
                 break
-            if len(turns) == 0
-                len(turns) == 1
-                and self.opening_message
-                and golden.turns is None
-            ):
+            if len(turns) == 0:
                 # Generate first user input
                 user_input = self.generate_first_user_input(golden)
                 turns.append(Turn(role="user", content=user_input))
@@ -225,7 +229,7 @@ class ConversationSimulator:
             turns.append(turn)

         update_pbar(progress, pbar_id)
-
+        conversational_test_case = ConversationalTestCase(
             turns=turns,
             scenario=golden.scenario,
             expected_outcome=golden.expected_outcome,
@@ -241,6 +245,9 @@ class ConversationSimulator:
             _dataset_alias=golden._dataset_alias,
             _dataset_id=golden._dataset_id,
         )
+        if on_simulation_complete:
+            on_simulation_complete(conversational_test_case, index)
+        return conversational_test_case

     async def _a_simulate_single_conversation(
         self,
@@ -249,6 +256,9 @@ class ConversationSimulator:
         index: Optional[int] = None,
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
+        on_simulation_complete: Optional[
+            Callable[[ConversationalTestCase, int], None]
+        ] = None,
     ) -> ConversationalTestCase:
         simulation_counter = 0
         if max_user_simulations <= 0:
@@ -265,8 +275,6 @@ class ConversationSimulator:
         user_input = None
         thread_id = str(uuid.uuid4())
         turns: List[Turn] = []
-        if self.opening_message and golden.turns is None:
-            turns.append(Turn(role="assistant", content=self.opening_message))

         if golden.turns is not None:
             turns.extend(golden.turns)
@@ -286,11 +294,7 @@ class ConversationSimulator:
             if simulation_counter >= max_user_simulations:
                 update_pbar(progress, pbar_max_user_simluations_id)
                 break
-            if len(turns) == 0
-                len(turns) == 1
-                and self.opening_message
-                and golden.turns is None
-            ):
+            if len(turns) == 0:
                 # Generate first user input
                 user_input = await self.a_generate_first_user_input(golden)
                 turns.append(Turn(role="user", content=user_input))
@@ -324,7 +328,7 @@ class ConversationSimulator:
             turns.append(turn)

         update_pbar(progress, pbar_id)
-
+        conversational_test_case = ConversationalTestCase(
             turns=turns,
             scenario=golden.scenario,
             expected_outcome=golden.expected_outcome,
@@ -340,6 +344,9 @@ class ConversationSimulator:
             _dataset_alias=golden._dataset_alias,
             _dataset_id=golden._dataset_id,
         )
+        if on_simulation_complete:
+            on_simulation_complete(conversational_test_case, index)
+        return conversational_test_case

 ############################################
 ### Generate User Inputs ###################
```
deepeval/synthesizer/chunking/context_generator.py
CHANGED

```diff
@@ -249,8 +249,16 @@ class ContextGenerator:

             except Exception as exc:
                 # record and continue with other docs
+                show_trace = bool(get_settings().DEEPEVAL_LOG_STACK_TRACES)
+                exc_info = (
+                    (type(exc), exc, getattr(exc, "__traceback__", None))
+                    if show_trace
+                    else None
+                )
                 logger.exception(
-                    "Document pipeline failed for %s",
+                    "Document pipeline failed for %s",
+                    path,
+                    exc_info=exc_info,
                 )
             finally:
                 # drop the collection asap to avoid too many open collections
```
deepeval/synthesizer/synthesizer.py
CHANGED

```diff
@@ -555,7 +555,7 @@ class Synthesizer:
                 include_expected_output=include_expected_output,
                 max_goldens_per_context=max_goldens_per_context,
                 source_files=source_files,
-
+                context_index=index,
                 progress=progress,
                 pbar_id=pbar_id,
                 context_scores=_context_scores,
@@ -577,7 +577,7 @@ class Synthesizer:
         include_expected_output: bool,
         max_goldens_per_context: int,
         source_files: Optional[List[str]],
-
+        context_index: int,
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
         context_scores: Optional[List[float]] = None,
@@ -599,7 +599,7 @@ class Synthesizer:
         # Add pbars
         pbar_generate_goldens_id = add_pbar(
             progress,
-            f"\t⚡ Generating goldens from context #{
+            f"\t⚡ Generating goldens from context #{context_index}",
             total=1 + max_goldens_per_context,
         )
         pbar_generate_inputs_id = add_pbar(
@@ -643,7 +643,7 @@ class Synthesizer:

         # Helper function to process each input in parallel
         async def process_input(
-
+            input_index: int,
             data: SyntheticData,
             progress: Optional[Progress] = None,
         ):
@@ -654,7 +654,7 @@ class Synthesizer:
                 num_evolutions=self.evolution_config.num_evolutions,
                 evolutions=self.evolution_config.evolutions,
                 progress=progress,
-                pbar_evolve_input_id=pbar_evolve_input_ids[
+                pbar_evolve_input_id=pbar_evolve_input_ids[input_index],
                 remove_pbar=False,
             )
@@ -672,7 +672,7 @@ class Synthesizer:
             )
             evolved_input = res.input
             update_pbar(
-                progress, pbar_evolve_input_ids[
+                progress, pbar_evolve_input_ids[input_index], remove=False
             )
@@ -685,7 +685,7 @@ class Synthesizer:
             )
             expected_output = await self._a_generate(expected_output_prompt)
             update_pbar(
-                progress, pbar_evolve_input_ids[
+                progress, pbar_evolve_input_ids[input_index], remove=False
             )

             # Create Golden
@@ -694,13 +694,14 @@ class Synthesizer:
                 context=context,
                 expected_output=expected_output,
                 source_file=(
-                    source_files[
-                    if source_files is not None
+                    source_files[context_index]
+                    if source_files is not None
+                    and context_index < len(source_files)
                     else None
                 ),
                 additional_metadata={
                     "evolutions": evolutions_used,
-                    "synthetic_input_quality": scores[
+                    "synthetic_input_quality": scores[input_index],
                     # "context_quality": (
                     #     context_scores[data_index]
                     #     if context_scores is not None
```