deepeval 3.6.7__py3-none-any.whl → 3.6.9__py3-none-any.whl

This diff compares two publicly released versions of the package as published to their public registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in that registry.
Files changed (90)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +104 -36
  3. deepeval/config/utils.py +5 -0
  4. deepeval/dataset/dataset.py +162 -30
  5. deepeval/dataset/utils.py +41 -13
  6. deepeval/errors.py +20 -2
  7. deepeval/evaluate/execute.py +1662 -688
  8. deepeval/evaluate/types.py +1 -0
  9. deepeval/evaluate/utils.py +13 -3
  10. deepeval/integrations/crewai/__init__.py +2 -1
  11. deepeval/integrations/crewai/tool.py +71 -0
  12. deepeval/integrations/llama_index/__init__.py +0 -4
  13. deepeval/integrations/llama_index/handler.py +20 -21
  14. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  15. deepeval/metrics/__init__.py +13 -0
  16. deepeval/metrics/base_metric.py +1 -0
  17. deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
  18. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  19. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
  20. deepeval/metrics/dag/schema.py +1 -1
  21. deepeval/metrics/dag/templates.py +2 -2
  22. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  23. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  24. deepeval/metrics/goal_accuracy/schema.py +17 -0
  25. deepeval/metrics/goal_accuracy/template.py +235 -0
  26. deepeval/metrics/hallucination/hallucination.py +8 -8
  27. deepeval/metrics/indicator.py +21 -1
  28. deepeval/metrics/mcp/mcp_task_completion.py +7 -2
  29. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
  30. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
  31. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
  32. deepeval/metrics/plan_adherence/__init__.py +1 -0
  33. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  34. deepeval/metrics/plan_adherence/schema.py +11 -0
  35. deepeval/metrics/plan_adherence/template.py +170 -0
  36. deepeval/metrics/plan_quality/__init__.py +1 -0
  37. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  38. deepeval/metrics/plan_quality/schema.py +11 -0
  39. deepeval/metrics/plan_quality/template.py +101 -0
  40. deepeval/metrics/step_efficiency/__init__.py +1 -0
  41. deepeval/metrics/step_efficiency/schema.py +11 -0
  42. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  43. deepeval/metrics/step_efficiency/template.py +256 -0
  44. deepeval/metrics/task_completion/task_completion.py +1 -0
  45. deepeval/metrics/tool_correctness/schema.py +6 -0
  46. deepeval/metrics/tool_correctness/template.py +88 -0
  47. deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
  48. deepeval/metrics/tool_use/__init__.py +1 -0
  49. deepeval/metrics/tool_use/schema.py +19 -0
  50. deepeval/metrics/tool_use/template.py +220 -0
  51. deepeval/metrics/tool_use/tool_use.py +458 -0
  52. deepeval/metrics/topic_adherence/__init__.py +1 -0
  53. deepeval/metrics/topic_adherence/schema.py +16 -0
  54. deepeval/metrics/topic_adherence/template.py +162 -0
  55. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  56. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  57. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  58. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  59. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  60. deepeval/models/llms/amazon_bedrock_model.py +20 -17
  61. deepeval/models/llms/openai_model.py +10 -1
  62. deepeval/models/retry_policy.py +103 -20
  63. deepeval/openai/extractors.py +61 -16
  64. deepeval/openai/patch.py +8 -12
  65. deepeval/openai/types.py +1 -1
  66. deepeval/openai/utils.py +108 -1
  67. deepeval/prompt/prompt.py +1 -0
  68. deepeval/prompt/utils.py +43 -14
  69. deepeval/simulator/conversation_simulator.py +25 -18
  70. deepeval/synthesizer/chunking/context_generator.py +9 -1
  71. deepeval/synthesizer/synthesizer.py +11 -10
  72. deepeval/test_case/llm_test_case.py +6 -2
  73. deepeval/test_run/test_run.py +190 -207
  74. deepeval/tracing/__init__.py +2 -1
  75. deepeval/tracing/otel/exporter.py +3 -4
  76. deepeval/tracing/otel/utils.py +23 -4
  77. deepeval/tracing/trace_context.py +53 -38
  78. deepeval/tracing/tracing.py +23 -0
  79. deepeval/tracing/types.py +16 -14
  80. deepeval/utils.py +21 -0
  81. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/METADATA +1 -1
  82. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/RECORD +85 -63
  83. deepeval/integrations/llama_index/agent/patched.py +0 -68
  84. deepeval/tracing/message_types/__init__.py +0 -10
  85. deepeval/tracing/message_types/base.py +0 -6
  86. deepeval/tracing/message_types/messages.py +0 -14
  87. deepeval/tracing/message_types/tools.py +0 -18
  88. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/LICENSE.md +0 -0
  89. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/WHEEL +0 -0
  90. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/entry_points.txt +0 -0
deepeval/openai/extractors.py CHANGED
@@ -4,17 +4,26 @@ from typing import Any, Union, Dict
 from openai.types.responses import Response
 
 from deepeval.test_case.llm_test_case import ToolCall
-from deepeval.openai.utils import stringify_multimodal_content
+from deepeval.openai.utils import (
+    render_response_input,
+    stringify_multimodal_content,
+    render_messages,
+)
 from deepeval.openai.types import InputParameters, OutputParameters
+from deepeval.tracing.types import Message
 
 
-def extract_input_parameters(
+# guarding against errors to be compatible with legacy APIs
+def safe_extract_input_parameters(
     is_completion: bool, kwargs: Dict[str, Any]
 ) -> InputParameters:
-    if is_completion:
-        return extract_input_parameters_from_completion(kwargs)
-    else:
-        return extract_input_parameters_from_response(kwargs)
+    try:
+        if is_completion:
+            return extract_input_parameters_from_completion(kwargs)
+        else:
+            return extract_input_parameters_from_response(kwargs)
+    except:
+        return InputParameters(model="NA")
 
 
 def extract_input_parameters_from_completion(
@@ -43,6 +52,9 @@ def extract_input_parameters_from_completion(
     if len(user_messages) > 0:
         input_arg = user_messages[0]
 
+    # render messages
+    messages = render_messages(messages)
+
     return InputParameters(
         model=model,
         input=stringify_multimodal_content(input_arg),
@@ -64,7 +76,24 @@ def extract_input_parameters_from_response(
         if tools is not None
         else None
     )
-    messages = input_payload if isinstance(input_payload, list) else None
+    messages = []
+    if isinstance(input_payload, list):
+        messages = render_response_input(input_payload)
+    elif isinstance(input_payload, str):
+        messages = [
+            {
+                "role": "user",
+                "content": input_payload,
+            }
+        ]
+        if instructions:
+            messages.insert(
+                0,
+                {
+                    "role": "system",
+                    "content": instructions,
+                },
+            )
     return InputParameters(
         model=model,
         input=stringify_multimodal_content(input_payload),
@@ -75,19 +104,24 @@ def extract_input_parameters_from_response(
     )
 
 
-def extract_output_parameters(
+def safe_extract_output_parameters(
     is_completion: bool,
     response: Union[ChatCompletion, ParsedChatCompletion, Response],
     input_parameters: InputParameters,
 ) -> OutputParameters:
-    if is_completion:
-        return extract_output_parameters_from_completion(
-            response, input_parameters
-        )
-    else:
-        return extract_output_parameters_from_response(
-            response, input_parameters
-        )
+
+    # guarding against errors to be compatible with legacy APIs
+    try:
+        if is_completion:
+            return extract_output_parameters_from_completion(
+                response, input_parameters
+            )
+        else:
+            return extract_output_parameters_from_response(
+                response, input_parameters
+            )
+    except:
+        return OutputParameters()
 
 
 def extract_output_parameters_from_completion(
@@ -113,6 +147,12 @@ def extract_output_parameters_from_completion(
         )
     )
 
+    if not output and tools_called:
+        tool_calls = []
+        for tool_call in tools_called:
+            tool_calls.append(tool_call)
+        output = tool_calls
+
     return OutputParameters(
         output=output,
         prompt_tokens=prompt_tokens,
@@ -144,6 +184,11 @@ def extract_output_parameters_from_response(
                 description=tool_descriptions.get(tool_call.name),
             )
         )
+    if not output and tools_called:
+        tool_calls = []
+        for tool_call in tools_called:
+            tool_calls.append(tool_call)
+        output = tool_calls
 
     return OutputParameters(
         output=output,
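The `safe_` prefix marks the pattern at work here: extraction is wrapped in a broad try/except so an unexpected payload shape from a legacy API degrades to placeholder parameters instead of breaking the patched client call. A minimal standalone sketch of that guard (`Params` and `extract` are hypothetical stand-ins, not deepeval names):

```python
from typing import Any, Dict, Optional

from pydantic import BaseModel


class Params(BaseModel):
    # hypothetical stand-in for InputParameters
    model: str = "NA"
    input: Optional[Any] = None


def extract(kwargs: Dict[str, Any]) -> Params:
    # may raise KeyError/IndexError on legacy payload shapes
    return Params(model=kwargs["model"], input=kwargs["messages"][0]["content"])


def safe_extract(kwargs: Dict[str, Any]) -> Params:
    # Tracing must never break the user's API call, so fall back to a
    # placeholder instead of propagating extraction errors.
    try:
        return extract(kwargs)
    except Exception:
        return Params(model="NA")


print(safe_extract({"model": "gpt-4o", "messages": [{"content": "hi"}]}))
print(safe_extract({}))  # Params(model='NA', input=None)
```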
deepeval/openai/patch.py CHANGED
@@ -3,8 +3,8 @@ from functools import wraps
 
 
 from deepeval.openai.extractors import (
-    extract_output_parameters,
-    extract_input_parameters,
+    safe_extract_output_parameters,
+    safe_extract_input_parameters,
     InputParameters,
     OutputParameters,
 )
@@ -16,7 +16,6 @@ from deepeval.tracing.context import (
 )
 from deepeval.tracing import observe
 from deepeval.tracing.trace_context import current_llm_context
-from deepeval.openai.utils import create_child_tool_spans
 
 # Store original methods for safety and potential unpatching
 _ORIGINAL_METHODS = {}
@@ -123,7 +122,7 @@ def _patch_async_openai_client_method(
 ):
     @wraps(orig_method)
     async def patched_async_openai_method(*args, **kwargs):
-        input_parameters: InputParameters = extract_input_parameters(
+        input_parameters: InputParameters = safe_extract_input_parameters(
             is_completion_method, kwargs
         )
 
@@ -137,7 +136,7 @@ def _patch_async_openai_client_method(
     )
     async def llm_generation(*args, **kwargs):
         response = await orig_method(*args, **kwargs)
-        output_parameters = extract_output_parameters(
+        output_parameters = safe_extract_output_parameters(
             is_completion_method, response, input_parameters
         )
         _update_all_attributes(
@@ -162,7 +161,7 @@ def _patch_sync_openai_client_method(
 ):
     @wraps(orig_method)
     def patched_sync_openai_method(*args, **kwargs):
-        input_parameters: InputParameters = extract_input_parameters(
+        input_parameters: InputParameters = safe_extract_input_parameters(
             is_completion_method, kwargs
         )
 
@@ -176,7 +175,7 @@ def _patch_sync_openai_client_method(
     )
     def llm_generation(*args, **kwargs):
         response = orig_method(*args, **kwargs)
-        output_parameters = extract_output_parameters(
+        output_parameters = safe_extract_output_parameters(
             is_completion_method, response, input_parameters
         )
         _update_all_attributes(
@@ -205,8 +204,8 @@ def _update_all_attributes(
 ):
     """Update span and trace attributes with input/output parameters."""
     update_current_span(
-        input=input_parameters.input or input_parameters.messages or "NA",
-        output=output_parameters.output or "NA",
+        input=input_parameters.messages,
+        output=output_parameters.output or output_parameters.tools_called,
         tools_called=output_parameters.tools_called,
         # attributes to be added
        expected_output=expected_output,
@@ -223,9 +222,6 @@ def _update_all_attributes(
         prompt=llm_context.prompt,
     )
 
-    if output_parameters.tools_called:
-        create_child_tool_spans(output_parameters)
-
     __update_input_and_output_of_current_trace(
         input_parameters, output_parameters
     )
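For orientation, the extractors above are installed by wrapping the OpenAI client methods in place: `functools.wraps` keeps the patched method's metadata, and the originals are stashed so they can be restored. A hedged sketch of that shape with made-up names (the real `_patch_*` functions also open spans and handle async clients):

```python
import functools

_ORIGINAL_METHODS = {}  # kept for safety and potential unpatching


def patch_method(owner: type, method_name: str, on_result) -> None:
    orig_method = getattr(owner, method_name)
    _ORIGINAL_METHODS[(owner, method_name)] = orig_method

    @functools.wraps(orig_method)  # preserve the original's name/docstring
    def patched(self, *args, **kwargs):
        response = orig_method(self, *args, **kwargs)
        on_result(kwargs, response)  # e.g. safe_extract_output_parameters
        return response

    setattr(owner, method_name, patched)


class FakeCompletions:  # stand-in for the openai client's completions resource
    def create(self, **kwargs):
        return {"choices": [{"message": {"content": "hello"}}]}


patch_method(FakeCompletions, "create", lambda kw, resp: print("traced:", resp))
print(FakeCompletions().create(model="gpt-4o"))
```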
deepeval/openai/types.py CHANGED
@@ -14,7 +14,7 @@ class InputParameters(BaseModel):
 
 
 class OutputParameters(BaseModel):
-    output: Optional[str] = None
+    output: Optional[Any] = None
     prompt_tokens: Optional[int] = None
     completion_tokens: Optional[int] = None
     tools_called: Optional[List[ToolCall]] = None
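Widening `output` from `Optional[str]` to `Optional[Any]` is what lets the extractors above fall back to a list of tool calls when the model returns no text. A quick illustration of the loosened validation (simplified model, not the full class):

```python
from typing import Any, Optional

from pydantic import BaseModel


class OutputParameters(BaseModel):  # simplified: token/tool fields omitted
    output: Optional[Any] = None


# Both now validate; with Optional[str] the second raised a ValidationError.
OutputParameters(output="Paris is the capital of France.")
OutputParameters(output=[{"name": "get_weather", "arguments": {"city": "Paris"}}])
```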
deepeval/openai/utils.py CHANGED
@@ -1,6 +1,10 @@
 import json
 import uuid
-from typing import Any, List, Optional
+from typing import Any, Dict, List, Optional, Iterable
+
+from openai.types.chat.chat_completion_message_param import (
+    ChatCompletionMessageParam,
+)
 
 from deepeval.tracing.types import ToolSpan, TraceSpanStatus
 from deepeval.tracing.context import current_span_context
@@ -126,3 +130,106 @@ def stringify_multimodal_content(content: Any) -> str:
 
     # unknown dicts and types returned as shortened JSON
     return _compact_dump(content)
+
+
+def render_messages(
+    messages: Iterable[ChatCompletionMessageParam],
+) -> List[Dict[str, Any]]:
+
+    messages_list = []
+
+    for message in messages:
+        role = message.get("role")
+        content = message.get("content")
+        if role == "assistant" and message.get("tool_calls"):
+            tool_calls = message.get("tool_calls")
+            if isinstance(tool_calls, list):
+                for tool_call in tool_calls:
+                    # Extract type - either "function" or "custom"
+                    tool_type = tool_call.get("type", "function")
+
+                    # Extract name and arguments based on type
+                    if tool_type == "function":
+                        function_data = tool_call.get("function", {})
+                        name = function_data.get("name", "")
+                        arguments = function_data.get("arguments", "")
+                    elif tool_type == "custom":
+                        custom_data = tool_call.get("custom", {})
+                        name = custom_data.get("name", "")
+                        arguments = custom_data.get("input", "")
+                    else:
+                        name = ""
+                        arguments = ""
+
+                    messages_list.append(
+                        {
+                            "id": tool_call.get("id", ""),
+                            "call_id": tool_call.get(
+                                "id", ""
+                            ),  # OpenAI uses 'id', not 'call_id'
+                            "name": name,
+                            "type": tool_type,
+                            "arguments": json.loads(arguments),
+                        }
+                    )
+
+        elif role == "tool":
+            messages_list.append(
+                {
+                    "call_id": message.get("tool_call_id", ""),
+                    "type": role,  # "tool"
+                    "output": message.get("content", {}),
+                }
+            )
+        else:
+            messages_list.append(
+                {
+                    "role": role,
+                    "content": content,
+                }
+            )
+
+    return messages_list
+
+
+def render_response_input(input: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+
+    messages_list = []
+
+    for item in input:
+        type = item.get("type")
+        role = item.get("role")
+
+        if type == "message":
+            messages_list.append(
+                {
+                    "role": role,
+                    "content": item.get("content"),
+                }
+            )
+        else:
+            messages_list.append(item)
+
+    return messages_list
+
+
+def _render_content(content: Dict[str, Any], indent: int = 0) -> str:
+    """
+    Renders a dictionary as a formatted string with indentation for nested structures.
+    """
+    if not content:
+        return ""
+
+    lines = []
+    prefix = " " * indent
+
+    for key, value in content.items():
+        if isinstance(value, dict):
+            lines.append(f"{prefix}{key}:")
+            lines.append(_render_content(value, indent + 1))
+        elif isinstance(value, list):
+            lines.append(f"{prefix}{key}: {_compact_dump(value)}")
+        else:
+            lines.append(f"{prefix}{key}: {value}")
+
+    return "\n".join(lines)
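Taken together, `render_messages` normalizes Chat Completions history into flat dicts: assistant `tool_calls` become call records keyed by `call_id`, `tool` results become output records, and plain messages keep their role and content. A small usage sketch (requires deepeval 3.6.9; the expected output is inferred from the implementation above):

```python
from deepeval.openai.utils import render_messages

messages = [
    {"role": "system", "content": "You are terse."},
    {
        "role": "assistant",
        "tool_calls": [
            {
                "id": "call_1",
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "arguments": '{"city": "Paris"}',
                },
            }
        ],
    },
    {"role": "tool", "tool_call_id": "call_1", "content": "18C, clear"},
]

for rendered in render_messages(messages):
    print(rendered)
# {'role': 'system', 'content': 'You are terse.'}
# {'id': 'call_1', 'call_id': 'call_1', 'name': 'get_weather',
#  'type': 'function', 'arguments': {'city': 'Paris'}}
# {'call_id': 'call_1', 'type': 'tool', 'output': '18C, clear'}
```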
deepeval/prompt/prompt.py CHANGED
@@ -202,6 +202,7 @@ class Prompt:
                 "Unable to interpolate empty prompt template. Please pull a prompt from Confident AI or set template manually to continue."
             )
 
+        print("@@@@@")
         return interpolate_text(interpolation_type, text_template, **kwargs)
 
     elif prompt_type == PromptType.LIST:
deepeval/prompt/utils.py CHANGED
@@ -1,7 +1,7 @@
 import re
 import uuid
 from jinja2 import Template
-from typing import Any, Dict, Type, Optional, List
+from typing import Any, Dict, Type, Optional, List, Match
 from pydantic import BaseModel, create_model
 
 from deepeval.prompt.api import (
@@ -16,36 +16,65 @@ from deepeval.prompt.api import (
 ###################################
 
 
-def interpolate_mustache(text: str, **kwargs) -> str:
+def interpolate_mustache(text: str, **kwargs: Any) -> str:
     """Interpolate using Mustache format: {{variable}}"""
-    formatted_template = re.sub(r"\{\{(\w+)\}\}", r"{\1}", text)
-    return formatted_template.format(**kwargs)
 
+    def replace_match(match: Match[str]) -> str:
+        var_name = match.group(1)
+        if var_name in kwargs:
+            return str(kwargs[var_name])
+        # Raise error for missing variables to maintain consistency
+        raise KeyError(f"Missing variable in template: {var_name}")
 
-def interpolate_mustache_with_space(text: str, **kwargs) -> str:
+    return re.sub(r"\{\{([a-zA-Z_][a-zA-Z0-9_]*)\}\}", replace_match, text)
+
+
+def interpolate_mustache_with_space(text: str, **kwargs: Any) -> str:
     """Interpolate using Mustache with space format: {{ variable }}"""
-    formatted_template = re.sub(r"\{\{ (\w+) \}\}", r"{\1}", text)
-    return formatted_template.format(**kwargs)
+
+    def replace_match(match: Match[str]) -> str:
+        var_name = match.group(1)
+        if var_name in kwargs:
+            return str(kwargs[var_name])
+        # Raise error for missing variables to maintain consistency
+        raise KeyError(f"Missing variable in template: {var_name}")
+
+    return re.sub(r"\{\{ ([a-zA-Z_][a-zA-Z0-9_]*) \}\}", replace_match, text)
 
 
-def interpolate_fstring(text: str, **kwargs) -> str:
+def interpolate_fstring(text: str, **kwargs: Any) -> str:
     """Interpolate using F-string format: {variable}"""
-    return text.format(**kwargs)
 
+    def replace_match(match: Match[str]) -> str:
+        var_name = match.group(1)
+        if var_name in kwargs:
+            return str(kwargs[var_name])
+        # Raise error for missing variables to maintain consistency
+        raise KeyError(f"Missing variable in template: {var_name}")
 
-def interpolate_dollar_brackets(text: str, **kwargs) -> str:
+    return re.sub(r"\{([a-zA-Z_][a-zA-Z0-9_]*)\}", replace_match, text)
+
+
+def interpolate_dollar_brackets(text: str, **kwargs: Any) -> str:
     """Interpolate using Dollar Brackets format: ${variable}"""
-    formatted_template = re.sub(r"\$\{(\w+)\}", r"{\1}", text)
-    return formatted_template.format(**kwargs)
+
+    def replace_match(match: Match[str]) -> str:
+        var_name = match.group(1)
+        if var_name in kwargs:
+            return str(kwargs[var_name])
+        # Raise error for missing variables to maintain consistency
+        raise KeyError(f"Missing variable in template: {var_name}")
+
+    return re.sub(r"\$\{([a-zA-Z_][a-zA-Z0-9_]*)\}", replace_match, text)
 
 
-def interpolate_jinja(text: str, **kwargs) -> str:
+def interpolate_jinja(text: str, **kwargs: Any) -> str:
     template = Template(text)
     return template.render(**kwargs)
 
 
 def interpolate_text(
-    interpolation_type: PromptInterpolationType, text: str, **kwargs
+    interpolation_type: PromptInterpolationType, text: str, **kwargs: Any
 ) -> str:
     """Apply the appropriate interpolation method based on the type"""
     if interpolation_type == PromptInterpolationType.MUSTACHE:
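The net effect of these rewrites: interpolation no longer round-trips through `str.format`, so literal braces in a template (JSON examples, CSS, code snippets) no longer have to be escaped, and a missing variable fails with an explicit `KeyError` naming it. A standalone check of the new strategy, copying the mustache variant from the diff:

```python
import re
from typing import Any, Match


def interpolate_mustache(text: str, **kwargs: Any) -> str:
    def replace_match(match: Match[str]) -> str:
        var_name = match.group(1)
        if var_name in kwargs:
            return str(kwargs[var_name])
        raise KeyError(f"Missing variable in template: {var_name}")

    # Only {{identifier}} placeholders are rewritten; other braces pass through.
    return re.sub(r"\{\{([a-zA-Z_][a-zA-Z0-9_]*)\}\}", replace_match, text)


# Literal braces survive; under the old .format() approach this line raised.
print(interpolate_mustache('Answer {{q}} as {"answer": ...}', q="2+2?"))

try:
    interpolate_mustache("Hello {{name}}")
except KeyError as exc:
    print(exc)  # 'Missing variable in template: name'
```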
deepeval/simulator/conversation_simulator.py CHANGED
@@ -35,7 +35,6 @@ class ConversationSimulator:
         self,
         model_callback: Callable[[str], str],
         simulator_model: Optional[Union[str, DeepEvalBaseLLM]] = None,
-        opening_message: Optional[str] = None,
         max_concurrent: int = 5,
         async_mode: bool = True,
         language: str = "English",
@@ -45,7 +44,6 @@ class ConversationSimulator:
         self.is_callback_async = inspect.iscoroutinefunction(
             self.model_callback
         )
-        self.opening_message = opening_message
         self.semaphore = asyncio.Semaphore(max_concurrent)
         self.async_mode = async_mode
         self.language = language
@@ -68,6 +66,9 @@ class ConversationSimulator:
         self,
         conversational_goldens: List[ConversationalGolden],
         max_user_simulations: int = 10,
+        on_simulation_complete: Optional[
+            Callable[[ConversationalTestCase, int], None]
+        ] = None,
     ) -> List[ConversationalTestCase]:
         self.simulation_cost = 0 if self.using_native_model else None
 
@@ -87,6 +88,7 @@ class ConversationSimulator:
                 self._a_simulate(
                     conversational_goldens=conversational_goldens,
                     max_user_simulations=max_user_simulations,
+                    on_simulation_complete=on_simulation_complete,
                     progress=progress,
                     pbar_id=pbar_id,
                 )
@@ -103,6 +105,7 @@ class ConversationSimulator:
                     index=conversation_index,
                     progress=progress,
                     pbar_id=pbar_id,
+                    on_simulation_complete=on_simulation_complete,
                 )
             )
             conversational_test_cases.append(conversational_test_case)
@@ -115,6 +118,9 @@ class ConversationSimulator:
         self,
         conversational_goldens: List[ConversationalGolden],
         max_user_simulations: int,
+        on_simulation_complete: Optional[
+            Callable[[ConversationalTestCase, int], None]
+        ] = None,
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
     ) -> List[ConversationalTestCase]:
@@ -131,6 +137,7 @@ class ConversationSimulator:
                 index=conversation_index,
                 progress=progress,
                 pbar_id=pbar_id,
+                on_simulation_complete=on_simulation_complete,
             )
 
         tasks = [
@@ -150,6 +157,9 @@ class ConversationSimulator:
         index: int,
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
+        on_simulation_complete: Optional[
+            Callable[[ConversationalTestCase, int], None]
+        ] = None,
     ) -> ConversationalTestCase:
         simulation_counter = 0
         if max_user_simulations <= 0:
@@ -166,8 +176,6 @@
         user_input = None
         thread_id = str(uuid.uuid4())
         turns: List[Turn] = []
-        if self.opening_message and golden.turns is None:
-            turns.append(Turn(role="assistant", content=self.opening_message))
 
         if golden.turns is not None:
             turns.extend(golden.turns)
@@ -187,11 +195,7 @@
             if simulation_counter >= max_user_simulations:
                 update_pbar(progress, pbar_max_user_simluations_id)
                 break
-            if len(turns) == 0 or (
-                len(turns) == 1
-                and self.opening_message
-                and golden.turns is None
-            ):
+            if len(turns) == 0:
                 # Generate first user input
                 user_input = self.generate_first_user_input(golden)
                 turns.append(Turn(role="user", content=user_input))
@@ -225,7 +229,7 @@
             turns.append(turn)
 
         update_pbar(progress, pbar_id)
-        return ConversationalTestCase(
+        conversational_test_case = ConversationalTestCase(
             turns=turns,
             scenario=golden.scenario,
             expected_outcome=golden.expected_outcome,
@@ -241,6 +245,9 @@
             _dataset_alias=golden._dataset_alias,
             _dataset_id=golden._dataset_id,
         )
+        if on_simulation_complete:
+            on_simulation_complete(conversational_test_case, index)
+        return conversational_test_case
 
     async def _a_simulate_single_conversation(
         self,
@@ -249,6 +256,9 @@
         index: Optional[int] = None,
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
+        on_simulation_complete: Optional[
+            Callable[[ConversationalTestCase, int], None]
+        ] = None,
     ) -> ConversationalTestCase:
         simulation_counter = 0
         if max_user_simulations <= 0:
@@ -265,8 +275,6 @@
         user_input = None
         thread_id = str(uuid.uuid4())
         turns: List[Turn] = []
-        if self.opening_message and golden.turns is None:
-            turns.append(Turn(role="assistant", content=self.opening_message))
 
         if golden.turns is not None:
             turns.extend(golden.turns)
@@ -286,11 +294,7 @@
             if simulation_counter >= max_user_simulations:
                 update_pbar(progress, pbar_max_user_simluations_id)
                 break
-            if len(turns) == 0 or (
-                len(turns) == 1
-                and self.opening_message
-                and golden.turns is None
-            ):
+            if len(turns) == 0:
                 # Generate first user input
                 user_input = await self.a_generate_first_user_input(golden)
                 turns.append(Turn(role="user", content=user_input))
@@ -324,7 +328,7 @@
             turns.append(turn)
 
         update_pbar(progress, pbar_id)
-        return ConversationalTestCase(
+        conversational_test_case = ConversationalTestCase(
             turns=turns,
             scenario=golden.scenario,
             expected_outcome=golden.expected_outcome,
@@ -340,6 +344,9 @@
             _dataset_alias=golden._dataset_alias,
             _dataset_id=golden._dataset_id,
         )
+        if on_simulation_complete:
+            on_simulation_complete(conversational_test_case, index)
+        return conversational_test_case
 
     ############################################
     ### Generate User Inputs ###################
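Two behavioral changes stand out in this file: the `opening_message` constructor argument is gone (a fixed assistant greeting now has to arrive through the golden's existing turns), and a new `on_simulation_complete` callback receives each finished `ConversationalTestCase` together with its golden's index as soon as that conversation ends. A hedged usage sketch; import paths follow the file layout above, and `app_callback`/`goldens` are placeholders:

```python
from deepeval.dataset import ConversationalGolden
from deepeval.simulator.conversation_simulator import ConversationSimulator
from deepeval.test_case import ConversationalTestCase


def app_callback(user_input: str) -> str:
    return f"(your app's reply to: {user_input})"  # placeholder LLM app


def on_simulation_complete(test_case: ConversationalTestCase, index: int) -> None:
    # Fires per conversation, before the whole batch returns, which makes
    # incremental persistence of long simulation runs possible.
    print(f"conversation #{index} finished with {len(test_case.turns)} turns")


goldens = [
    ConversationalGolden(
        scenario="User asks for a refund on a damaged item.",
        expected_outcome="Refund is initiated.",
    )
]

simulator = ConversationSimulator(model_callback=app_callback)
test_cases = simulator.simulate(
    conversational_goldens=goldens,
    max_user_simulations=5,
    on_simulation_complete=on_simulation_complete,
)
```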
deepeval/synthesizer/chunking/context_generator.py CHANGED
@@ -249,8 +249,16 @@ class ContextGenerator:
 
         except Exception as exc:
             # record and continue with other docs
+            show_trace = bool(get_settings().DEEPEVAL_LOG_STACK_TRACES)
+            exc_info = (
+                (type(exc), exc, getattr(exc, "__traceback__", None))
+                if show_trace
+                else None
+            )
             logger.exception(
-                "Document pipeline failed for %s", path, exc_info=exc
+                "Document pipeline failed for %s",
+                path,
+                exc_info=exc_info,
             )
         finally:
             # drop the collection asap to avoid too many open collections
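This change makes stack traces opt-in: the one-line failure message is always recorded, but the traceback tuple is attached only when the `DEEPEVAL_LOG_STACK_TRACES` setting is on. The same pattern in isolation, with the settings lookup replaced by an environment variable for the sketch:

```python
import logging
import os

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

path = "docs/report.pdf"  # illustrative document path
try:
    raise ValueError("boom")  # stand-in for a failing document pipeline
except Exception as exc:
    show_trace = bool(os.getenv("DEEPEVAL_LOG_STACK_TRACES"))
    # exc_info=None suppresses the traceback even in logger.exception();
    # the (type, value, traceback) tuple attaches it explicitly.
    exc_info = (
        (type(exc), exc, getattr(exc, "__traceback__", None))
        if show_trace
        else None
    )
    logger.exception("Document pipeline failed for %s", path, exc_info=exc_info)
```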
deepeval/synthesizer/synthesizer.py CHANGED
@@ -555,7 +555,7 @@ class Synthesizer:
                 include_expected_output=include_expected_output,
                 max_goldens_per_context=max_goldens_per_context,
                 source_files=source_files,
-                index=index,
+                context_index=index,
                 progress=progress,
                 pbar_id=pbar_id,
                 context_scores=_context_scores,
@@ -577,7 +577,7 @@
         include_expected_output: bool,
         max_goldens_per_context: int,
         source_files: Optional[List[str]],
-        index: int,
+        context_index: int,
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
         context_scores: Optional[List[float]] = None,
@@ -599,7 +599,7 @@
         # Add pbars
         pbar_generate_goldens_id = add_pbar(
             progress,
-            f"\t⚡ Generating goldens from context #{index}",
+            f"\t⚡ Generating goldens from context #{context_index}",
             total=1 + max_goldens_per_context,
         )
         pbar_generate_inputs_id = add_pbar(
@@ -643,7 +643,7 @@
 
         # Helper function to process each input in parallel
         async def process_input(
-            index: int,
+            input_index: int,
             data: SyntheticData,
             progress: Optional[Progress] = None,
         ):
@@ -654,7 +654,7 @@
                 num_evolutions=self.evolution_config.num_evolutions,
                 evolutions=self.evolution_config.evolutions,
                 progress=progress,
-                pbar_evolve_input_id=pbar_evolve_input_ids[index],
+                pbar_evolve_input_id=pbar_evolve_input_ids[input_index],
                 remove_pbar=False,
             )
 
@@ -672,7 +672,7 @@
             )
             evolved_input = res.input
             update_pbar(
-                progress, pbar_evolve_input_ids[index], remove=False
+                progress, pbar_evolve_input_ids[input_index], remove=False
             )
 
             # Generate expected output
@@ -685,7 +685,7 @@
             )
             expected_output = await self._a_generate(expected_output_prompt)
             update_pbar(
-                progress, pbar_evolve_input_ids[index], remove=False
+                progress, pbar_evolve_input_ids[input_index], remove=False
             )
 
             # Create Golden
@@ -694,13 +694,14 @@
                 context=context,
                 expected_output=expected_output,
                 source_file=(
-                    source_files[index]
-                    if source_files is not None and index < len(source_files)
+                    source_files[context_index]
+                    if source_files is not None
+                    and context_index < len(source_files)
                     else None
                 ),
                 additional_metadata={
                     "evolutions": evolutions_used,
-                    "synthetic_input_quality": scores[index],
+                    "synthetic_input_quality": scores[input_index],
                     # "context_quality": (
                     #     context_scores[data_index]
                     #     if context_scores is not None
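These renames are not cosmetic: `process_input` is defined inside the method whose own parameter was also named `index`, so the inner parameter shadowed the outer context index, and lookups like `source_files[index]` silently used the input's position instead of the document's. A stripped-down illustration of that shadowing hazard (hypothetical data, not deepeval code):

```python
source_files = ["doc_a.txt", "doc_b.txt", "doc_c.txt"]
inputs = ["q1", "q2"]


def buggy(index: int) -> list:  # `index` is meant to pick the source document
    out = []
    for index, _ in enumerate(inputs):  # reuses the name, shadowing the caller's
        out.append(source_files[index])  # now reads the *input* position
    return out


def fixed(context_index: int) -> list:
    out = []
    for input_index, _ in enumerate(inputs):  # distinct names, distinct meanings
        out.append(source_files[context_index])  # per-context source file
    return out


print(buggy(2))  # ['doc_a.txt', 'doc_b.txt']  (wrong documents attributed)
print(fixed(2))  # ['doc_c.txt', 'doc_c.txt']
```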