deepeval 3.6.8__py3-none-any.whl → 3.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. deepeval/_version.py +1 -1
  2. deepeval/anthropic/__init__.py +19 -0
  3. deepeval/anthropic/extractors.py +94 -0
  4. deepeval/anthropic/patch.py +169 -0
  5. deepeval/anthropic/utils.py +225 -0
  6. deepeval/benchmarks/drop/drop.py +40 -14
  7. deepeval/benchmarks/ifeval/ifeval.py +2 -2
  8. deepeval/confident/types.py +4 -2
  9. deepeval/config/settings.py +258 -47
  10. deepeval/config/settings_manager.py +4 -0
  11. deepeval/config/utils.py +5 -0
  12. deepeval/dataset/dataset.py +162 -30
  13. deepeval/dataset/utils.py +41 -13
  14. deepeval/evaluate/execute.py +1099 -633
  15. deepeval/integrations/crewai/handler.py +36 -0
  16. deepeval/integrations/langchain/callback.py +27 -2
  17. deepeval/integrations/llama_index/handler.py +58 -4
  18. deepeval/integrations/llama_index/utils.py +24 -0
  19. deepeval/metrics/__init__.py +5 -0
  20. deepeval/metrics/exact_match/__init__.py +0 -0
  21. deepeval/metrics/exact_match/exact_match.py +94 -0
  22. deepeval/metrics/indicator.py +21 -1
  23. deepeval/metrics/pattern_match/__init__.py +0 -0
  24. deepeval/metrics/pattern_match/pattern_match.py +103 -0
  25. deepeval/metrics/task_completion/task_completion.py +9 -2
  26. deepeval/model_integrations/__init__.py +0 -0
  27. deepeval/model_integrations/utils.py +116 -0
  28. deepeval/models/base_model.py +3 -1
  29. deepeval/models/llms/amazon_bedrock_model.py +20 -17
  30. deepeval/models/llms/openai_model.py +10 -1
  31. deepeval/models/retry_policy.py +103 -20
  32. deepeval/openai/__init__.py +3 -1
  33. deepeval/openai/extractors.py +2 -2
  34. deepeval/openai/utils.py +7 -31
  35. deepeval/prompt/api.py +11 -10
  36. deepeval/prompt/prompt.py +5 -4
  37. deepeval/simulator/conversation_simulator.py +25 -18
  38. deepeval/synthesizer/chunking/context_generator.py +9 -1
  39. deepeval/telemetry.py +3 -3
  40. deepeval/test_case/llm_test_case.py +3 -2
  41. deepeval/test_run/api.py +3 -2
  42. deepeval/test_run/cache.py +4 -3
  43. deepeval/test_run/test_run.py +24 -5
  44. deepeval/tracing/api.py +11 -10
  45. deepeval/tracing/otel/exporter.py +11 -0
  46. deepeval/tracing/patchers.py +102 -1
  47. deepeval/tracing/trace_context.py +13 -4
  48. deepeval/tracing/tracing.py +10 -1
  49. deepeval/tracing/types.py +8 -8
  50. deepeval/tracing/utils.py +9 -0
  51. deepeval/utils.py +44 -2
  52. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/METADATA +2 -2
  53. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/RECORD +57 -47
  54. /deepeval/{openai → model_integrations}/types.py +0 -0
  55. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/LICENSE.md +0 -0
  56. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/WHEEL +0 -0
  57. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/entry_points.txt +0 -0
deepeval/models/llms/amazon_bedrock_model.py CHANGED
@@ -76,23 +76,26 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
     ) -> Tuple[Union[str, Dict], float]:
-        payload = self.get_converse_request_body(prompt)
-        client = await self._ensure_client()
-        response = await client.converse(
-            modelId=self.model_id,
-            messages=payload["messages"],
-            inferenceConfig=payload["inferenceConfig"],
-        )
-        message = response["output"]["message"]["content"][0]["text"]
-        cost = self.calculate_cost(
-            response["usage"]["inputTokens"],
-            response["usage"]["outputTokens"],
-        )
-        if schema is None:
-            return message, cost
-        else:
-            json_output = trim_and_load_json(message)
-            return schema.model_validate(json_output), cost
+        try:
+            payload = self.get_converse_request_body(prompt)
+            client = await self._ensure_client()
+            response = await client.converse(
+                modelId=self.model_id,
+                messages=payload["messages"],
+                inferenceConfig=payload["inferenceConfig"],
+            )
+            message = response["output"]["message"]["content"][0]["text"]
+            cost = self.calculate_cost(
+                response["usage"]["inputTokens"],
+                response["usage"]["outputTokens"],
+            )
+            if schema is None:
+                return message, cost
+            else:
+                json_output = trim_and_load_json(message)
+                return schema.model_validate(json_output), cost
+        finally:
+            await self.close()
 
     ###############################################
     # Client management
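The notable change is the `try`/`finally`: the underlying async client is now closed after every call, whether or not `converse` raises. A hedged usage sketch of the signature shown above (the constructor argument and model id are illustrative, not prescribed values):

```python
import asyncio

from pydantic import BaseModel

from deepeval.models.llms.amazon_bedrock_model import AmazonBedrockModel


class Verdict(BaseModel):
    verdict: str


async def main():
    # model_id is a placeholder; a_generate returns a (result, cost) tuple
    model = AmazonBedrockModel(model_id="anthropic.claude-3-haiku-20240307-v1:0")
    text, cost = await model.a_generate('Reply with JSON: {"verdict": "yes"}')
    parsed, cost = await model.a_generate("Same prompt, validated.", schema=Verdict)


asyncio.run(main())
```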
deepeval/models/llms/openai_model.py CHANGED
@@ -8,6 +8,7 @@ from openai import (
     AsyncOpenAI,
 )
 
+from deepeval.config.settings import get_settings
 from deepeval.constants import ProviderSlug as PS
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.models.llms.utils import trim_and_load_json
@@ -209,6 +210,11 @@ models_requiring_temperature_1 = [
 ]
 
 
+def _request_timeout_seconds() -> float:
+    timeout = float(get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
+    return timeout if timeout > 0 else 30.0
+
+
 class GPTModel(DeepEvalBaseLLM):
     def __init__(
         self,
@@ -387,7 +393,6 @@ class GPTModel(DeepEvalBaseLLM):
         )
         return schema.model_validate(json_output), cost
 
-        client: AsyncOpenAI
         completion = await client.chat.completions.create(
             model=self.model_name,
             messages=[{"role": "user", "content": prompt}],
@@ -501,9 +506,13 @@ class GPTModel(DeepEvalBaseLLM):
         kwargs = dict(self.kwargs or {})
         if not sdk_retries_for(PS.OPENAI):
             kwargs["max_retries"] = 0
+
+        if not kwargs.get("timeout"):
+            kwargs["timeout"] = _request_timeout_seconds()
         return kwargs
 
     def _build_client(self, cls):
+
         kw = dict(
             api_key=self._openai_api_key,
             base_url=self.base_url,
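Net effect: the OpenAI client always gets an explicit request timeout unless the caller supplies one, falling back to 30 seconds when the per-attempt setting is unset or zero. The resolution logic restated standalone, with the settings lookup replaced by a plain parameter so it runs in isolation:

```python
def request_timeout_seconds(configured: float | None) -> float:
    # Mirrors _request_timeout_seconds above: 0/None fall back to 30s.
    timeout = float(configured or 0)
    return timeout if timeout > 0 else 30.0

assert request_timeout_seconds(None) == 30.0
assert request_timeout_seconds(0) == 30.0
assert request_timeout_seconds(12.5) == 12.5
```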
deepeval/models/retry_policy.py CHANGED
@@ -39,6 +39,7 @@ import itertools
 import functools
 import threading
 import logging
+import time
 
 from dataclasses import dataclass, field
 from typing import Callable, Iterable, Mapping, Optional, Sequence, Tuple, Union
@@ -52,6 +53,7 @@ from tenacity import (
 )
 from tenacity.stop import stop_base
 from tenacity.wait import wait_base
+from contextvars import ContextVar, copy_context
 
 from deepeval.constants import (
     ProviderSlug as PS,
@@ -65,6 +67,81 @@ Provider = Union[str, PS]
 _MAX_TIMEOUT_THREADS = get_settings().DEEPEVAL_TIMEOUT_THREAD_LIMIT
 _TIMEOUT_SEMA = threading.BoundedSemaphore(_MAX_TIMEOUT_THREADS)
 _WORKER_ID = itertools.count(1)
+_OUTER_DEADLINE = ContextVar("deepeval_outer_deadline", default=None)
+
+
+def set_outer_deadline(seconds: float | None):
+    """Set (or clear) the outer task time budget.
+
+    Stores a deadline in a local context variable so nested code
+    can cooperatively respect a shared budget. Always pair this with
+    `reset_outer_deadline(token)` in a `finally` block.
+
+    Args:
+        seconds: Number of seconds from now to set as the deadline. If `None`,
+            `0`, or a non-positive value is provided, the deadline is cleared.
+
+    Returns:
+        contextvars.Token: The token returned by the underlying ContextVar `.set()`
+            call, which must be passed to `reset_outer_deadline` to restore the
+            previous value.
+    """
+    if seconds and seconds > 0:
+        return _OUTER_DEADLINE.set(time.monotonic() + seconds)
+    return _OUTER_DEADLINE.set(None)
+
+
+def reset_outer_deadline(token):
+    """Restore the previous outer deadline set by `set_outer_deadline`.
+
+    This should be called in a `finally` block to ensure the deadline
+    is restored even if an exception occurs.
+
+    Args:
+        token: The `contextvars.Token` returned by `set_outer_deadline`.
+    """
+    if token is not None:
+        _OUTER_DEADLINE.reset(token)
+
+
+def _remaining_budget() -> float | None:
+    dl = _OUTER_DEADLINE.get()
+    if dl is None:
+        return None
+    return max(0.0, dl - time.monotonic())
+
+
+def _is_budget_spent() -> bool:
+    rem = _remaining_budget()
+    return rem is not None and rem <= 0.0
+
+
+def resolve_effective_attempt_timeout():
+    """Resolve the timeout to use for a single provider attempt.
+
+    Combines the configured per-attempt timeout with any remaining outer budget:
+    - If `DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS` is `0` or `None`, returns `0`;
+      callers should skip `asyncio.wait_for` in this case and rely on the outer cap.
+    - If positive and an outer deadline is present, returns
+      `min(per_attempt, remaining_budget)`.
+    - If positive and no outer deadline is present, returns `per_attempt`.
+
+    Returns:
+        float: Seconds to use for the inner per-attempt timeout. `0` means
+            disable the inner timeout and rely on the outer budget instead.
+    """
+    per_attempt = float(
+        get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0
+    )
+    # 0 or None disables the inner wait_for; rely on the outer task cap for timeouts instead.
+    if per_attempt <= 0:
+        return 0
+    # With a positive per-attempt timeout, use up to the remaining outer budget.
+    rem = _remaining_budget()
+    if rem is not None:
+        return max(0.0, min(per_attempt, rem))
+    return per_attempt
+
 
 # --------------------------
 # Policy description
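The docstrings above prescribe the call pattern; a minimal sketch of how a caller would hold an entire task to one budget (the 120-second figure is arbitrary):

```python
from deepeval.models.retry_policy import (
    set_outer_deadline,
    reset_outer_deadline,
    resolve_effective_attempt_timeout,
)

token = set_outer_deadline(120.0)  # shared budget for the whole task
try:
    # nested retry code clamps each attempt to whatever budget is left
    attempt_timeout = resolve_effective_attempt_timeout()
finally:
    reset_outer_deadline(token)  # always restore, per the docstring
```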
@@ -399,9 +476,10 @@ def make_after_log(slug: str):
         if not _logger.isEnabledFor(after_level):
             return
 
+        show_trace = bool(get_settings().DEEPEVAL_LOG_STACK_TRACES)
         exc_info = (
             (type(exc), exc, getattr(exc, "__traceback__", None))
-            if after_level >= logging.ERROR
+            if show_trace
             else None
         )
 
@@ -416,7 +494,7 @@ def _make_timeout_error(timeout_seconds: float) -> TimeoutError:
     return _after
 
 
-def _make_timeout_error(timeout_seconds: float) -> TimeoutError:
+def _make_timeout_error(timeout_seconds: float) -> asyncio.TimeoutError:
     settings = get_settings()
     if logger.isEnabledFor(logging.DEBUG):
         logger.debug(
@@ -427,12 +505,12 @@ def _make_timeout_error(timeout_seconds: float) -> TimeoutError:
     )
     msg = (
         f"call timed out after {timeout_seconds:g}s (per attempt). "
-        "Increase DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS (0 disables) or reduce work per attempt."
+        "Increase DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE (None disables) or reduce work per attempt."
     )
-    return TimeoutError(msg)
+    return asyncio.TimeoutError(msg)
 
 
-def _run_sync_with_timeout(func, timeout_seconds, *args, **kwargs):
+def run_sync_with_timeout(func, timeout_seconds, *args, **kwargs):
     """
     Run a synchronous callable with a soft timeout enforced by a helper thread,
     with a global cap on concurrent timeout-workers.
@@ -499,9 +577,11 @@ def _run_sync_with_timeout(func, timeout_seconds, *args, **kwargs):
     done = threading.Event()
     result = {"value": None, "exc": None}
 
+    context = copy_context()
+
    def target():
         try:
-            result["value"] = func(*args, **kwargs)
+            result["value"] = context.run(func, *args, **kwargs)
         except BaseException as e:
             result["exc"] = e
         finally:
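`copy_context()` is what lets the helper thread see the caller's ContextVars, including the `_OUTER_DEADLINE` set above; a bare `threading.Thread` starts with an empty context. A self-contained demonstration of the difference:

```python
import threading
from contextvars import ContextVar, copy_context

var = ContextVar("var", default="unset")
var.set("outer value")

seen = {}
ctx = copy_context()

def target():
    seen["plain"] = var.get()          # fresh thread context: falls back to default
    seen["copied"] = ctx.run(var.get)  # copied context: sees "outer value"

t = threading.Thread(target=target)
t.start()
t.join()
print(seen)  # {'plain': 'unset', 'copied': 'outer value'}
```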
@@ -562,37 +642,40 @@ def create_retry_decorator(provider: Provider):
 
     @functools.wraps(func)
     async def attempt(*args, **kwargs):
-        timeout_seconds = (
-            get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0
-        )
+        if _is_budget_spent():
+            raise _make_timeout_error(0)
+
+        per_attempt_timeout = resolve_effective_attempt_timeout()
+
         coro = func(*args, **kwargs)
-        if timeout_seconds > 0:
+        if per_attempt_timeout > 0:
             try:
-                return await asyncio.wait_for(coro, timeout_seconds)
-            except asyncio.TimeoutError as e:
+                return await asyncio.wait_for(coro, per_attempt_timeout)
+            except (asyncio.TimeoutError, TimeoutError) as e:
                 if (
                     logger.isEnabledFor(logging.DEBUG)
                     and get_settings().DEEPEVAL_VERBOSE_MODE is True
                 ):
                     logger.debug(
                         "async timeout after %.3fs (active_threads=%d, tasks=%d)",
-                        timeout_seconds,
+                        per_attempt_timeout,
                         threading.active_count(),
                         len(asyncio.all_tasks()),
                     )
-                raise _make_timeout_error(timeout_seconds) from e
+                raise _make_timeout_error(per_attempt_timeout) from e
         return await coro
 
     return base_retry(attempt)
 
     @functools.wraps(func)
     def attempt(*args, **kwargs):
-        timeout_seconds = (
-            get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0
-        )
-        if timeout_seconds > 0:
-            return _run_sync_with_timeout(
-                func, timeout_seconds, *args, **kwargs
+        if _is_budget_spent():
+            raise _make_timeout_error(0)
+
+        per_attempt_timeout = resolve_effective_attempt_timeout()
+        if per_attempt_timeout > 0:
+            return run_sync_with_timeout(
+                func, per_attempt_timeout, *args, **kwargs
             )
         return func(*args, **kwargs)
 
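Both wrappers now short-circuit once the shared budget is spent, raising the timeout error instead of starting another provider attempt. A sketch of the interplay, assuming a positive `DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS` (timings are illustrative):

```python
import time

from deepeval.models.retry_policy import (
    set_outer_deadline,
    reset_outer_deadline,
    resolve_effective_attempt_timeout,
)

token = set_outer_deadline(0.01)   # tiny outer budget
try:
    time.sleep(0.02)               # budget elapses before the next attempt
    # remaining budget is now 0.0, so the effective per-attempt timeout
    # clamps to 0.0 and a decorated attempt raises immediately
    print(resolve_effective_attempt_timeout())  # -> 0.0
finally:
    reset_outer_deadline(token)
```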
deepeval/openai/__init__.py CHANGED
@@ -15,5 +15,7 @@ except ImportError:
 
 if OpenAI or AsyncOpenAI:
     from deepeval.openai.patch import patch_openai_classes
+    from deepeval.telemetry import capture_tracing_integration
 
-    patch_openai_classes()
+    with capture_tracing_integration("openai"):
+        patch_openai_classes()
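The behavioral change is only in telemetry: importing the module still patches the OpenAI client classes, but the patch is now recorded as a tracing integration:

```python
import deepeval.openai  # patches OpenAI/AsyncOpenAI and records the "openai" integration
```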
deepeval/openai/extractors.py CHANGED
@@ -4,13 +4,13 @@ from typing import Any, Union, Dict
 from openai.types.responses import Response
 
 from deepeval.test_case.llm_test_case import ToolCall
+
+from deepeval.model_integrations.types import InputParameters, OutputParameters
 from deepeval.openai.utils import (
     render_response_input,
     stringify_multimodal_content,
     render_messages,
 )
-from deepeval.openai.types import InputParameters, OutputParameters
-from deepeval.tracing.types import Message
 
 
 # guarding against errors to be compatible with legacy APIs
deepeval/openai/utils.py CHANGED
@@ -1,6 +1,6 @@
 import json
 import uuid
-from typing import Any, Dict, List, Optional, Iterable
+from typing import Any, Dict, List, Iterable
 
 from openai.types.chat.chat_completion_message_param import (
     ChatCompletionMessageParam,
@@ -8,32 +8,8 @@ from openai.types.chat.chat_completion_message_param import (
 
 from deepeval.tracing.types import ToolSpan, TraceSpanStatus
 from deepeval.tracing.context import current_span_context
-from deepeval.utils import shorten, len_long
-from deepeval.openai.types import OutputParameters
-
-
-_URL_MAX = 200
-_JSON_MAX = max(
-    len_long(), 400
-)  # <- make this bigger by increasing DEEPEVAL_MAXLEN_LONG above 400
-
-
-def _compact_dump(value: Any) -> str:
-    try:
-        dumped = json.dumps(
-            value, ensure_ascii=False, default=str, separators=(",", ":")
-        )
-    except Exception:
-        dumped = repr(value)
-    return shorten(dumped, max_len=_JSON_MAX)
-
-
-def _fmt_url(url: Optional[str]) -> str:
-    if not url:
-        return ""
-    if url.startswith("data:"):
-        return "[data-uri]"
-    return shorten(url, max_len=_URL_MAX)
+from deepeval.model_integrations.types import OutputParameters
+from deepeval.model_integrations.utils import compact_dump, fmt_url
 
 
 def create_child_tool_spans(output_parameters: OutputParameters):
@@ -111,7 +87,7 @@ def stringify_multimodal_content(content: Any) -> str:
             url = image_url
         else:
             url = (image_url or {}).get("url") or content.get("url")
-        return f"[image:{_fmt_url(url)}]"
+        return f"[image:{fmt_url(url)}]"
 
     # Responses API variants
     if t == "input_text":
@@ -122,14 +98,14 @@
             url = image_url
         else:
             url = (image_url or {}).get("url") or content.get("url")
-        return f"[image:{_fmt_url(url)}]"
+        return f"[image:{fmt_url(url)}]"
 
     # readability for other input_* types we don't currently handle
     if t and t.startswith("input_"):
         return f"[{t}]"
 
     # unknown dicts and types returned as shortened JSON
-    return _compact_dump(content)
+    return compact_dump(content)
 
 
 def render_messages(
@@ -228,7 +204,7 @@ def _render_content(content: Dict[str, Any], indent: int = 0) -> str:
         lines.append(f"{prefix}{key}:")
         lines.append(_render_content(value, indent + 1))
     elif isinstance(value, list):
-        lines.append(f"{prefix}{key}: {_compact_dump(value)}")
+        lines.append(f"{prefix}{key}: {compact_dump(value)}")
     else:
         lines.append(f"{prefix}{key}: {value}")
 
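The helpers were not deleted, they moved: the bodies removed above now live in the new shared `deepeval.model_integrations.utils` module (+116 lines in this release) as `compact_dump` and `fmt_url`. Their behavior, per the removed definitions and assuming the implementations carried over unchanged:

```python
from deepeval.model_integrations.utils import compact_dump, fmt_url

fmt_url("")                                   # -> "" for falsy URLs
fmt_url("data:image/png;base64,AAAA")         # -> "[data-uri]"
fmt_url("https://example.com/" + "x" * 500)   # -> shortened (200-char cap)
compact_dump({"a": [1, 2]})                   # -> '{"a":[1,2]}' (compact, shortened JSON)
```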
deepeval/prompt/api.py CHANGED
@@ -1,8 +1,10 @@
-from pydantic import BaseModel, Field, AliasChoices
+from pydantic import BaseModel, Field, AliasChoices, ConfigDict
 from enum import Enum
 from typing import List, Optional
 from pydantic import TypeAdapter
 
+from deepeval.utils import make_model_config
+
 ###################################
 # Model Settings
 ###################################
@@ -92,6 +94,8 @@ class SchemaDataType(Enum):
 
 
 class OutputSchemaField(BaseModel):
+    model_config = make_model_config(use_enum_values=True)
+
     id: str
     type: SchemaDataType
     name: str
@@ -102,9 +106,6 @@ class OutputSchemaField(BaseModel):
         validation_alias=AliasChoices("parent_id", "parentId"),
     )
 
-    class Config:
-        use_enum_values = True
-
 
 class OutputSchema(BaseModel):
     fields: Optional[List[OutputSchemaField]] = None
@@ -187,6 +188,10 @@ class PromptHttpResponse(BaseModel):
 
 
 class PromptPushRequest(BaseModel):
+    model_config = make_model_config(use_enum_values=True)
+
+    model_config = ConfigDict(use_enum_values=True)
+
     alias: str
     text: Optional[str] = None
     messages: Optional[List[PromptMessage]] = None
@@ -203,11 +208,10 @@ class PromptPushRequest(BaseModel):
         default=None, serialization_alias="outputType"
     )
 
-    class Config:
-        use_enum_values = True
-
 
 class PromptUpdateRequest(BaseModel):
+    model_config = make_model_config(use_enum_values=True)
+
     text: Optional[str] = None
     messages: Optional[List[PromptMessage]] = None
     interpolation_type: PromptInterpolationType = Field(
@@ -223,9 +227,6 @@ class PromptUpdateRequest(BaseModel):
         default=None, serialization_alias="outputType"
     )
 
-    class Config:
-        use_enum_values = True
-
 
 class PromptApi(BaseModel):
     id: str
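This file (and prompt.py below) trades the deprecated nested `class Config` for the Pydantic v2 `model_config` assignment, built through a new `make_model_config` helper in `deepeval/utils.py`. A plausible minimal form of that helper, assuming it simply wraps `ConfigDict` (the shipped implementation may also smooth over Pydantic version differences):

```python
from pydantic import ConfigDict

def make_model_config(**kwargs) -> ConfigDict:
    # hypothetical sketch, not the shipped implementation:
    # centralizes model_config creation in one place
    return ConfigDict(**kwargs)
```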
deepeval/prompt/prompt.py CHANGED
@@ -5,11 +5,13 @@ from rich.console import Console
 import time
 import json
 import os
-from pydantic import BaseModel, ValidationError
+from pydantic import BaseModel, ValidationError, ConfigDict
 import asyncio
 import portalocker
 import threading
 
+from deepeval.utils import make_model_config
+
 from deepeval.prompt.api import (
     PromptHttpResponse,
     PromptMessage,
@@ -77,6 +79,8 @@ class CustomEncoder(json.JSONEncoder):
 
 
 class CachedPrompt(BaseModel):
+    model_config = make_model_config(use_enum_values=True)
+
     alias: str
     version: str
     label: Optional[str] = None
@@ -89,9 +93,6 @@ class CachedPrompt(BaseModel):
     output_type: Optional[OutputType]
     output_schema: Optional[OutputSchema]
 
-    class Config:
-        use_enum_values = True
-
 
 class Prompt:
 
deepeval/simulator/conversation_simulator.py CHANGED
@@ -35,7 +35,6 @@ class ConversationSimulator:
         self,
         model_callback: Callable[[str], str],
         simulator_model: Optional[Union[str, DeepEvalBaseLLM]] = None,
-        opening_message: Optional[str] = None,
         max_concurrent: int = 5,
         async_mode: bool = True,
         language: str = "English",
@@ -45,7 +44,6 @@ class ConversationSimulator:
         self.is_callback_async = inspect.iscoroutinefunction(
             self.model_callback
         )
-        self.opening_message = opening_message
         self.semaphore = asyncio.Semaphore(max_concurrent)
         self.async_mode = async_mode
         self.language = language
@@ -68,6 +66,9 @@
         self,
         conversational_goldens: List[ConversationalGolden],
         max_user_simulations: int = 10,
+        on_simulation_complete: Optional[
+            Callable[[ConversationalTestCase, int], None]
+        ] = None,
     ) -> List[ConversationalTestCase]:
         self.simulation_cost = 0 if self.using_native_model else None
 
@@ -87,6 +88,7 @@
                 self._a_simulate(
                     conversational_goldens=conversational_goldens,
                     max_user_simulations=max_user_simulations,
+                    on_simulation_complete=on_simulation_complete,
                     progress=progress,
                     pbar_id=pbar_id,
                 )
@@ -103,6 +105,7 @@
                         index=conversation_index,
                         progress=progress,
                         pbar_id=pbar_id,
+                        on_simulation_complete=on_simulation_complete,
                     )
                 )
                 conversational_test_cases.append(conversational_test_case)
@@ -115,6 +118,9 @@
         self,
         conversational_goldens: List[ConversationalGolden],
         max_user_simulations: int,
+        on_simulation_complete: Optional[
+            Callable[[ConversationalTestCase, int], None]
+        ] = None,
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
     ) -> List[ConversationalTestCase]:
@@ -131,6 +137,7 @@
                 index=conversation_index,
                 progress=progress,
                 pbar_id=pbar_id,
+                on_simulation_complete=on_simulation_complete,
             )
 
         tasks = [
@@ -150,6 +157,9 @@
         index: int,
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
+        on_simulation_complete: Optional[
+            Callable[[ConversationalTestCase, int], None]
+        ] = None,
     ) -> ConversationalTestCase:
         simulation_counter = 0
         if max_user_simulations <= 0:
@@ -166,8 +176,6 @@
         user_input = None
         thread_id = str(uuid.uuid4())
         turns: List[Turn] = []
-        if self.opening_message and golden.turns is None:
-            turns.append(Turn(role="assistant", content=self.opening_message))
 
         if golden.turns is not None:
             turns.extend(golden.turns)
@@ -187,11 +195,7 @@
             if simulation_counter >= max_user_simulations:
                 update_pbar(progress, pbar_max_user_simluations_id)
                 break
-            if len(turns) == 0 or (
-                len(turns) == 1
-                and self.opening_message
-                and golden.turns is None
-            ):
+            if len(turns) == 0:
                 # Generate first user input
                 user_input = self.generate_first_user_input(golden)
                 turns.append(Turn(role="user", content=user_input))
@@ -225,7 +229,7 @@
             turns.append(turn)
 
         update_pbar(progress, pbar_id)
-        return ConversationalTestCase(
+        conversational_test_case = ConversationalTestCase(
             turns=turns,
             scenario=golden.scenario,
             expected_outcome=golden.expected_outcome,
@@ -241,6 +245,9 @@
             _dataset_alias=golden._dataset_alias,
             _dataset_id=golden._dataset_id,
         )
+        if on_simulation_complete:
+            on_simulation_complete(conversational_test_case, index)
+        return conversational_test_case
 
     async def _a_simulate_single_conversation(
         self,
@@ -249,6 +256,9 @@
         index: Optional[int] = None,
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
+        on_simulation_complete: Optional[
+            Callable[[ConversationalTestCase, int], None]
+        ] = None,
     ) -> ConversationalTestCase:
         simulation_counter = 0
         if max_user_simulations <= 0:
@@ -265,8 +275,6 @@
         user_input = None
         thread_id = str(uuid.uuid4())
         turns: List[Turn] = []
-        if self.opening_message and golden.turns is None:
-            turns.append(Turn(role="assistant", content=self.opening_message))
 
         if golden.turns is not None:
             turns.extend(golden.turns)
@@ -286,11 +294,7 @@
             if simulation_counter >= max_user_simulations:
                 update_pbar(progress, pbar_max_user_simluations_id)
                 break
-            if len(turns) == 0 or (
-                len(turns) == 1
-                and self.opening_message
-                and golden.turns is None
-            ):
+            if len(turns) == 0:
                 # Generate first user input
                 user_input = await self.a_generate_first_user_input(golden)
                 turns.append(Turn(role="user", content=user_input))
@@ -324,7 +328,7 @@
             turns.append(turn)
 
         update_pbar(progress, pbar_id)
-        return ConversationalTestCase(
+        conversational_test_case = ConversationalTestCase(
             turns=turns,
             scenario=golden.scenario,
             expected_outcome=golden.expected_outcome,
@@ -340,6 +344,9 @@
             _dataset_alias=golden._dataset_alias,
             _dataset_id=golden._dataset_id,
         )
+        if on_simulation_complete:
+            on_simulation_complete(conversational_test_case, index)
+        return conversational_test_case
 
     ############################################
     ### Generate User Inputs ###################
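The `opening_message` option is removed outright (a golden's own `turns` now seed the conversation), and the new `on_simulation_complete` hook fires once per finished conversation in both the sync and async paths. A hedged usage sketch; `simulate` as the entry-point name, plus `my_model_callback` and `goldens`, are assumptions here:

```python
from deepeval.simulator.conversation_simulator import ConversationSimulator

def on_done(test_case, index):
    # receives each ConversationalTestCase plus its golden's position
    print(f"conversation {index}: {len(test_case.turns)} turns")

simulator = ConversationSimulator(model_callback=my_model_callback)
test_cases = simulator.simulate(
    conversational_goldens=goldens,
    max_user_simulations=10,
    on_simulation_complete=on_done,
)
```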
deepeval/synthesizer/chunking/context_generator.py CHANGED
@@ -249,8 +249,16 @@ class ContextGenerator:
 
         except Exception as exc:
             # record and continue with other docs
+            show_trace = bool(get_settings().DEEPEVAL_LOG_STACK_TRACES)
+            exc_info = (
+                (type(exc), exc, getattr(exc, "__traceback__", None))
+                if show_trace
+                else None
+            )
             logger.exception(
-                "Document pipeline failed for %s", path, exc_info=exc
+                "Document pipeline failed for %s",
+                path,
+                exc_info=exc_info,
             )
         finally:
             # drop the collection asap to avoid too many open collections
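Same pattern as the `make_after_log` change in retry_policy.py: full tracebacks are attached only when `DEEPEVAL_LOG_STACK_TRACES` is enabled. The gating idiom in isolation:

```python
import logging

logger = logging.getLogger("demo")

def log_failure(exc: Exception, show_trace: bool) -> None:
    # pass a (type, value, traceback) tuple only when traces are wanted;
    # exc_info=None logs the message without a stack trace
    exc_info = (type(exc), exc, exc.__traceback__) if show_trace else None
    logger.error("pipeline failed", exc_info=exc_info)
```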