deepeval 3.6.8__py3-none-any.whl → 3.6.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +104 -36
- deepeval/config/utils.py +5 -0
- deepeval/dataset/dataset.py +162 -30
- deepeval/dataset/utils.py +41 -13
- deepeval/evaluate/execute.py +1099 -633
- deepeval/metrics/indicator.py +21 -1
- deepeval/models/llms/amazon_bedrock_model.py +20 -17
- deepeval/models/llms/openai_model.py +10 -1
- deepeval/models/retry_policy.py +103 -20
- deepeval/simulator/conversation_simulator.py +25 -18
- deepeval/synthesizer/chunking/context_generator.py +9 -1
- {deepeval-3.6.8.dist-info → deepeval-3.6.9.dist-info}/METADATA +1 -1
- {deepeval-3.6.8.dist-info → deepeval-3.6.9.dist-info}/RECORD +17 -17
- {deepeval-3.6.8.dist-info → deepeval-3.6.9.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.8.dist-info → deepeval-3.6.9.dist-info}/WHEEL +0 -0
- {deepeval-3.6.8.dist-info → deepeval-3.6.9.dist-info}/entry_points.txt +0 -0
deepeval/metrics/indicator.py
CHANGED
@@ -18,6 +18,10 @@ from deepeval.test_run.cache import CachedTestCase, Cache
 from deepeval.telemetry import capture_metric_type
 from deepeval.utils import update_pbar
 
+import logging
+
+logger = logging.getLogger(__name__)
+
 
 def format_metric_description(
     metric: Union[BaseMetric, BaseConversationalMetric, BaseArenaMetric],
@@ -43,7 +47,7 @@ def metric_progress_indicator(
     _show_indicator: bool = True,
     _in_component: bool = False,
 ):
-    captured_async_mode = False if async_mode
+    captured_async_mode = False if async_mode is None else async_mode
     with capture_metric_type(
         metric.__name__,
         async_mode=captured_async_mode,
@@ -250,6 +254,21 @@ async def safe_a_measure(
             _log_metric_to_confident=False,
         )
         update_pbar(progress, pbar_eval_id)
+
+    except asyncio.CancelledError:
+        logger.info("caught asyncio.CancelledError")
+
+        # treat cancellation as a timeout so we still emit a MetricData
+        metric.error = (
+            "Timed out/cancelled while evaluating metric. "
+            "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+            "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+        )
+        metric.success = False
+
+        if not ignore_errors:
+            raise
+
     except MissingTestCaseParamsError as e:
         if skip_on_missing_params:
             metric.skipped = True
@@ -277,5 +296,6 @@ async def safe_a_measure(
         if ignore_errors:
             metric.error = str(e)
             metric.success = False  # Assuming you want to set success to False
+            logger.info("a metric was marked as errored")
         else:
             raise
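The new `asyncio.CancelledError` branch above means a metric whose task is cancelled (for example by an outer timeout) still comes back with `error` set and `success = False` instead of disappearing from the results. A minimal, self-contained sketch of that pattern, using a hypothetical `metric` object rather than deepeval's real classes:

    import asyncio
    from types import SimpleNamespace

    async def slow_measure(metric):
        try:
            await asyncio.sleep(10)        # stands in for the real metric work
            metric.success = True
        except asyncio.CancelledError:
            # mirror the hunk above: record the failure, then re-raise
            metric.error = "Timed out/cancelled while evaluating metric."
            metric.success = False
            raise

    async def main():
        metric = SimpleNamespace(error=None, success=None)
        task = asyncio.create_task(slow_measure(metric))
        try:
            await asyncio.wait_for(task, timeout=0.1)   # forces a cancellation
        except asyncio.TimeoutError:
            pass
        print(metric.error, metric.success)             # error message, False

    asyncio.run(main())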
deepeval/models/llms/amazon_bedrock_model.py
CHANGED
@@ -76,23 +76,26 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
     ) -> Tuple[Union[str, Dict], float]:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            payload = self.get_converse_request_body(prompt)
+            client = await self._ensure_client()
+            response = await client.converse(
+                modelId=self.model_id,
+                messages=payload["messages"],
+                inferenceConfig=payload["inferenceConfig"],
+            )
+            message = response["output"]["message"]["content"][0]["text"]
+            cost = self.calculate_cost(
+                response["usage"]["inputTokens"],
+                response["usage"]["outputTokens"],
+            )
+            if schema is None:
+                return message, cost
+            else:
+                json_output = trim_and_load_json(message)
+                return schema.model_validate(json_output), cost
+        finally:
+            await self.close()
 
     ###############################################
     # Client management
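The rewritten `a_generate` above wraps the whole Bedrock `converse` round-trip in `try`/`finally` so the async client is closed even when the call raises or returns early from inside the `try`. A small self-contained sketch of that shape (the `FakeClient` and `fetch` names are illustrative, not deepeval's API):

    import asyncio

    class FakeClient:
        async def converse(self, prompt: str) -> str:
            await asyncio.sleep(0)          # pretend network call
            return prompt.upper()

        async def close(self) -> None:
            print("client closed")          # always runs, success or failure

    async def fetch(prompt: str) -> str:
        client = FakeClient()
        try:
            response = await client.converse(prompt)
            return response                 # returning inside try still triggers finally
        finally:
            await client.close()

    print(asyncio.run(fetch("hello")))      # -> client closed / HELLO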
deepeval/models/llms/openai_model.py
CHANGED
@@ -8,6 +8,7 @@ from openai import (
     AsyncOpenAI,
 )
 
+from deepeval.config.settings import get_settings
 from deepeval.constants import ProviderSlug as PS
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.models.llms.utils import trim_and_load_json
@@ -209,6 +210,11 @@ models_requiring_temperature_1 = [
 ]
 
 
+def _request_timeout_seconds() -> float:
+    timeout = float(get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
+    return timeout if timeout > 0 else 30.0
+
+
 class GPTModel(DeepEvalBaseLLM):
     def __init__(
         self,
@@ -387,7 +393,6 @@ class GPTModel(DeepEvalBaseLLM):
             )
             return schema.model_validate(json_output), cost
 
-        client: AsyncOpenAI
         completion = await client.chat.completions.create(
             model=self.model_name,
             messages=[{"role": "user", "content": prompt}],
@@ -501,9 +506,13 @@ class GPTModel(DeepEvalBaseLLM):
         kwargs = dict(self.kwargs or {})
         if not sdk_retries_for(PS.OPENAI):
             kwargs["max_retries"] = 0
+
+        if not kwargs.get("timeout"):
+            kwargs["timeout"] = _request_timeout_seconds()
         return kwargs
 
     def _build_client(self, cls):
+
         kw = dict(
             api_key=self._openai_api_key,
             base_url=self.base_url,
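The two small hunks above give the OpenAI client a default per-request timeout: a positive `DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS` wins, otherwise 30 seconds is used, and an explicit `timeout` already present in the model's kwargs is never overridden. A standalone sketch of that resolution logic (a plain function, not the module's private helpers):

    from typing import Optional

    def resolve_timeout(per_attempt_setting: Optional[float], user_kwargs: dict) -> dict:
        kwargs = dict(user_kwargs)
        if not kwargs.get("timeout"):
            # same fallback as the hunk: positive setting wins, else 30s
            configured = float(per_attempt_setting or 0)
            kwargs["timeout"] = configured if configured > 0 else 30.0
        return kwargs

    assert resolve_timeout(None, {})["timeout"] == 30.0
    assert resolve_timeout(12.5, {})["timeout"] == 12.5
    assert resolve_timeout(12.5, {"timeout": 5})["timeout"] == 5   # user value kept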
deepeval/models/retry_policy.py
CHANGED
@@ -39,6 +39,7 @@ import itertools
 import functools
 import threading
 import logging
+import time
 
 from dataclasses import dataclass, field
 from typing import Callable, Iterable, Mapping, Optional, Sequence, Tuple, Union
@@ -52,6 +53,7 @@ from tenacity import (
 )
 from tenacity.stop import stop_base
 from tenacity.wait import wait_base
+from contextvars import ContextVar, copy_context
 
 from deepeval.constants import (
     ProviderSlug as PS,
@@ -65,6 +67,81 @@ Provider = Union[str, PS]
 _MAX_TIMEOUT_THREADS = get_settings().DEEPEVAL_TIMEOUT_THREAD_LIMIT
 _TIMEOUT_SEMA = threading.BoundedSemaphore(_MAX_TIMEOUT_THREADS)
 _WORKER_ID = itertools.count(1)
+_OUTER_DEADLINE = ContextVar("deepeval_outer_deadline", default=None)
+
+
+def set_outer_deadline(seconds: float | None):
+    """Set (or clear) the outer task time budget.
+
+    Stores a deadline in a local context variable so nested code
+    can cooperatively respect a shared budget. Always pair this with
+    `reset_outer_deadline(token)` in a `finally` block.
+
+    Args:
+        seconds: Number of seconds from now to set as the deadline. If `None`,
+            `0`, or a non-positive value is provided, the deadline is cleared.
+
+    Returns:
+        contextvars.Token: The token returned by the underlying ContextVar `.set()`
+            call, which must be passed to `reset_outer_deadline` to restore the
+            previous value.
+    """
+    if seconds and seconds > 0:
+        return _OUTER_DEADLINE.set(time.monotonic() + seconds)
+    return _OUTER_DEADLINE.set(None)
+
+
+def reset_outer_deadline(token):
+    """Restore the previous outer deadline set by `set_outer_deadline`.
+
+    This should be called in a `finally` block to ensure the deadline
+    is restored even if an exception occurs.
+
+    Args:
+        token: The `contextvars.Token` returned by `set_outer_deadline`.
+    """
+    if token is not None:
+        _OUTER_DEADLINE.reset(token)
+
+
+def _remaining_budget() -> float | None:
+    dl = _OUTER_DEADLINE.get()
+    if dl is None:
+        return None
+    return max(0.0, dl - time.monotonic())
+
+
+def _is_budget_spent() -> bool:
+    rem = _remaining_budget()
+    return rem is not None and rem <= 0.0
+
+
+def resolve_effective_attempt_timeout():
+    """Resolve the timeout to use for a single provider attempt.
+
+    Combines the configured per-attempt timeout with any remaining outer budget:
+    - If `DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS` is `0` or `None`, returns `0`;
+      callers should skip `asyncio.wait_for` in this case and rely on the outer cap.
+    - If positive and an outer deadline is present, returns
+      `min(per_attempt, remaining_budget)`.
+    - If positive and no outer deadline is present, returns `per_attempt`.
+
+    Returns:
+        float: Seconds to use for the inner per-attempt timeout. `0` means
+            disable inner timeout and rely on the outer budget instead.
+    """
+    per_attempt = float(
+        get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0
+    )
+    # 0 or None disable inner wait_for. That means rely on outer task cap for timeouts instead.
+    if per_attempt <= 0:
+        return 0
+    # If we do have a positive per-attempt, use up to remaining outer budget.
+    rem = _remaining_budget()
+    if rem is not None:
+        return max(0.0, min(per_attempt, rem))
+    return per_attempt
+
 
 # --------------------------
 # Policy description
@@ -399,9 +476,10 @@ def make_after_log(slug: str):
         if not _logger.isEnabledFor(after_level):
             return
 
+        show_trace = bool(get_settings().DEEPEVAL_LOG_STACK_TRACES)
         exc_info = (
             (type(exc), exc, getattr(exc, "__traceback__", None))
-            if
+            if show_trace
            else None
        )
 
@@ -416,7 +494,7 @@ def make_after_log(slug: str):
     return _after
 
 
-def _make_timeout_error(timeout_seconds: float) -> TimeoutError:
+def _make_timeout_error(timeout_seconds: float) -> asyncio.TimeoutError:
     settings = get_settings()
     if logger.isEnabledFor(logging.DEBUG):
         logger.debug(
@@ -427,12 +505,12 @@ def _make_timeout_error(timeout_seconds: float) -> TimeoutError:
         )
     msg = (
         f"call timed out after {timeout_seconds:g}s (per attempt). "
-        "Increase
+        "Increase DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE (None disables) or reduce work per attempt."
     )
-    return TimeoutError(msg)
+    return asyncio.TimeoutError(msg)
 
 
-def _run_sync_with_timeout(func, timeout_seconds, *args, **kwargs):
+def run_sync_with_timeout(func, timeout_seconds, *args, **kwargs):
     """
     Run a synchronous callable with a soft timeout enforced by a helper thread,
     with a global cap on concurrent timeout-workers.
@@ -499,9 +577,11 @@ def _run_sync_with_timeout(func, timeout_seconds, *args, **kwargs):
     done = threading.Event()
     result = {"value": None, "exc": None}
 
+    context = copy_context()
+
    def target():
        try:
-            result["value"] = func
+            result["value"] = context.run(func, *args, **kwargs)
        except BaseException as e:
            result["exc"] = e
        finally:
@@ -562,37 +642,40 @@ def create_retry_decorator(provider: Provider):
 
         @functools.wraps(func)
         async def attempt(*args, **kwargs):
-
-
-
+            if _is_budget_spent():
+                raise _make_timeout_error(0)
+
+            per_attempt_timeout = resolve_effective_attempt_timeout()
+
             coro = func(*args, **kwargs)
-            if
+            if per_attempt_timeout > 0:
                 try:
-                    return await asyncio.wait_for(coro,
-                except asyncio.TimeoutError as e:
+                    return await asyncio.wait_for(coro, per_attempt_timeout)
+                except (asyncio.TimeoutError, TimeoutError) as e:
                     if (
                         logger.isEnabledFor(logging.DEBUG)
                         and get_settings().DEEPEVAL_VERBOSE_MODE is True
                     ):
                         logger.debug(
                             "async timeout after %.3fs (active_threads=%d, tasks=%d)",
-
+                            per_attempt_timeout,
                             threading.active_count(),
                             len(asyncio.all_tasks()),
                         )
-                    raise _make_timeout_error(
+                    raise _make_timeout_error(per_attempt_timeout) from e
             return await coro
 
         return base_retry(attempt)
 
         @functools.wraps(func)
         def attempt(*args, **kwargs):
-
-
-
-
-
-
+            if _is_budget_spent():
+                raise _make_timeout_error(0)
+
+            per_attempt_timeout = resolve_effective_attempt_timeout()
+            if per_attempt_timeout > 0:
+                return run_sync_with_timeout(
+                    func, per_attempt_timeout, *args, **kwargs
                )
            return func(*args, **kwargs)
 
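The docstrings added above ask callers to pair `set_outer_deadline` with `reset_outer_deadline` in a `finally` block; each retry attempt is then capped at the smaller of the per-attempt timeout and the remaining budget, and `copy_context()` carries the same deadline into the helper thread used for synchronous calls. A minimal sketch of that pairing (the `call_with_budget` wrapper is a hypothetical example, not a deepeval API):

    import asyncio

    from deepeval.models.retry_policy import (
        reset_outer_deadline,
        set_outer_deadline,
    )

    async def call_with_budget(coro_factory, budget_seconds: float):
        # Everything awaited inside this block shares one time budget.
        token = set_outer_deadline(budget_seconds)
        try:
            return await asyncio.wait_for(coro_factory(), budget_seconds)
        finally:
            # Restore whatever deadline (if any) was in effect before.
            reset_outer_deadline(token)

    # usage: asyncio.run(call_with_budget(lambda: some_provider_call(), 30.0))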
deepeval/simulator/conversation_simulator.py
CHANGED
@@ -35,7 +35,6 @@ class ConversationSimulator:
         self,
         model_callback: Callable[[str], str],
         simulator_model: Optional[Union[str, DeepEvalBaseLLM]] = None,
-        opening_message: Optional[str] = None,
         max_concurrent: int = 5,
         async_mode: bool = True,
         language: str = "English",
@@ -45,7 +44,6 @@ class ConversationSimulator:
         self.is_callback_async = inspect.iscoroutinefunction(
             self.model_callback
         )
-        self.opening_message = opening_message
         self.semaphore = asyncio.Semaphore(max_concurrent)
         self.async_mode = async_mode
         self.language = language
@@ -68,6 +66,9 @@ class ConversationSimulator:
         self,
         conversational_goldens: List[ConversationalGolden],
         max_user_simulations: int = 10,
+        on_simulation_complete: Optional[
+            Callable[[ConversationalTestCase, int], None]
+        ] = None,
     ) -> List[ConversationalTestCase]:
         self.simulation_cost = 0 if self.using_native_model else None
 
@@ -87,6 +88,7 @@ class ConversationSimulator:
                 self._a_simulate(
                     conversational_goldens=conversational_goldens,
                     max_user_simulations=max_user_simulations,
+                    on_simulation_complete=on_simulation_complete,
                     progress=progress,
                     pbar_id=pbar_id,
                 )
@@ -103,6 +105,7 @@ class ConversationSimulator:
                         index=conversation_index,
                         progress=progress,
                         pbar_id=pbar_id,
+                        on_simulation_complete=on_simulation_complete,
                     )
                 )
                 conversational_test_cases.append(conversational_test_case)
@@ -115,6 +118,9 @@ class ConversationSimulator:
         self,
         conversational_goldens: List[ConversationalGolden],
         max_user_simulations: int,
+        on_simulation_complete: Optional[
+            Callable[[ConversationalTestCase, int], None]
+        ] = None,
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
     ) -> List[ConversationalTestCase]:
@@ -131,6 +137,7 @@ class ConversationSimulator:
                 index=conversation_index,
                 progress=progress,
                 pbar_id=pbar_id,
+                on_simulation_complete=on_simulation_complete,
             )
 
         tasks = [
@@ -150,6 +157,9 @@ class ConversationSimulator:
         index: int,
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
+        on_simulation_complete: Optional[
+            Callable[[ConversationalTestCase, int], None]
+        ] = None,
     ) -> ConversationalTestCase:
         simulation_counter = 0
         if max_user_simulations <= 0:
@@ -166,8 +176,6 @@ class ConversationSimulator:
         user_input = None
         thread_id = str(uuid.uuid4())
         turns: List[Turn] = []
-        if self.opening_message and golden.turns is None:
-            turns.append(Turn(role="assistant", content=self.opening_message))
 
         if golden.turns is not None:
             turns.extend(golden.turns)
@@ -187,11 +195,7 @@ class ConversationSimulator:
             if simulation_counter >= max_user_simulations:
                 update_pbar(progress, pbar_max_user_simluations_id)
                 break
-            if len(turns) == 0
-                len(turns) == 1
-                and self.opening_message
-                and golden.turns is None
-            ):
+            if len(turns) == 0:
                 # Generate first user input
                 user_input = self.generate_first_user_input(golden)
                 turns.append(Turn(role="user", content=user_input))
@@ -225,7 +229,7 @@ class ConversationSimulator:
             turns.append(turn)
 
         update_pbar(progress, pbar_id)
-
+        conversational_test_case = ConversationalTestCase(
             turns=turns,
             scenario=golden.scenario,
             expected_outcome=golden.expected_outcome,
@@ -241,6 +245,9 @@ class ConversationSimulator:
             _dataset_alias=golden._dataset_alias,
             _dataset_id=golden._dataset_id,
         )
+        if on_simulation_complete:
+            on_simulation_complete(conversational_test_case, index)
+        return conversational_test_case
 
     async def _a_simulate_single_conversation(
         self,
@@ -249,6 +256,9 @@ class ConversationSimulator:
         index: Optional[int] = None,
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
+        on_simulation_complete: Optional[
+            Callable[[ConversationalTestCase, int], None]
+        ] = None,
     ) -> ConversationalTestCase:
         simulation_counter = 0
         if max_user_simulations <= 0:
@@ -265,8 +275,6 @@ class ConversationSimulator:
         user_input = None
         thread_id = str(uuid.uuid4())
         turns: List[Turn] = []
-        if self.opening_message and golden.turns is None:
-            turns.append(Turn(role="assistant", content=self.opening_message))
 
         if golden.turns is not None:
             turns.extend(golden.turns)
@@ -286,11 +294,7 @@ class ConversationSimulator:
             if simulation_counter >= max_user_simulations:
                 update_pbar(progress, pbar_max_user_simluations_id)
                 break
-            if len(turns) == 0
-                len(turns) == 1
-                and self.opening_message
-                and golden.turns is None
-            ):
+            if len(turns) == 0:
                 # Generate first user input
                 user_input = await self.a_generate_first_user_input(golden)
                 turns.append(Turn(role="user", content=user_input))
@@ -324,7 +328,7 @@ class ConversationSimulator:
             turns.append(turn)
 
         update_pbar(progress, pbar_id)
-
+        conversational_test_case = ConversationalTestCase(
             turns=turns,
             scenario=golden.scenario,
             expected_outcome=golden.expected_outcome,
@@ -340,6 +344,9 @@ class ConversationSimulator:
             _dataset_alias=golden._dataset_alias,
             _dataset_id=golden._dataset_id,
        )
+        if on_simulation_complete:
+            on_simulation_complete(conversational_test_case, index)
+        return conversational_test_case
 
     ############################################
     ### Generate User Inputs ###################
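Threading `on_simulation_complete` through `_a_simulate`, `_simulate_single_conversation`, and `_a_simulate_single_conversation` means the callback fires as each conversation finishes rather than after the whole batch. A hedged usage sketch, assuming the public entry point is `ConversationSimulator.simulate` and that `ConversationalGolden` accepts the fields shown (the callback and golden contents here are illustrative):

    from deepeval.dataset import ConversationalGolden
    from deepeval.simulator import ConversationSimulator
    from deepeval.test_case import ConversationalTestCase

    def chatbot_callback(user_input: str) -> str:
        # stand-in for your real application callback
        return f"echo: {user_input}"

    def on_done(test_case: ConversationalTestCase, index: int) -> None:
        # called as soon as conversation `index` finishes simulating
        print(f"conversation {index}: {len(test_case.turns)} turns")

    simulator = ConversationSimulator(model_callback=chatbot_callback)
    test_cases = simulator.simulate(
        conversational_goldens=[
            ConversationalGolden(
                scenario="User asks for a refund on a late order.",
                expected_outcome="Agent offers a refund or store credit.",
            )
        ],
        max_user_simulations=3,
        on_simulation_complete=on_done,   # new parameter per the diff above
    )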
deepeval/synthesizer/chunking/context_generator.py
CHANGED
@@ -249,8 +249,16 @@ class ContextGenerator:
 
             except Exception as exc:
                 # record and continue with other docs
+                show_trace = bool(get_settings().DEEPEVAL_LOG_STACK_TRACES)
+                exc_info = (
+                    (type(exc), exc, getattr(exc, "__traceback__", None))
+                    if show_trace
+                    else None
+                )
                 logger.exception(
-                    "Document pipeline failed for %s",
+                    "Document pipeline failed for %s",
+                    path,
+                    exc_info=exc_info,
                 )
             finally:
                 # drop the collection asap to avoid too many open collections
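Both this hunk and the `make_after_log` change in retry_policy.py gate full tracebacks behind the `DEEPEVAL_LOG_STACK_TRACES` setting, passing an `exc_info` tuple only when it is enabled. A simplified standalone sketch of the same pattern (reading an environment variable directly instead of deepeval's `get_settings()`):

    import logging
    import os

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("pipeline")

    def log_failure(path: str, exc: Exception) -> None:
        # attach the traceback only when the operator opts in
        show_trace = os.getenv("DEEPEVAL_LOG_STACK_TRACES", "0") == "1"
        exc_info = (type(exc), exc, exc.__traceback__) if show_trace else None
        logger.error("Document pipeline failed for %s", path, exc_info=exc_info)

    try:
        raise ValueError("bad document")
    except ValueError as exc:
        log_failure("docs/report.pdf", exc)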
{deepeval-3.6.8.dist-info → deepeval-3.6.9.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 deepeval/__init__.py,sha256=IqShG98ALpA1gm_qL2Jq56AJoafAHpcUTSvpgH4HpZM,3062
-deepeval/_version.py,sha256=
+deepeval/_version.py,sha256=WydMbWwXQz50U1Fi-HlskQWK159gVx6D6BKpIHCo2A8,27
 deepeval/annotation/__init__.py,sha256=ZFhUVNNuH_YgQSZJ-m5E9iUb9TkAkEV33a6ouMDZ8EI,111
 deepeval/annotation/annotation.py,sha256=3j3-syeJepAcEj3u3e4T_BeRDzNr7yXGDIoNQGMKpwQ,2298
 deepeval/annotation/api.py,sha256=EYN33ACVzVxsFleRYm60KB4Exvff3rPJKt1VBuuX970,2147
@@ -142,25 +142,25 @@ deepeval/confident/api.py,sha256=V3TaJzbc7SEf9ZZCKyiSbK7rI6hzv2D7ojD54aVrvq4,856
 deepeval/confident/types.py,sha256=-slFhDof_1maMgpLxqDRZv6kz6ZVY2hP_0uj_aveJKU,533
 deepeval/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deepeval/config/logging.py,sha256=ivqmhOSB-oHOOU3MvnhImrZwkkxzxKJgoKxesnWfHjg,1314
-deepeval/config/settings.py,sha256=
+deepeval/config/settings.py,sha256=FT9BtRQGMYrvI5-gV3a5HaNSZi0vEWqp2xi_gl56JQU,30920
 deepeval/config/settings_manager.py,sha256=enahSZN8krRu7-L94OBCt99fwUIqQtMRL97PlzsuKEY,4021
-deepeval/config/utils.py,sha256=
+deepeval/config/utils.py,sha256=bWeXj3fhGnLHIr_GuYaTp30x_PUOSAWJeS3lbZ4i3R8,3972
 deepeval/constants.py,sha256=J5rNXGsMKTFYJ_9Wi49qchZXuUityZjnvuy3I3TO5zk,1667
 deepeval/contextvars.py,sha256=oqXtuYiKd4Zvc1rNoR1gcRBxzZYCGTMVn7XostwvkRI,524
 deepeval/dataset/__init__.py,sha256=N2c-rkuxWYiiJSOZArw0H02Cwo7cnfzFuNYJlvsIBEg,249
 deepeval/dataset/api.py,sha256=ZxkEqAF4nZH_Ys_1f5r9N2LFI_vBcAJxt8eJm7Mplpw,831
-deepeval/dataset/dataset.py,sha256=
+deepeval/dataset/dataset.py,sha256=Nx0Nr12_AGjOOOmmAMaC6YIX62HgK8T86FtcL9IrsF4,57798
 deepeval/dataset/golden.py,sha256=T-rTk4Hw1tANx_Iimv977F6Y4QK3s5OIB4PecU5FJDM,2338
 deepeval/dataset/test_run_tracer.py,sha256=5CdpDvhzkEEBRyqWi6egocaxiN6IRS3XfbACxEQZQeM,2544
 deepeval/dataset/types.py,sha256=CWeOIBPK2WdmRUqjFa9gfN-w2da0r8Ilzl3ToDpJQoQ,558
-deepeval/dataset/utils.py,sha256=
+deepeval/dataset/utils.py,sha256=MRiqwt-3E5WNCHtP2kY7P1PeRtFMRpGoy3r75tJ2QFg,7910
 deepeval/errors.py,sha256=FfhtULNIQqHpKVqCr-xlvTtLxkNj40qVU89sXYKuDrA,754
 deepeval/evaluate/__init__.py,sha256=315IaMiYEz7oJhZ4kPTBfeCNd1xF-wWVU6KOQnrKQpE,291
 deepeval/evaluate/api.py,sha256=rkblH0ZFAAdyuF0Ymh7JE1pIJPR9yFuPrn9SQaCEQp4,435
 deepeval/evaluate/compare.py,sha256=tdSJY4E7YJ_zO3dzvpwngZHLiUI2YQcTWJOLI83htsQ,9855
 deepeval/evaluate/configs.py,sha256=QfWjaWNxLsgEe8-5j4PIs5WcSyEckiWt0qdpXSpl57M,928
 deepeval/evaluate/evaluate.py,sha256=CLc-5rlHcBkJcakYXf9twaF6G8chp95gCBe8V4B-cVo,10684
-deepeval/evaluate/execute.py,sha256=
+deepeval/evaluate/execute.py,sha256=cXX4W4jX_Uly73WBLEduLv6Dqs32AWHGsgyp9tZmV1Q,134219
 deepeval/evaluate/types.py,sha256=6w-0v8Rh10-qceP4iWe0yMTlee4J3immWAuqDS8LF2o,939
 deepeval/evaluate/utils.py,sha256=RRU6TOeRBq8jTfR1rgikybsMu4t8UZiDj2yqNQLhp8w,20631
 deepeval/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -255,7 +255,7 @@ deepeval/metrics/hallucination/__init__.py,sha256=rCVlHi2UGzDKmZKi0esFLafmshVBx2
 deepeval/metrics/hallucination/hallucination.py,sha256=3iBEw-tTpU_6RB80i2WQHMRwgPtq2nsO89ceOhuUf94,10148
 deepeval/metrics/hallucination/schema.py,sha256=V8xbrBLMwJfre-lPuDc7rMEdhHf_1hfgoW1jE_ULvAY,286
 deepeval/metrics/hallucination/template.py,sha256=hiss1soxSBFqzOt0KmHZdZUzoQsmXnslDyb8HsjALPs,2620
-deepeval/metrics/indicator.py,sha256=
+deepeval/metrics/indicator.py,sha256=3Qj6eWHlyVpl5kxSozXJ6dDer64jVjZhaGD92Yse56s,11032
 deepeval/metrics/json_correctness/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deepeval/metrics/json_correctness/json_correctness.py,sha256=FmI_YSrSPhdUJ-G1txN1gD_--ACNcDWuI8yrV3OLslU,7833
 deepeval/metrics/json_correctness/schema.py,sha256=VenN3E6_abO3IdWGdTB1ZfcFpGR9suzNkMxjr-h4I0k,94
@@ -399,7 +399,7 @@ deepeval/models/embedding_models/ollama_embedding_model.py,sha256=jW1fiAPi650WoU
 deepeval/models/embedding_models/openai_embedding_model.py,sha256=XhtnWr-6uedxqI_iriTuKA2dyuB1MXylEnTkHbKVxwY,3463
 deepeval/models/hallucination_model.py,sha256=ABi978VKLE_jNHbDzM96kJ08EsZ5ZlvOlJHA_ptSkfQ,1003
 deepeval/models/llms/__init__.py,sha256=qmvv7wnmTDvys2uUTwQRo-_3DlFV3fGLiewPeQYRsAI,670
-deepeval/models/llms/amazon_bedrock_model.py,sha256=
+deepeval/models/llms/amazon_bedrock_model.py,sha256=XOethRC3h5nHieawJQryawMdwGXT1OXzoFKNMMGr9zs,5314
 deepeval/models/llms/anthropic_model.py,sha256=5gYRNkYUD7Zl3U0SibBG2YGCQsD6DdTsaBhqdaJlKIw,6072
 deepeval/models/llms/azure_model.py,sha256=dqINcfoJNqdd9zh5iTPwQ_ToGMOF7iH6YUB-UWRSOlc,10730
 deepeval/models/llms/deepseek_model.py,sha256=EqBJkKa7rXppCmlnIt_D-Z_r9fbsOUsOAVvN2jWA-Hk,6404
@@ -409,13 +409,13 @@ deepeval/models/llms/kimi_model.py,sha256=ldTefdSVitZYJJQ-_ZsP87iiT5iZ4QCVdfi-Yz
 deepeval/models/llms/litellm_model.py,sha256=iu4-_JCpd9LdEa-eCWseD2iLTA-r7OSgYGWQ0IxB4eA,11527
 deepeval/models/llms/local_model.py,sha256=hEyKVA6pkQm9dICUKsMNgjVI3w6gnyMdmBt_EylkWDk,4473
 deepeval/models/llms/ollama_model.py,sha256=xPO4d4jMY-cQAyHAcMuFvWS8JMWwCUbKP9CMi838Nuc,3307
-deepeval/models/llms/openai_model.py,sha256=
+deepeval/models/llms/openai_model.py,sha256=1rjwbyt87fK03pw7r5tq3PjUVfl2EWllAssGyy6Dt2A,17494
 deepeval/models/llms/utils.py,sha256=gFM_8eIvdSwN_D4Yqp-j7PkfoiRn_bgu7tlCHol3A6c,1324
 deepeval/models/mlllms/__init__.py,sha256=19nN6kUB5XI0nUWUQX0aD9GBUMM8WWGvsDgKjuT4EF4,144
 deepeval/models/mlllms/gemini_model.py,sha256=7tHIWD4w_fBz3L7jkKWygn1QpBPk9nl2Kw-yb0Jc3PI,10167
 deepeval/models/mlllms/ollama_model.py,sha256=_YtYtw8oIMVVI-CFsDicsdeEJUPhw_9ArPxB_1olsJA,4798
 deepeval/models/mlllms/openai_model.py,sha256=KgvYgQwWZ1A_Gcl6-4-W7IMqbUF9K8sNY37j5Ag7kQQ,9014
-deepeval/models/retry_policy.py,sha256=
+deepeval/models/retry_policy.py,sha256=sCluLDM0kshOhGwQzeA80dDXXc2Oc8TXyWMCG1kEUGE,34384
 deepeval/models/summac_model.py,sha256=wKeH7pWQRXrTlzlIw_r1YCb8b7jUhWq6jUz9FiNUCSg,1992
 deepeval/models/unbias_model.py,sha256=umOMhQLTmnD7uOuhiQufEl4Wlti4q2s3EtKOpds7zhs,597
 deepeval/models/utils.py,sha256=-3XDgg1U7PZ0jpLFiYXxqdBhp7idvlo7RPZv5SoD8lc,1130
@@ -442,14 +442,14 @@ deepeval/red_teaming/README.md,sha256=BY5rAdpp3-sMMToEKwq0Nsd9ivkGDzPE16DeDb8GY7
 deepeval/scorer/__init__.py,sha256=hTvtoV3a4l0dSBjERm-jX7jveTtKZXK0c9JerQo0T_w,27
 deepeval/scorer/scorer.py,sha256=EmXo1wEMMAL2it8WxNJ4cTqZLCH1ad4BY2VewoX6b10,18348
 deepeval/simulator/__init__.py,sha256=wkyevg9nh46rsVnVrBjY3K5bHlkqjwx4TtrTfyjDCO0,96
-deepeval/simulator/conversation_simulator.py,sha256=
+deepeval/simulator/conversation_simulator.py,sha256=DhpxlarJbf-CB-kID0pwN1AiWKzjtk4sEskcIddBst0,24639
 deepeval/simulator/schema.py,sha256=16X2-m92plP52YTd-dvECt_-6gsz0U4j7Ut3UdI6gKY,252
 deepeval/simulator/template.py,sha256=5QLrxvDjHJpcudg-rGeLaUKGCDdHf_2QhlYBL9FJKxE,5869
 deepeval/singleton.py,sha256=irNbt0-IRI7rD4t05OZHsrNovpeva0XPc8PoieFytG8,532
 deepeval/synthesizer/__init__.py,sha256=eMihIQH-NClWL-8CZ9HUs9GA1Cy_DV_YX_f1o4bpO2c,142
 deepeval/synthesizer/base_synthesizer.py,sha256=ua7HDq1lemeFH5FuWdGJxMGFH5QA2CSYkekk4bpOHcU,675
 deepeval/synthesizer/chunking/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deepeval/synthesizer/chunking/context_generator.py,sha256=
+deepeval/synthesizer/chunking/context_generator.py,sha256=ucwa-7BcPSHHf4Tu31dfyJ63Bg9HdNhGLAW5_QsfFX4,38052
 deepeval/synthesizer/chunking/doc_chunker.py,sha256=DvJmbA_NnZBGCZgxKQsxlIC29kca_d7J-Dxq9SyfzX0,10612
 deepeval/synthesizer/config.py,sha256=vcSi6upnmd667dAGANTTdPmY0z5sQ8Ctal7Xr4-tbhA,1934
 deepeval/synthesizer/schema.py,sha256=PIv3012VMg_v-Ylwn08-4tNjf4QShBSg-kaCkgtdA88,879
@@ -495,8 +495,8 @@ deepeval/tracing/tracing.py,sha256=iMpi5Q8e1VYuTLshzWEHuZC7ysuNFYwRCVBn1jX-qUg,4
 deepeval/tracing/types.py,sha256=QZdINiuLCT2oqaCoiaClOtL6m3Wzi8mN-ajZd09XHqw,5948
 deepeval/tracing/utils.py,sha256=pW_UJMzyeR1PWYZuZrPeVGGYiL6FRpFE3Ehbz9uhw_M,5525
 deepeval/utils.py,sha256=g4vO7E6xvdKtp1x84NXXX5Ifm3c0OMIA0IaiP2U5a-s,22079
-deepeval-3.6.
-deepeval-3.6.
-deepeval-3.6.
-deepeval-3.6.
-deepeval-3.6.
+deepeval-3.6.9.dist-info/LICENSE.md,sha256=0ATkuLv6QgsJTBODUHC5Rak_PArA6gv2t7inJzNTP38,11352
+deepeval-3.6.9.dist-info/METADATA,sha256=u-OgJTHDKHRn88hnnDfdkT_swHC4tJRc3S3crwFIiss,18754
+deepeval-3.6.9.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+deepeval-3.6.9.dist-info/entry_points.txt,sha256=fVr8UphXTfJe9I2rObmUtfU3gkSrYeM0pLy-NbJYg10,94
+deepeval-3.6.9.dist-info/RECORD,,
{deepeval-3.6.8.dist-info → deepeval-3.6.9.dist-info}/LICENSE.md
File without changes
{deepeval-3.6.8.dist-info → deepeval-3.6.9.dist-info}/WHEEL
File without changes
{deepeval-3.6.8.dist-info → deepeval-3.6.9.dist-info}/entry_points.txt
File without changes