deepeval 3.6.8__py3-none-any.whl → 3.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/anthropic/__init__.py +19 -0
- deepeval/anthropic/extractors.py +94 -0
- deepeval/anthropic/patch.py +169 -0
- deepeval/anthropic/utils.py +225 -0
- deepeval/benchmarks/drop/drop.py +40 -14
- deepeval/benchmarks/ifeval/ifeval.py +2 -2
- deepeval/confident/types.py +4 -2
- deepeval/config/settings.py +258 -47
- deepeval/config/settings_manager.py +4 -0
- deepeval/config/utils.py +5 -0
- deepeval/dataset/dataset.py +162 -30
- deepeval/dataset/utils.py +41 -13
- deepeval/evaluate/execute.py +1099 -633
- deepeval/integrations/crewai/handler.py +36 -0
- deepeval/integrations/langchain/callback.py +27 -2
- deepeval/integrations/llama_index/handler.py +58 -4
- deepeval/integrations/llama_index/utils.py +24 -0
- deepeval/metrics/__init__.py +5 -0
- deepeval/metrics/exact_match/__init__.py +0 -0
- deepeval/metrics/exact_match/exact_match.py +94 -0
- deepeval/metrics/indicator.py +21 -1
- deepeval/metrics/pattern_match/__init__.py +0 -0
- deepeval/metrics/pattern_match/pattern_match.py +103 -0
- deepeval/metrics/task_completion/task_completion.py +9 -2
- deepeval/model_integrations/__init__.py +0 -0
- deepeval/model_integrations/utils.py +116 -0
- deepeval/models/base_model.py +3 -1
- deepeval/models/llms/amazon_bedrock_model.py +20 -17
- deepeval/models/llms/openai_model.py +10 -1
- deepeval/models/retry_policy.py +103 -20
- deepeval/openai/__init__.py +3 -1
- deepeval/openai/extractors.py +2 -2
- deepeval/openai/utils.py +7 -31
- deepeval/prompt/api.py +11 -10
- deepeval/prompt/prompt.py +5 -4
- deepeval/simulator/conversation_simulator.py +25 -18
- deepeval/synthesizer/chunking/context_generator.py +9 -1
- deepeval/telemetry.py +3 -3
- deepeval/test_case/llm_test_case.py +3 -2
- deepeval/test_run/api.py +3 -2
- deepeval/test_run/cache.py +4 -3
- deepeval/test_run/test_run.py +24 -5
- deepeval/tracing/api.py +11 -10
- deepeval/tracing/otel/exporter.py +11 -0
- deepeval/tracing/patchers.py +102 -1
- deepeval/tracing/trace_context.py +13 -4
- deepeval/tracing/tracing.py +10 -1
- deepeval/tracing/types.py +8 -8
- deepeval/tracing/utils.py +9 -0
- deepeval/utils.py +44 -2
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/METADATA +2 -2
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/RECORD +57 -47
- /deepeval/{openai → model_integrations}/types.py +0 -0
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/WHEEL +0 -0
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/entry_points.txt +0 -0
deepeval/models/llms/amazon_bedrock_model.py
CHANGED

@@ -76,23 +76,26 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
     ) -> Tuple[Union[str, Dict], float]:
-        … (17 removed lines; their content is not rendered in this diff view)
+        try:
+            payload = self.get_converse_request_body(prompt)
+            client = await self._ensure_client()
+            response = await client.converse(
+                modelId=self.model_id,
+                messages=payload["messages"],
+                inferenceConfig=payload["inferenceConfig"],
+            )
+            message = response["output"]["message"]["content"][0]["text"]
+            cost = self.calculate_cost(
+                response["usage"]["inputTokens"],
+                response["usage"]["outputTokens"],
+            )
+            if schema is None:
+                return message, cost
+            else:
+                json_output = trim_and_load_json(message)
+                return schema.model_validate(json_output), cost
+        finally:
+            await self.close()
 
     ###############################################
     # Client management
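For reference, the Converse response shape the new a_generate consumes can be exercised in isolation. The sketch below builds a response dict mirroring the fields read in the hunk above (output.message.content[0].text and usage.inputTokens/outputTokens); the Verdict model and literal values are illustrative only.

from pydantic import BaseModel

class Verdict(BaseModel):
    verdict: str

# Shape mirrors what bedrock-runtime's converse() returns and what a_generate reads.
response = {
    "output": {"message": {"content": [{"text": '{"verdict": "yes"}'}]}},
    "usage": {"inputTokens": 42, "outputTokens": 7},
}

message = response["output"]["message"]["content"][0]["text"]
tokens = (response["usage"]["inputTokens"], response["usage"]["outputTokens"])

# With schema=None, a_generate returns (text, cost); with a schema it trims and
# parses the JSON reply, then validates it into the model.
parsed = Verdict.model_validate_json(message)
print(message, tokens, parsed)
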
deepeval/models/llms/openai_model.py
CHANGED

@@ -8,6 +8,7 @@ from openai import (
     AsyncOpenAI,
 )
 
+from deepeval.config.settings import get_settings
 from deepeval.constants import ProviderSlug as PS
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.models.llms.utils import trim_and_load_json

@@ -209,6 +210,11 @@ models_requiring_temperature_1 = [
 ]
 
 
+def _request_timeout_seconds() -> float:
+    timeout = float(get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
+    return timeout if timeout > 0 else 30.0
+
+
 class GPTModel(DeepEvalBaseLLM):
     def __init__(
         self,

@@ -387,7 +393,6 @@ class GPTModel(DeepEvalBaseLLM):
            )
            return schema.model_validate(json_output), cost
 
-        client: AsyncOpenAI
        completion = await client.chat.completions.create(
            model=self.model_name,
            messages=[{"role": "user", "content": prompt}],

@@ -501,9 +506,13 @@ class GPTModel(DeepEvalBaseLLM):
        kwargs = dict(self.kwargs or {})
        if not sdk_retries_for(PS.OPENAI):
            kwargs["max_retries"] = 0
+
+       if not kwargs.get("timeout"):
+           kwargs["timeout"] = _request_timeout_seconds()
        return kwargs
 
    def _build_client(self, cls):
+
        kw = dict(
            api_key=self._openai_api_key,
            base_url=self.base_url,
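The net effect of _request_timeout_seconds is that a per-attempt timeout of 0 or None falls back to a 30-second default, which is only injected into the client kwargs when the caller has not supplied a timeout. A minimal sketch of that logic, with the settings lookup replaced by a plain argument and the kwargs-building method name assumed:

from typing import Optional

def request_timeout_seconds(per_attempt: Optional[float]) -> float:
    # Mirrors _request_timeout_seconds: 0/None fall back to 30 s.
    timeout = float(per_attempt or 0)
    return timeout if timeout > 0 else 30.0

kwargs = {"max_retries": 0}            # as built by the client-kwargs helper (name assumed)
if not kwargs.get("timeout"):
    kwargs["timeout"] = request_timeout_seconds(None)
assert kwargs["timeout"] == 30.0
# The resulting kwargs are forwarded to OpenAI(...) / AsyncOpenAI(...), both of
# which accept a `timeout` keyword argument.
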
deepeval/models/retry_policy.py
CHANGED

@@ -39,6 +39,7 @@ import itertools
 import functools
 import threading
 import logging
+import time
 
 from dataclasses import dataclass, field
 from typing import Callable, Iterable, Mapping, Optional, Sequence, Tuple, Union

@@ -52,6 +53,7 @@ from tenacity import (
 )
 from tenacity.stop import stop_base
 from tenacity.wait import wait_base
+from contextvars import ContextVar, copy_context
 
 from deepeval.constants import (
     ProviderSlug as PS,
@@ -65,6 +67,81 @@ Provider = Union[str, PS]
 _MAX_TIMEOUT_THREADS = get_settings().DEEPEVAL_TIMEOUT_THREAD_LIMIT
 _TIMEOUT_SEMA = threading.BoundedSemaphore(_MAX_TIMEOUT_THREADS)
 _WORKER_ID = itertools.count(1)
+_OUTER_DEADLINE = ContextVar("deepeval_outer_deadline", default=None)
+
+
+def set_outer_deadline(seconds: float | None):
+    """Set (or clear) the outer task time budget.
+
+    Stores a deadline in a local context variable so nested code
+    can cooperatively respect a shared budget. Always pair this with
+    `reset_outer_deadline(token)` in a `finally` block.
+
+    Args:
+        seconds: Number of seconds from now to set as the deadline. If `None`,
+            `0`, or a non-positive value is provided, the deadline is cleared.
+
+    Returns:
+        contextvars.Token: The token returned by the underlying ContextVar `.set()`
+            call, which must be passed to `reset_outer_deadline` to restore the
+            previous value.
+    """
+    if seconds and seconds > 0:
+        return _OUTER_DEADLINE.set(time.monotonic() + seconds)
+    return _OUTER_DEADLINE.set(None)
+
+
+def reset_outer_deadline(token):
+    """Restore the previous outer deadline set by `set_outer_deadline`.
+
+    This should be called in a `finally` block to ensure the deadline
+    is restored even if an exception occurs.
+
+    Args:
+        token: The `contextvars.Token` returned by `set_outer_deadline`.
+    """
+    if token is not None:
+        _OUTER_DEADLINE.reset(token)
+
+
+def _remaining_budget() -> float | None:
+    dl = _OUTER_DEADLINE.get()
+    if dl is None:
+        return None
+    return max(0.0, dl - time.monotonic())
+
+
+def _is_budget_spent() -> bool:
+    rem = _remaining_budget()
+    return rem is not None and rem <= 0.0
+
+
+def resolve_effective_attempt_timeout():
+    """Resolve the timeout to use for a single provider attempt.
+
+    Combines the configured per-attempt timeout with any remaining outer budget:
+    - If `DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS` is `0` or `None`, returns `0`;
+      callers should skip `asyncio.wait_for` in this case and rely on the outer cap.
+    - If positive and an outer deadline is present, returns
+      `min(per_attempt, remaining_budget)`.
+    - If positive and no outer deadline is present, returns `per_attempt`.
+
+    Returns:
+        float: Seconds to use for the inner per-attempt timeout. `0` means
+            disable inner timeout and rely on the outer budget instead.
+    """
+    per_attempt = float(
+        get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0
+    )
+    # 0 or None disable inner wait_for. That means rely on outer task cap for timeouts instead.
+    if per_attempt <= 0:
+        return 0
+    # If we do have a positive per-attempt, use up to remaining outer budget.
+    rem = _remaining_budget()
+    if rem is not None:
+        return max(0.0, min(per_attempt, rem))
+    return per_attempt
+
 
 # --------------------------
 # Policy description
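Taken together, these helpers give callers a cooperative time budget: an outer layer sets a deadline in a ContextVar, and each retry attempt trims its own timeout to whatever budget remains. A minimal usage sketch of the new public functions (the 30-second figure and call_provider are illustrative, not defaults):

import asyncio
from deepeval.models.retry_policy import (
    set_outer_deadline,
    reset_outer_deadline,
    resolve_effective_attempt_timeout,
)

async def call_provider():
    await asyncio.sleep(0.1)           # stand-in for a real LLM call

async def evaluate_with_budget():
    token = set_outer_deadline(30.0)   # the whole task must finish within ~30 s
    try:
        # Each attempt gets at most min(per-attempt setting, remaining budget).
        per_attempt = resolve_effective_attempt_timeout()
        if per_attempt > 0:
            await asyncio.wait_for(call_provider(), per_attempt)
        else:
            await call_provider()      # 0 means: rely on the outer cap only
    finally:
        reset_outer_deadline(token)    # always restore the previous deadline

asyncio.run(evaluate_with_budget())
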
@@ -399,9 +476,10 @@ def make_after_log(slug: str):
     if not _logger.isEnabledFor(after_level):
         return
 
+    show_trace = bool(get_settings().DEEPEVAL_LOG_STACK_TRACES)
     exc_info = (
         (type(exc), exc, getattr(exc, "__traceback__", None))
-        if …
+        if show_trace
         else None
     )
 

@@ -416,7 +494,7 @@ def make_after_log(slug: str):
     return _after
 
 
-def _make_timeout_error(timeout_seconds: float) -> TimeoutError:
+def _make_timeout_error(timeout_seconds: float) -> asyncio.TimeoutError:
     settings = get_settings()
     if logger.isEnabledFor(logging.DEBUG):
         logger.debug(

@@ -427,12 +505,12 @@ def _make_timeout_error(timeout_seconds: float) -> TimeoutError:
     )
     msg = (
         f"call timed out after {timeout_seconds:g}s (per attempt). "
-        "Increase …
+        "Increase DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE (None disables) or reduce work per attempt."
     )
-    return TimeoutError(msg)
+    return asyncio.TimeoutError(msg)
 
 
-def _run_sync_with_timeout(func, timeout_seconds, *args, **kwargs):
+def run_sync_with_timeout(func, timeout_seconds, *args, **kwargs):
     """
     Run a synchronous callable with a soft timeout enforced by a helper thread,
     with a global cap on concurrent timeout-workers.

@@ -499,9 +577,11 @@ def _run_sync_with_timeout(func, timeout_seconds, *args, **kwargs):
     done = threading.Event()
     result = {"value": None, "exc": None}
 
+    context = copy_context()
+
     def target():
         try:
-            result["value"] = func …
+            result["value"] = context.run(func, *args, **kwargs)
         except BaseException as e:
             result["exc"] = e
         finally:
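The copy_context() change matters because ContextVar values set in the caller (such as the outer deadline above) are not visible inside a freshly started thread unless the callable is run via the copied context. A stdlib-only sketch of the propagation issue; the names here are illustrative:

import threading
from contextvars import ContextVar, copy_context

budget = ContextVar("budget", default=None)

def work():
    # Sees the caller's ContextVar value only because the worker thread runs the
    # callable through a copied context, mirroring run_sync_with_timeout.
    return budget.get()

def run_in_thread(func):
    ctx = copy_context()               # snapshot of the caller's context
    result = {}
    t = threading.Thread(target=lambda: result.setdefault("value", ctx.run(func)))
    t.start()
    t.join()
    return result["value"]

budget.set(12.5)
assert run_in_thread(work) == 12.5     # without ctx.run this would be None
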
@@ -562,37 +642,40 @@ def create_retry_decorator(provider: Provider):
 
         @functools.wraps(func)
         async def attempt(*args, **kwargs):
-            … (3 removed lines not rendered in this diff view)
+            if _is_budget_spent():
+                raise _make_timeout_error(0)
+
+            per_attempt_timeout = resolve_effective_attempt_timeout()
+
             coro = func(*args, **kwargs)
-            if …
+            if per_attempt_timeout > 0:
                 try:
-                    return await asyncio.wait_for(coro, …
-                except asyncio.TimeoutError as e:
+                    return await asyncio.wait_for(coro, per_attempt_timeout)
+                except (asyncio.TimeoutError, TimeoutError) as e:
                     if (
                         logger.isEnabledFor(logging.DEBUG)
                         and get_settings().DEEPEVAL_VERBOSE_MODE is True
                     ):
                         logger.debug(
                             "async timeout after %.3fs (active_threads=%d, tasks=%d)",
-                            …
+                            per_attempt_timeout,
                             threading.active_count(),
                             len(asyncio.all_tasks()),
                         )
-                    raise _make_timeout_error( …
+                    raise _make_timeout_error(per_attempt_timeout) from e
             return await coro
 
         return base_retry(attempt)
 
         @functools.wraps(func)
         def attempt(*args, **kwargs):
-            … (6 removed lines not rendered in this diff view)
+            if _is_budget_spent():
+                raise _make_timeout_error(0)
+
+            per_attempt_timeout = resolve_effective_attempt_timeout()
+            if per_attempt_timeout > 0:
+                return run_sync_with_timeout(
+                    func, per_attempt_timeout, *args, **kwargs
                )
            return func(*args, **kwargs)
 
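The renamed run_sync_with_timeout is now used for the synchronous attempt path: per its docstring it runs a blocking callable in a helper thread under a soft timeout and returns its result (or raises the timeout error). A small usage sketch, assuming the signature shown above:

import time
from deepeval.models.retry_policy import run_sync_with_timeout

def slow_call(x):
    time.sleep(0.2)                  # stand-in for a blocking provider call
    return x * 2

# With a generous limit the result is returned; exceeding the limit raises the
# timeout error built by _make_timeout_error.
print(run_sync_with_timeout(slow_call, 5.0, 21))   # -> 42
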
deepeval/openai/__init__.py
CHANGED

@@ -15,5 +15,7 @@ except ImportError:
 
 if OpenAI or AsyncOpenAI:
     from deepeval.openai.patch import patch_openai_classes
+    from deepeval.telemetry import capture_tracing_integration
 
-    …
+    with capture_tracing_integration("openai"):
+        patch_openai_classes()
deepeval/openai/extractors.py
CHANGED

@@ -4,13 +4,13 @@ from typing import Any, Union, Dict
 from openai.types.responses import Response
 
 from deepeval.test_case.llm_test_case import ToolCall
+
+from deepeval.model_integrations.types import InputParameters, OutputParameters
 from deepeval.openai.utils import (
     render_response_input,
     stringify_multimodal_content,
     render_messages,
 )
-from deepeval.openai.types import InputParameters, OutputParameters
-from deepeval.tracing.types import Message
 
 
 # guarding against errors to be compatible with legacy APIs
deepeval/openai/utils.py
CHANGED

@@ -1,6 +1,6 @@
 import json
 import uuid
-from typing import Any, Dict, List, …
+from typing import Any, Dict, List, Iterable
 
 from openai.types.chat.chat_completion_message_param import (
     ChatCompletionMessageParam,

@@ -8,32 +8,8 @@ from openai.types.chat.chat_completion_message_param import (
 
 from deepeval.tracing.types import ToolSpan, TraceSpanStatus
 from deepeval.tracing.context import current_span_context
-from deepeval. …
-from deepeval. …
-
-
-_URL_MAX = 200
-_JSON_MAX = max(
-    len_long(), 400
-)  # <- make this bigger by increasing DEEPEVAL_MAXLEN_LONG above 400
-
-
-def _compact_dump(value: Any) -> str:
-    try:
-        dumped = json.dumps(
-            value, ensure_ascii=False, default=str, separators=(",", ":")
-        )
-    except Exception:
-        dumped = repr(value)
-    return shorten(dumped, max_len=_JSON_MAX)
-
-
-def _fmt_url(url: Optional[str]) -> str:
-    if not url:
-        return ""
-    if url.startswith("data:"):
-        return "[data-uri]"
-    return shorten(url, max_len=_URL_MAX)
+from deepeval.model_integrations.types import OutputParameters
+from deepeval.model_integrations.utils import compact_dump, fmt_url
 
 
 def create_child_tool_spans(output_parameters: OutputParameters):

@@ -111,7 +87,7 @@ def stringify_multimodal_content(content: Any) -> str:
             url = image_url
         else:
             url = (image_url or {}).get("url") or content.get("url")
-        return f"[image:{ …
+        return f"[image:{fmt_url(url)}]"
 
     # Responses API variants
     if t == "input_text":

@@ -122,14 +98,14 @@
             url = image_url
         else:
             url = (image_url or {}).get("url") or content.get("url")
-        return f"[image:{ …
+        return f"[image:{fmt_url(url)}]"
 
     # readability for other input_* types we don't currently handle
     if t and t.startswith("input_"):
         return f"[{t}]"
 
     # unknown dicts and types returned as shortened JSON
-    return …
+    return compact_dump(content)
 
 
 def render_messages(

@@ -228,7 +204,7 @@ def _render_content(content: Dict[str, Any], indent: int = 0) -> str:
             lines.append(f"{prefix}{key}:")
             lines.append(_render_content(value, indent + 1))
         elif isinstance(value, list):
-            lines.append(f"{prefix}{key}: { …
+            lines.append(f"{prefix}{key}: {compact_dump(value)}")
         else:
             lines.append(f"{prefix}{key}: {value}")
 
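The truncation helpers that were private to deepeval/openai/utils.py now live in deepeval/model_integrations/utils.py as compact_dump and fmt_url. Based on the removed implementations above, a hedged sketch of what they do; the exact caps and the shorten behavior in the new module are not shown in this diff:

import json

def compact_dump(value, max_len=400):
    # Serialize compactly; fall back to repr() for non-JSON-serializable values,
    # then truncate to keep span attributes small (cap mirrors _JSON_MAX above).
    try:
        dumped = json.dumps(value, ensure_ascii=False, default=str, separators=(",", ":"))
    except Exception:
        dumped = repr(value)
    return dumped if len(dumped) <= max_len else dumped[: max_len - 1] + "…"

def fmt_url(url, max_len=200):
    # Hide data: URIs entirely and shorten anything else (cap mirrors _URL_MAX above).
    if not url:
        return ""
    if url.startswith("data:"):
        return "[data-uri]"
    return url if len(url) <= max_len else url[: max_len - 1] + "…"

print(fmt_url("data:image/png;base64,AAAA"))           # -> [data-uri]
print(compact_dump({"type": "input_text", "text": "hi"}))
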
deepeval/prompt/api.py
CHANGED

@@ -1,8 +1,10 @@
-from pydantic import BaseModel, Field, AliasChoices
+from pydantic import BaseModel, Field, AliasChoices, ConfigDict
 from enum import Enum
 from typing import List, Optional
 from pydantic import TypeAdapter
 
+from deepeval.utils import make_model_config
+
 ###################################
 # Model Settings
 ###################################

@@ -92,6 +94,8 @@ class SchemaDataType(Enum):
 
 
 class OutputSchemaField(BaseModel):
+    model_config = make_model_config(use_enum_values=True)
+
     id: str
     type: SchemaDataType
     name: str

@@ -102,9 +106,6 @@ class OutputSchemaField(BaseModel):
         validation_alias=AliasChoices("parent_id", "parentId"),
     )
 
-    class Config:
-        use_enum_values = True
-
 
 class OutputSchema(BaseModel):
     fields: Optional[List[OutputSchemaField]] = None

@@ -187,6 +188,10 @@ class PromptHttpResponse(BaseModel):
 
 
 class PromptPushRequest(BaseModel):
+    model_config = make_model_config(use_enum_values=True)
+
+    model_config = ConfigDict(use_enum_values=True)
+
     alias: str
     text: Optional[str] = None
     messages: Optional[List[PromptMessage]] = None

@@ -203,11 +208,10 @@ class PromptPushRequest(BaseModel):
         default=None, serialization_alias="outputType"
     )
 
-    class Config:
-        use_enum_values = True
-
 
 class PromptUpdateRequest(BaseModel):
+    model_config = make_model_config(use_enum_values=True)
+
     text: Optional[str] = None
     messages: Optional[List[PromptMessage]] = None
     interpolation_type: PromptInterpolationType = Field(

@@ -223,9 +227,6 @@ class PromptUpdateRequest(BaseModel):
         default=None, serialization_alias="outputType"
     )
 
-    class Config:
-        use_enum_values = True
-
 
 class PromptApi(BaseModel):
     id: str
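These hunks (and the prompt/prompt.py hunks below) migrate the pydantic v1-style inner "class Config" to v2-style configuration supplied via model_config, wrapped in the new deepeval helper make_model_config (presumably a thin wrapper over ConfigDict; see deepeval/utils.py +44 -2 in the file list). The plain-pydantic equivalent looks like this; use_enum_values=True makes enum fields store their .value:

from enum import Enum
from pydantic import BaseModel, ConfigDict

class Color(Enum):
    RED = "red"

class Item(BaseModel):
    model_config = ConfigDict(use_enum_values=True)  # pydantic v2 replacement for class Config
    color: Color

print(Item(color=Color.RED).color)   # -> "red" (the value, not the Enum member)
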
deepeval/prompt/prompt.py
CHANGED

@@ -5,11 +5,13 @@ from rich.console import Console
 import time
 import json
 import os
-from pydantic import BaseModel, ValidationError
+from pydantic import BaseModel, ValidationError, ConfigDict
 import asyncio
 import portalocker
 import threading
 
+from deepeval.utils import make_model_config
+
 from deepeval.prompt.api import (
     PromptHttpResponse,
     PromptMessage,

@@ -77,6 +79,8 @@ class CustomEncoder(json.JSONEncoder):
 
 
 class CachedPrompt(BaseModel):
+    model_config = make_model_config(use_enum_values=True)
+
     alias: str
     version: str
     label: Optional[str] = None

@@ -89,9 +93,6 @@ class CachedPrompt(BaseModel):
     output_type: Optional[OutputType]
     output_schema: Optional[OutputSchema]
 
-    class Config:
-        use_enum_values = True
-
 
 class Prompt:
 
deepeval/simulator/conversation_simulator.py
CHANGED

@@ -35,7 +35,6 @@ class ConversationSimulator:
         self,
         model_callback: Callable[[str], str],
         simulator_model: Optional[Union[str, DeepEvalBaseLLM]] = None,
-        opening_message: Optional[str] = None,
         max_concurrent: int = 5,
         async_mode: bool = True,
         language: str = "English",

@@ -45,7 +44,6 @@ class ConversationSimulator:
         self.is_callback_async = inspect.iscoroutinefunction(
             self.model_callback
         )
-        self.opening_message = opening_message
         self.semaphore = asyncio.Semaphore(max_concurrent)
         self.async_mode = async_mode
         self.language = language

@@ -68,6 +66,9 @@ class ConversationSimulator:
         self,
         conversational_goldens: List[ConversationalGolden],
         max_user_simulations: int = 10,
+        on_simulation_complete: Optional[
+            Callable[[ConversationalTestCase, int], None]
+        ] = None,
     ) -> List[ConversationalTestCase]:
         self.simulation_cost = 0 if self.using_native_model else None
 

@@ -87,6 +88,7 @@ class ConversationSimulator:
             self._a_simulate(
                 conversational_goldens=conversational_goldens,
                 max_user_simulations=max_user_simulations,
+                on_simulation_complete=on_simulation_complete,
                 progress=progress,
                 pbar_id=pbar_id,
             )

@@ -103,6 +105,7 @@ class ConversationSimulator:
                 index=conversation_index,
                 progress=progress,
                 pbar_id=pbar_id,
+                on_simulation_complete=on_simulation_complete,
             )
         )
         conversational_test_cases.append(conversational_test_case)

@@ -115,6 +118,9 @@ class ConversationSimulator:
         self,
         conversational_goldens: List[ConversationalGolden],
         max_user_simulations: int,
+        on_simulation_complete: Optional[
+            Callable[[ConversationalTestCase, int], None]
+        ] = None,
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
     ) -> List[ConversationalTestCase]:

@@ -131,6 +137,7 @@ class ConversationSimulator:
                 index=conversation_index,
                 progress=progress,
                 pbar_id=pbar_id,
+                on_simulation_complete=on_simulation_complete,
             )
 
         tasks = [

@@ -150,6 +157,9 @@ class ConversationSimulator:
         index: int,
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
+        on_simulation_complete: Optional[
+            Callable[[ConversationalTestCase, int], None]
+        ] = None,
     ) -> ConversationalTestCase:
         simulation_counter = 0
         if max_user_simulations <= 0:

@@ -166,8 +176,6 @@ class ConversationSimulator:
         user_input = None
         thread_id = str(uuid.uuid4())
         turns: List[Turn] = []
-        if self.opening_message and golden.turns is None:
-            turns.append(Turn(role="assistant", content=self.opening_message))
 
         if golden.turns is not None:
             turns.extend(golden.turns)

@@ -187,11 +195,7 @@ class ConversationSimulator:
             if simulation_counter >= max_user_simulations:
                 update_pbar(progress, pbar_max_user_simluations_id)
                 break
-            if len(turns) == 0 …
-                len(turns) == 1
-                and self.opening_message
-                and golden.turns is None
-            ):
+            if len(turns) == 0:
                 # Generate first user input
                 user_input = self.generate_first_user_input(golden)
                 turns.append(Turn(role="user", content=user_input))

@@ -225,7 +229,7 @@ class ConversationSimulator:
             turns.append(turn)
 
         update_pbar(progress, pbar_id)
-        …
+        conversational_test_case = ConversationalTestCase(
             turns=turns,
             scenario=golden.scenario,
             expected_outcome=golden.expected_outcome,

@@ -241,6 +245,9 @@ class ConversationSimulator:
             _dataset_alias=golden._dataset_alias,
             _dataset_id=golden._dataset_id,
         )
+        if on_simulation_complete:
+            on_simulation_complete(conversational_test_case, index)
+        return conversational_test_case
 
     async def _a_simulate_single_conversation(
         self,

@@ -249,6 +256,9 @@ class ConversationSimulator:
         index: Optional[int] = None,
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
+        on_simulation_complete: Optional[
+            Callable[[ConversationalTestCase, int], None]
+        ] = None,
     ) -> ConversationalTestCase:
         simulation_counter = 0
         if max_user_simulations <= 0:

@@ -265,8 +275,6 @@ class ConversationSimulator:
         user_input = None
         thread_id = str(uuid.uuid4())
         turns: List[Turn] = []
-        if self.opening_message and golden.turns is None:
-            turns.append(Turn(role="assistant", content=self.opening_message))
 
         if golden.turns is not None:
             turns.extend(golden.turns)

@@ -286,11 +294,7 @@ class ConversationSimulator:
             if simulation_counter >= max_user_simulations:
                 update_pbar(progress, pbar_max_user_simluations_id)
                 break
-            if len(turns) == 0 …
-                len(turns) == 1
-                and self.opening_message
-                and golden.turns is None
-            ):
+            if len(turns) == 0:
                 # Generate first user input
                 user_input = await self.a_generate_first_user_input(golden)
                 turns.append(Turn(role="user", content=user_input))

@@ -324,7 +328,7 @@ class ConversationSimulator:
             turns.append(turn)
 
         update_pbar(progress, pbar_id)
-        …
+        conversational_test_case = ConversationalTestCase(
             turns=turns,
             scenario=golden.scenario,
             expected_outcome=golden.expected_outcome,

@@ -340,6 +344,9 @@ class ConversationSimulator:
             _dataset_alias=golden._dataset_alias,
             _dataset_id=golden._dataset_id,
         )
+        if on_simulation_complete:
+            on_simulation_complete(conversational_test_case, index)
+        return conversational_test_case
 
 ############################################
 ### Generate User Inputs ###################
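The simulator drops the opening_message option and gains an on_simulation_complete callback that is invoked with each finished ConversationalTestCase and its index, so callers can persist or evaluate conversations as they complete instead of waiting for the full batch. A hedged usage sketch; the import paths, the public simulate method name, and the ConversationalGolden fields are inferred rather than shown in this diff:

# Sketch only: treat import paths and constructor/method names below as assumptions.
from deepeval.simulator import ConversationSimulator
from deepeval.test_case import ConversationalTestCase
from deepeval.dataset import ConversationalGolden

def handle_simulated(test_case: ConversationalTestCase, index: int) -> None:
    # Invoked once per finished conversation via the new on_simulation_complete hook.
    print(f"conversation #{index}: {len(test_case.turns)} turns")

def my_app(user_message: str) -> str:
    return "..."  # stand-in for the chatbot / agent under test

simulator = ConversationSimulator(model_callback=my_app, async_mode=False)
test_cases = simulator.simulate(
    conversational_goldens=[
        ConversationalGolden(
            scenario="User asks for a refund",          # fields per attributes read in the diff
            expected_outcome="Refund policy is explained",
        ),
    ],
    max_user_simulations=5,
    on_simulation_complete=handle_simulated,
)
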
deepeval/synthesizer/chunking/context_generator.py
CHANGED

@@ -249,8 +249,16 @@ class ContextGenerator:
 
             except Exception as exc:
                 # record and continue with other docs
+                show_trace = bool(get_settings().DEEPEVAL_LOG_STACK_TRACES)
+                exc_info = (
+                    (type(exc), exc, getattr(exc, "__traceback__", None))
+                    if show_trace
+                    else None
+                )
                 logger.exception(
-                    "Document pipeline failed for %s",
+                    "Document pipeline failed for %s",
+                    path,
+                    exc_info=exc_info,
                 )
             finally:
                 # drop the collection asap to avoid too many open collections