deepeval 3.4.8__py3-none-any.whl → 3.4.9__py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their respective public registries.
- deepeval/__init__.py +8 -5
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +561 -727
- deepeval/confident/api.py +29 -14
- deepeval/config/__init__.py +0 -0
- deepeval/config/settings.py +565 -0
- deepeval/config/settings_manager.py +133 -0
- deepeval/config/utils.py +86 -0
- deepeval/dataset/__init__.py +1 -0
- deepeval/dataset/dataset.py +70 -10
- deepeval/dataset/test_run_tracer.py +82 -0
- deepeval/dataset/utils.py +23 -0
- deepeval/key_handler.py +1 -0
- deepeval/metrics/answer_relevancy/template.py +7 -2
- deepeval/metrics/faithfulness/template.py +11 -8
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +6 -4
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +6 -4
- deepeval/metrics/tool_correctness/tool_correctness.py +7 -3
- deepeval/models/llms/amazon_bedrock_model.py +24 -3
- deepeval/models/llms/openai_model.py +37 -41
- deepeval/models/retry_policy.py +280 -0
- deepeval/openai_agents/agent.py +4 -2
- deepeval/test_run/api.py +1 -0
- deepeval/tracing/otel/exporter.py +20 -8
- deepeval/tracing/otel/utils.py +57 -0
- deepeval/tracing/tracing.py +37 -16
- deepeval/tracing/utils.py +98 -1
- deepeval/utils.py +111 -70
- {deepeval-3.4.8.dist-info → deepeval-3.4.9.dist-info}/METADATA +3 -1
- {deepeval-3.4.8.dist-info → deepeval-3.4.9.dist-info}/RECORD +33 -28
- deepeval/env.py +0 -35
- {deepeval-3.4.8.dist-info → deepeval-3.4.9.dist-info}/LICENSE.md +0 -0
- {deepeval-3.4.8.dist-info → deepeval-3.4.9.dist-info}/WHEEL +0 -0
- {deepeval-3.4.8.dist-info → deepeval-3.4.9.dist-info}/entry_points.txt +0 -0
deepeval/models/llms/amazon_bedrock_model.py
CHANGED

@@ -115,13 +115,34 @@ class AmazonBedrockModel(DeepEvalBaseLLM):
     ###############################################

     def get_converse_request_body(self, prompt: str) -> dict:
+        # Inline parameter translation with defaults
+        param_mapping = {
+            "max_tokens": "maxTokens",
+            "top_p": "topP",
+            "top_k": "topK",
+            "stop_sequences": "stopSequences",
+        }
+
+        # Start with defaults for required parameters
+        translated_kwargs = {
+            "maxTokens": self.generation_kwargs.get("max_tokens", 1000),
+            "topP": self.generation_kwargs.get("top_p", 0),
+        }
+
+        # Add any other parameters from generation_kwargs
+        for key, value in self.generation_kwargs.items():
+            if key not in [
+                "max_tokens",
+                "top_p",
+            ]:  # Skip already handled defaults
+                aws_key = param_mapping.get(key, key)
+                translated_kwargs[aws_key] = value
+
         return {
             "messages": [{"role": "user", "content": [{"text": prompt}]}],
             "inferenceConfig": {
                 "temperature": self.temperature,
-
-                "maxTokens": self.generation_kwargs.get("max_tokens", 1000),
-                **self.generation_kwargs,
+                **translated_kwargs,
             },
         }
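To make the effect of the translation concrete, here is a small worked example (not part of the diff; the dict literals are illustrative) showing how snake_case generation_kwargs now map to Bedrock's camelCase inferenceConfig keys:

# Illustrative input: kwargs a user might pass to AmazonBedrockModel.
generation_kwargs = {"max_tokens": 512, "top_k": 40, "stop_sequences": ["END"]}

param_mapping = {
    "max_tokens": "maxTokens",
    "top_p": "topP",
    "top_k": "topK",
    "stop_sequences": "stopSequences",
}

# Same logic as get_converse_request_body above.
translated = {
    "maxTokens": generation_kwargs.get("max_tokens", 1000),
    "topP": generation_kwargs.get("top_p", 0),
}
for key, value in generation_kwargs.items():
    if key not in ["max_tokens", "top_p"]:
        translated[param_mapping.get(key, key)] = value

assert translated == {
    "maxTokens": 512,
    "topP": 0,
    "topK": 40,
    "stopSequences": ["END"],
}

Previously, **self.generation_kwargs was spread into inferenceConfig verbatim, so snake_case keys such as top_k reached Bedrock unchanged.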
deepeval/models/llms/openai_model.py
CHANGED

@@ -1,26 +1,33 @@
+import logging
+
 from openai.types.chat.chat_completion import ChatCompletion
 from deepeval.key_handler import ModelKeyValues, KEY_FILE_HANDLER
 from typing import Optional, Tuple, Union, Dict
-from openai import OpenAI, AsyncOpenAI
 from pydantic import BaseModel
-import logging
-import openai

-from tenacity import (
-    retry,
-    retry_if_exception_type,
-    wait_exponential_jitter,
-    RetryCallState,
+from openai import (
+    OpenAI,
+    AsyncOpenAI,
 )

+from tenacity import retry, RetryCallState, before_sleep_log
+
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.models.llms.utils import trim_and_load_json
 from deepeval.models.utils import parse_model_name
+from deepeval.models.retry_policy import (
+    OPENAI_ERROR_POLICY,
+    default_wait,
+    default_stop,
+    retry_predicate,
+)
+
+logger = logging.getLogger("deepeval.openai_model")


 def log_retry_error(retry_state: RetryCallState):
     exception = retry_state.outcome.exception()
-    logging.error(
+    logger.error(
         f"OpenAI Error: {exception} Retrying: {retry_state.attempt_number} time(s)..."
     )

|
@@ -212,14 +219,22 @@ models_requiring_temperature_1 = [
|
|
|
212
219
|
"gpt-5-chat-latest",
|
|
213
220
|
]
|
|
214
221
|
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
222
|
+
_base_retry_rules_kw = dict(
|
|
223
|
+
wait=default_wait(),
|
|
224
|
+
stop=default_stop(),
|
|
225
|
+
retry=retry_predicate(OPENAI_ERROR_POLICY),
|
|
226
|
+
before_sleep=before_sleep_log(
|
|
227
|
+
logger, logging.INFO
|
|
228
|
+
), # <- logs only on retries
|
|
229
|
+
after=log_retry_error,
|
|
220
230
|
)
|
|
221
231
|
|
|
222
232
|
|
|
233
|
+
def _openai_client_kwargs():
|
|
234
|
+
# Avoid double-retry at SDK layer by disabling the SDK's own retries so tenacity is the single source of truth for retry logic.
|
|
235
|
+
return {"max_retries": 0}
|
|
236
|
+
|
|
237
|
+
|
|
223
238
|
class GPTModel(DeepEvalBaseLLM):
|
|
224
239
|
def __init__(
|
|
225
240
|
self,
|
|
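The five per-method decorators below all collapse into @retry(**_base_retry_rules_kw). For readers unfamiliar with the pattern: tenacity's retry accepts the same keywords whether written inline or unpacked from a shared dict, so one mapping keeps every call site in sync. A minimal standalone sketch (the names here are illustrative, not from the diff):

import logging

from tenacity import retry, stop_after_attempt, wait_fixed, before_sleep_log

logger = logging.getLogger(__name__)

# One shared rule set, mirroring _base_retry_rules_kw above.
_rules = dict(
    wait=wait_fixed(0.1),        # fixed backoff keeps the sketch deterministic
    stop=stop_after_attempt(3),  # at most 3 attempts (2 retries)
    before_sleep=before_sleep_log(logger, logging.INFO),  # logs only when retrying
)

@retry(**_rules)
def flaky_call() -> str:
    # Any raising body; tenacity re-invokes it according to the shared rules.
    raise RuntimeError("transient failure")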
@@ -296,11 +311,7 @@ class GPTModel(DeepEvalBaseLLM):
     # Generate functions
     ###############################################

-    @retry(
-        wait=wait_exponential_jitter(initial=1, exp_base=2, jitter=2, max=10),
-        retry=retry_if_exception_type(retryable_exceptions),
-        after=log_retry_error,
-    )
+    @retry(**_base_retry_rules_kw)
     def generate(
         self, prompt: str, schema: Optional[BaseModel] = None
     ) -> Tuple[Union[str, Dict], float]:
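Note that the consolidated rules are also tunable at runtime: default_wait() and default_stop() (defined in the new deepeval/models/retry_policy.py later in this diff) read DEEPEVAL_RETRY_* environment variables. A hypothetical tuning example; since _base_retry_rules_kw is built at import time, these must be set before the module is imported:

import os

os.environ["DEEPEVAL_RETRY_MAX_ATTEMPTS"] = "4"       # 4 attempts = 3 retries
os.environ["DEEPEVAL_RETRY_INITIAL_SECONDS"] = "0.5"  # first backoff step
os.environ["DEEPEVAL_RETRY_CAP_SECONDS"] = "10"       # ceiling per backoff step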
@@ -359,11 +370,7 @@ class GPTModel(DeepEvalBaseLLM):
         else:
             return output, cost

-    @retry(
-        wait=wait_exponential_jitter(initial=1, exp_base=2, jitter=2, max=10),
-        retry=retry_if_exception_type(retryable_exceptions),
-        after=log_retry_error,
-    )
+    @retry(**_base_retry_rules_kw)
     async def a_generate(
         self, prompt: str, schema: Optional[BaseModel] = None
     ) -> Tuple[Union[str, BaseModel], float]:
@@ -427,11 +434,7 @@ class GPTModel(DeepEvalBaseLLM):
     # Other generate functions
     ###############################################

-    @retry(
-        wait=wait_exponential_jitter(initial=1, exp_base=2, jitter=2, max=10),
-        retry=retry_if_exception_type(retryable_exceptions),
-        after=log_retry_error,
-    )
+    @retry(**_base_retry_rules_kw)
     def generate_raw_response(
         self,
         prompt: str,
@@ -454,11 +457,7 @@ class GPTModel(DeepEvalBaseLLM):

         return completion, cost

-    @retry(
-        wait=wait_exponential_jitter(initial=1, exp_base=2, jitter=2, max=10),
-        retry=retry_if_exception_type(retryable_exceptions),
-        after=log_retry_error,
-    )
+    @retry(**_base_retry_rules_kw)
     async def a_generate_raw_response(
         self,
         prompt: str,
@@ -481,11 +480,7 @@ class GPTModel(DeepEvalBaseLLM):

         return completion, cost

-    @retry(
-        wait=wait_exponential_jitter(initial=1, exp_base=2, jitter=2, max=10),
-        retry=retry_if_exception_type(retryable_exceptions),
-        after=log_retry_error,
-    )
+    @retry(**_base_retry_rules_kw)
     def generate_samples(
         self, prompt: str, n: int, temperature: float
     ) -> Tuple[list[str], float]:
@@ -518,12 +513,13 @@ class GPTModel(DeepEvalBaseLLM):
         return self.model_name

     def load_model(self, async_mode: bool = False):
+        kwargs = {**self.kwargs, **_openai_client_kwargs()}
         if not async_mode:
             return OpenAI(
                 api_key=self._openai_api_key,
                 base_url=self.base_url,
-                **self.kwargs,
+                **kwargs,
             )
         return AsyncOpenAI(
-            api_key=self._openai_api_key, base_url=self.base_url, **self.kwargs
+            api_key=self._openai_api_key, base_url=self.base_url, **kwargs
         )
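Because _openai_client_kwargs() is merged last, its max_retries=0 wins even if a caller supplied their own value, so tenacity alone owns retry behavior. A quick sketch of the merge (the dict literals are illustrative):

user_kwargs = {"timeout": 30, "max_retries": 5}  # hypothetical caller kwargs
kwargs = {**user_kwargs, **{"max_retries": 0}}   # what load_model now builds

# Later keys win in dict merges, so SDK-level retries are always disabled.
assert kwargs == {"timeout": 30, "max_retries": 0}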
deepeval/models/retry_policy.py
ADDED

@@ -0,0 +1,280 @@
+"""Generic retry policy helpers for provider SDKs.
+
+This module lets models define what is a *transient* vs. a *non-retryable*
+(permanent) failure without coupling to a specific SDK. You provide an
+`ErrorPolicy` describing exception classes and special non-retryable error
+codes (such as OpenAI's quota-exhausted), and get back a Tenacity predicate
+suitable for `retry_if_exception`.
+
+Typical use:
+
+    # Import dependencies
+    from tenacity import retry, before_sleep_log
+    from deepeval.models.retry_policy import (
+        OPENAI_ERROR_POLICY, default_wait, default_stop, retry_predicate
+    )
+
+    # Define retry rule keywords
+    _retry_kw = dict(
+        wait=default_wait(),
+        stop=default_stop(),
+        retry=retry_predicate(OPENAI_ERROR_POLICY),
+        before_sleep=before_sleep_log(logger, logging.INFO),  # optional: logs only on retries
+    )
+
+    # Apply retry rule keywords where desired
+    @retry(**_retry_kw)
+    def call_openai(...):
+        ...
+"""
+
+from __future__ import annotations
+
+import logging
+
+from deepeval.utils import read_env_int, read_env_float
+from dataclasses import dataclass, field
+from typing import Iterable, Mapping, Callable, Sequence, Tuple
+from collections.abc import Mapping as ABCMapping
+from tenacity import (
+    wait_exponential_jitter,
+    stop_after_attempt,
+    retry_if_exception,
+)
+
+
+logger = logging.getLogger(__name__)
+
+# --------------------------
+# Policy description
+# --------------------------
+
+
+@dataclass(frozen=True)
+class ErrorPolicy:
+    """Describe exception classes & rules for retry classification.
+
+    Attributes:
+        auth_excs: Exceptions that indicate authentication/authorization problems.
+            These are treated as non-retryable.
+        rate_limit_excs: Exceptions representing rate limiting (HTTP 429).
+        network_excs: Exceptions for timeouts / connection issues (transient).
+        http_excs: Exceptions carrying an integer `status_code` (4xx, 5xx).
+        non_retryable_codes: Error "code" strings that should be considered permanent,
+            such as "insufficient_quota". Used to refine rate-limit handling.
+        retry_5xx: Whether to retry provider 5xx responses (defaults to True).
+        message_markers: Mapping from canonical code to message substrings, used to
+            infer a code when no structured payload is available.
+    """
+
+    auth_excs: Tuple[type[Exception], ...]
+    rate_limit_excs: Tuple[type[Exception], ...]
+    network_excs: Tuple[type[Exception], ...]
+    http_excs: Tuple[type[Exception], ...]
+    non_retryable_codes: frozenset[str] = field(default_factory=frozenset)
+    retry_5xx: bool = True
+    message_markers: Mapping[str, Iterable[str]] = field(default_factory=dict)
+
+
+# --------------------------
+# Extraction helpers
+# --------------------------
+
+
+def extract_error_code(
+    e: Exception,
+    *,
+    response_attr: str = "response",
+    body_attr: str = "body",
+    code_path: Sequence[str] = ("error", "code"),
+    message_markers: Mapping[str, Iterable[str]] | None = None,
+) -> str:
+    """Best-effort extraction of an error 'code' for SDK compatibility.
+
+    Order of attempts:
+    1) Structured JSON via `e.response.json()` (typical HTTP error payload).
+    2) A dict stored on `e.body` (some gateways/proxies use this).
+    3) Message-sniffing fallback, using `message_markers`.
+
+    Args:
+        e: The exception raised by the SDK/provider client.
+        response_attr: Attribute name that holds an HTTP response object.
+        body_attr: Attribute name that may hold a parsed payload (dict).
+        code_path: Path of keys to traverse to the code (e.g., ["error", "code"]).
+        message_markers: Mapping from canonical code -> substrings to search for.
+
+    Returns:
+        The code string if found, else "".
+    """
+    # 1) Structured JSON in e.response.json()
+    resp = getattr(e, response_attr, None)
+    if resp is not None:
+        try:
+            cur = resp.json()
+            for k in code_path:
+                if not isinstance(cur, ABCMapping):
+                    cur = {}
+                    break
+                cur = cur.get(k, {})
+            if isinstance(cur, (str, int)):
+                return str(cur)
+        except Exception:
+            # response.json() can raise; ignore and fall through
+            pass
+
+    # 2) SDK-provided dict body
+    body = getattr(e, body_attr, None)
+    if isinstance(body, ABCMapping):
+        cur = body
+        for k in code_path:
+            if not isinstance(cur, ABCMapping):
+                cur = {}
+                break
+            cur = cur.get(k, {})
+        if isinstance(cur, (str, int)):
+            return str(cur)
+
+    # 3) Message sniffing, to catch codes that slip past the two structured parsers
+    msg = str(e).lower()
+    markers = message_markers or {}
+    for code_key, needles in markers.items():
+        if any(n in msg for n in needles):
+            return code_key
+
+    return ""
+
+
+# --------------------------
+# Predicate factory
+# --------------------------
+
+
+def make_is_transient(
+    policy: ErrorPolicy,
+    *,
+    message_markers: Mapping[str, Iterable[str]] | None = None,
+    extra_non_retryable_codes: Iterable[str] = (),
+) -> Callable[[Exception], bool]:
+    """Create a Tenacity predicate: True = retry, False = surface immediately.
+
+    Semantics:
+    - Auth errors: non-retryable.
+    - Rate-limit errors: retry unless the extracted code is in the non-retryable set.
+    - Network/timeout errors: retry.
+    - HTTP errors with a `status_code`: retry 5xx if `policy.retry_5xx` is True.
+    - Everything else: treated as non-retryable.
+
+    Args:
+        policy: An ErrorPolicy describing error classes and rules.
+        message_markers: Optional override/extension for code inference via message text.
+        extra_non_retryable_codes: Additional code strings to treat as non-retryable.
+
+    Returns:
+        A callable `predicate(e) -> bool` suitable for `retry_if_exception`.
+    """
+    non_retryable = frozenset(policy.non_retryable_codes) | frozenset(
+        extra_non_retryable_codes
+    )
+
+    def _pred(e: Exception) -> bool:
+        if isinstance(e, policy.auth_excs):
+            return False
+
+        if isinstance(e, policy.rate_limit_excs):
+            code = extract_error_code(
+                e, message_markers=(message_markers or policy.message_markers)
+            )
+            return code not in non_retryable
+
+        if isinstance(e, policy.network_excs):
+            return True
+
+        if isinstance(e, policy.http_excs):
+            try:
+                sc = int(getattr(e, "status_code", 0))
+            except Exception:
+                sc = 0
+            return policy.retry_5xx and 500 <= sc < 600
+
+        return False
+
+    return _pred
+
+
+# --------------------------
+# Tenacity convenience
+# --------------------------
+
+
+def default_wait():
+    """Default backoff: exponential with jitter, capped.
+
+    Overridable via env:
+    - DEEPEVAL_RETRY_INITIAL_SECONDS (>=0)
+    - DEEPEVAL_RETRY_EXP_BASE (>=1)
+    - DEEPEVAL_RETRY_JITTER (>=0)
+    - DEEPEVAL_RETRY_CAP_SECONDS (>=0)
+    """
+    initial = read_env_float(
+        "DEEPEVAL_RETRY_INITIAL_SECONDS", 1.0, min_value=0.0
+    )
+    exp_base = read_env_float("DEEPEVAL_RETRY_EXP_BASE", 2.0, min_value=1.0)
+    jitter = read_env_float("DEEPEVAL_RETRY_JITTER", 2.0, min_value=0.0)
+    cap = read_env_float("DEEPEVAL_RETRY_CAP_SECONDS", 5.0, min_value=0.0)
+    return wait_exponential_jitter(
+        initial=initial, exp_base=exp_base, jitter=jitter, max=cap
+    )
+
+
+def default_stop():
+    """Default stop condition: at most N attempts (N-1 retries).
+
+    Overridable via env:
+    - DEEPEVAL_RETRY_MAX_ATTEMPTS (>=1)
+    """
+    attempts = read_env_int("DEEPEVAL_RETRY_MAX_ATTEMPTS", 2, min_value=1)
+    return stop_after_attempt(attempts)
+
+
+def retry_predicate(policy: ErrorPolicy, **kw):
+    """Build a Tenacity `retry=` argument from a policy.
+
+    Example:
+        retry=retry_predicate(OPENAI_ERROR_POLICY, extra_non_retryable_codes=["some_code"])
+    """
+    return retry_if_exception(make_is_transient(policy, **kw))
+
+
+# --------------------------
+# Built-in policies
+# --------------------------
+OPENAI_MESSAGE_MARKERS: dict[str, tuple[str, ...]] = {
+    "insufficient_quota": ("insufficient_quota", "exceeded your current quota"),
+}
+
+try:
+    from openai import (
+        AuthenticationError,
+        RateLimitError,
+        APIConnectionError,
+        APITimeoutError,
+        APIStatusError,
+    )
+
+    OPENAI_ERROR_POLICY = ErrorPolicy(
+        auth_excs=(AuthenticationError,),
+        rate_limit_excs=(RateLimitError,),
+        network_excs=(APIConnectionError, APITimeoutError),
+        http_excs=(APIStatusError,),
+        non_retryable_codes=frozenset({"insufficient_quota"}),
+        message_markers=OPENAI_MESSAGE_MARKERS,
+    )
+except Exception:  # pragma: no cover - OpenAI may not be installed in some envs
+    OPENAI_ERROR_POLICY = None
+
+
+__all__ = [
+    "ErrorPolicy",
+    "extract_error_code",
+    "make_is_transient",
+    "default_wait",
+    "default_stop",
+    "retry_predicate",
+    "OPENAI_MESSAGE_MARKERS",
+    "OPENAI_ERROR_POLICY",
+]
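Because ErrorPolicy is SDK-agnostic, the same helpers can back other providers. A minimal sketch (hypothetical, not part of the diff) that exercises the predicate directly with stand-in exception classes:

from deepeval.models.retry_policy import ErrorPolicy, make_is_transient

# Stand-in exception types; a real policy would use the provider SDK's classes.
class FakeAuthError(Exception): ...
class FakeRateLimitError(Exception): ...
class FakeTimeoutError(Exception): ...

policy = ErrorPolicy(
    auth_excs=(FakeAuthError,),
    rate_limit_excs=(FakeRateLimitError,),
    network_excs=(FakeTimeoutError,),
    http_excs=(),
    non_retryable_codes=frozenset({"insufficient_quota"}),
    message_markers={"insufficient_quota": ("exceeded your current quota",)},
)

is_transient = make_is_transient(policy)
assert is_transient(FakeTimeoutError()) is True   # network: retry
assert is_transient(FakeAuthError()) is False     # auth: surface immediately
assert is_transient(FakeRateLimitError("429 slow down")) is True  # plain 429: retry
assert not is_transient(
    FakeRateLimitError("You exceeded your current quota")
)  # quota exhaustion is permanent, caught by message sniffing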
deepeval/openai_agents/agent.py
CHANGED

@@ -1,7 +1,7 @@
 from __future__ import annotations

 from dataclasses import dataclass, field, replace
-from typing import Any, Optional, Awaitable, Callable
+from typing import Any, Optional, Awaitable, Callable, Generic, TypeVar

 from deepeval.tracing import observe
 from deepeval.prompt import Prompt

@@ -14,6 +14,8 @@ except Exception as e:
         "openai-agents is required for this integration. Please install it."
     ) from e

+TContext = TypeVar("TContext")
+

 class _ObservedModel(Model):
     def __init__(

@@ -153,7 +155,7 @@ class _ObservedProvider(ModelProvider):


 @dataclass
-class DeepEvalAgent(BaseAgent[TContext]):
+class DeepEvalAgent(BaseAgent[TContext], Generic[TContext]):
     """
     A subclass of agents.Agent that accepts `metrics` and `metric_collection`
     and ensures the underlying model's `get_response` is wrapped with deepeval.observe.
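With the Generic[TContext] mix-in, the agent subclass can be parameterized the same way as agents.Agent. A hypothetical usage sketch (the context class and constructor arguments shown are illustrative; metrics/metric_collection are the deepeval additions described in the docstring):

from dataclasses import dataclass

from deepeval.openai_agents.agent import DeepEvalAgent

@dataclass
class SupportContext:  # hypothetical per-run context object
    user_id: str

# Type checkers now accept the subscript, mirroring agents.Agent[TContext].
agent: DeepEvalAgent[SupportContext] = DeepEvalAgent(
    name="support-agent",
    instructions="Answer support questions.",
    metric_collection="my-collection",  # deepeval-specific field from this diff
)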
deepeval/test_run/api.py
CHANGED

@@ -50,6 +50,7 @@ class LLMApiTestCase(BaseModel):
     trace: Optional[TraceApi] = Field(None)

     model_config = ConfigDict(arbitrary_types_allowed=True)
+    # metric_collection: Optional[str] = Field(None, alias="metricCollection")

     def update_metric_data(self, metric_data: MetricData):
         if self.metrics_data is None:
deepeval/tracing/otel/exporter.py
CHANGED

@@ -30,6 +30,7 @@ from deepeval.tracing.otel.utils import (
     to_hex_string,
     parse_string,
     parse_list_of_strings,
+    post_test_run,
 )
 from deepeval.tracing import perf_epoch_bridge as peb
 from deepeval.tracing.types import TraceAttributes

@@ -80,7 +81,8 @@ class ConfidentSpanExporter(SpanExporter):
         self,
         spans: typing.Sequence[ReadableSpan],
         timeout_millis: int = 30000,
-        api_key: Optional[str] = None,  # dynamic api key
+        api_key: Optional[str] = None,  # dynamic api key
+        _test_run_id: Optional[str] = None,
     ) -> SpanExportResult:
         # build forest of spans
         forest = self._build_span_forest(spans)

@@ -223,14 +225,24 @@ class ConfidentSpanExporter(SpanExporter):
             trace_manager.add_span_to_trace(base_span_wrapper.base_span)
             # no removing span because it can be parent of other spans

-        # safely end all active traces
+        # safely end all active traces or return them for test runs
         active_traces_keys = list(trace_manager.active_traces.keys())
-        for trace_key in active_traces_keys:
-            set_trace_time(trace_manager.get_trace_by_uuid(trace_key))
-            trace_manager.end_trace(trace_key)
-        trace_manager.clear_traces()
-
-        return SpanExportResult.SUCCESS
+        if _test_run_id:
+            traces = []
+            for trace_key in active_traces_keys:
+                set_trace_time(trace_manager.get_trace_by_uuid(trace_key))
+                trace = trace_manager.get_trace_by_uuid(trace_key)
+                if trace:
+                    traces.append(trace)
+            trace_manager.clear_traces()
+            post_test_run(traces, _test_run_id)
+            return SpanExportResult.SUCCESS
+        else:
+            for trace_key in active_traces_keys:
+                set_trace_time(trace_manager.get_trace_by_uuid(trace_key))
+                trace_manager.end_trace(trace_key)
+            trace_manager.clear_traces()
+            return SpanExportResult.SUCCESS

     def _convert_readable_span_to_base_span(
         self, span: ReadableSpan
deepeval/tracing/otel/utils.py
CHANGED

@@ -1,5 +1,6 @@
 from typing import List, Optional, Tuple, Any
 from deepeval.tracing.types import Trace, LLMTestCase, ToolCall
+from deepeval.tracing import trace_manager, BaseSpan
 from opentelemetry.sdk.trace.export import ReadableSpan
 import json

@@ -250,3 +251,59 @@ def parse_list_of_strings(context: List[str]) -> List[str]:
         else:
             parsed_context.append(context_str)
     return parsed_context
+
+
+from deepeval.evaluate.utils import create_api_test_case
+from deepeval.test_run.api import LLMApiTestCase
+from deepeval.test_run.test_run import global_test_run_manager
+from typing import Optional
+
+
+def post_test_run(traces: List[Trace], test_run_id: Optional[str]):
+    # Accept single trace or list of traces
+    if isinstance(traces, Trace):
+        traces = [traces]
+
+    api_test_cases: List[LLMApiTestCase] = []
+
+    # Collect test cases from spans that have metric_collection
+    for trace in traces:
+        trace_api = trace_manager.create_trace_api(trace)
+
+        def dfs(span: BaseSpan):
+            if span.metric_collection:
+                llm_test_case = LLMTestCase(
+                    input=str(span.input),
+                    actual_output=(
+                        str(span.output) if span.output is not None else None
+                    ),
+                    expected_output=span.expected_output,
+                    context=span.context,
+                    retrieval_context=span.retrieval_context,
+                    tools_called=span.tools_called,
+                    expected_tools=span.expected_tools,
+                )
+                api_case = create_api_test_case(
+                    test_case=llm_test_case,
+                    trace=trace_api,
+                    index=None,
+                )
+                if isinstance(api_case, LLMApiTestCase):
+                    api_case.metric_collection = span.metric_collection
+                    api_test_cases.append(api_case)
+
+            for child in span.children or []:
+                dfs(child)
+
+        for root in trace.root_spans:
+            dfs(root)
+
+    # Prepare and post TestRun using the global test run manager
+    test_run_manager = global_test_run_manager
+    test_run_manager.create_test_run(identifier=test_run_id)
+    test_run = test_run_manager.get_test_run()
+
+    for case in api_test_cases:
+        test_run.add_test_case(case)
+
+    # return test_run_manager.post_test_run(test_run)  TODO: add after test run with metric collection is implemented