deepeval 3.4.8__py3-none-any.whl → 3.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. deepeval/__init__.py +8 -5
  2. deepeval/_version.py +1 -1
  3. deepeval/benchmarks/drop/drop.py +2 -3
  4. deepeval/benchmarks/hellaswag/hellaswag.py +2 -2
  5. deepeval/benchmarks/logi_qa/logi_qa.py +2 -2
  6. deepeval/benchmarks/math_qa/math_qa.py +2 -2
  7. deepeval/benchmarks/mmlu/mmlu.py +2 -2
  8. deepeval/benchmarks/truthful_qa/truthful_qa.py +2 -2
  9. deepeval/cli/main.py +561 -727
  10. deepeval/confident/api.py +30 -14
  11. deepeval/config/__init__.py +0 -0
  12. deepeval/config/settings.py +565 -0
  13. deepeval/config/settings_manager.py +133 -0
  14. deepeval/config/utils.py +86 -0
  15. deepeval/dataset/__init__.py +1 -0
  16. deepeval/dataset/dataset.py +70 -10
  17. deepeval/dataset/test_run_tracer.py +82 -0
  18. deepeval/dataset/utils.py +23 -0
  19. deepeval/integrations/pydantic_ai/__init__.py +2 -4
  20. deepeval/integrations/pydantic_ai/{setup.py → otel.py} +0 -8
  21. deepeval/integrations/pydantic_ai/patcher.py +376 -0
  22. deepeval/key_handler.py +1 -0
  23. deepeval/metrics/answer_relevancy/template.py +7 -2
  24. deepeval/metrics/faithfulness/template.py +11 -8
  25. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +6 -4
  26. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +6 -4
  27. deepeval/metrics/tool_correctness/tool_correctness.py +7 -3
  28. deepeval/models/llms/amazon_bedrock_model.py +24 -3
  29. deepeval/models/llms/grok_model.py +1 -1
  30. deepeval/models/llms/kimi_model.py +1 -1
  31. deepeval/models/llms/openai_model.py +37 -41
  32. deepeval/models/retry_policy.py +280 -0
  33. deepeval/openai_agents/agent.py +4 -2
  34. deepeval/test_run/api.py +1 -0
  35. deepeval/tracing/otel/exporter.py +20 -8
  36. deepeval/tracing/otel/utils.py +57 -0
  37. deepeval/tracing/perf_epoch_bridge.py +4 -4
  38. deepeval/tracing/tracing.py +37 -16
  39. deepeval/tracing/utils.py +98 -1
  40. deepeval/utils.py +111 -70
  41. {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/METADATA +16 -13
  42. {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/RECORD +45 -40
  43. deepeval/env.py +0 -35
  44. deepeval/integrations/pydantic_ai/agent.py +0 -364
  45. {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/LICENSE.md +0 -0
  46. {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/WHEEL +0 -0
  47. {deepeval-3.4.8.dist-info → deepeval-3.5.0.dist-info}/entry_points.txt +0 -0
@@ -1,26 +1,33 @@
1
+ import logging
2
+
1
3
  from openai.types.chat.chat_completion import ChatCompletion
2
4
  from deepeval.key_handler import ModelKeyValues, KEY_FILE_HANDLER
3
5
  from typing import Optional, Tuple, Union, Dict
4
- from openai import OpenAI, AsyncOpenAI
5
6
  from pydantic import BaseModel
6
- import logging
7
- import openai
8
7
 
9
- from tenacity import (
10
- retry,
11
- retry_if_exception_type,
12
- wait_exponential_jitter,
13
- RetryCallState,
8
+ from openai import (
9
+ OpenAI,
10
+ AsyncOpenAI,
14
11
  )
15
12
 
13
+ from tenacity import retry, RetryCallState, before_sleep_log
14
+
16
15
  from deepeval.models import DeepEvalBaseLLM
17
16
  from deepeval.models.llms.utils import trim_and_load_json
18
17
  from deepeval.models.utils import parse_model_name
18
+ from deepeval.models.retry_policy import (
19
+ OPENAI_ERROR_POLICY,
20
+ default_wait,
21
+ default_stop,
22
+ retry_predicate,
23
+ )
24
+
25
+ logger = logging.getLogger("deepeval.openai_model")
19
26
 
20
27
 
21
28
  def log_retry_error(retry_state: RetryCallState):
22
29
  exception = retry_state.outcome.exception()
23
- logging.error(
30
+ logger.error(
24
31
  f"OpenAI Error: {exception} Retrying: {retry_state.attempt_number} time(s)..."
25
32
  )
26
33
 
@@ -212,14 +219,22 @@ models_requiring_temperature_1 = [
212
219
  "gpt-5-chat-latest",
213
220
  ]
214
221
 
215
- retryable_exceptions = (
216
- openai.RateLimitError,
217
- openai.APIConnectionError,
218
- openai.APITimeoutError,
219
- openai.LengthFinishReasonError,
222
+ _base_retry_rules_kw = dict(
223
+ wait=default_wait(),
224
+ stop=default_stop(),
225
+ retry=retry_predicate(OPENAI_ERROR_POLICY),
226
+ before_sleep=before_sleep_log(
227
+ logger, logging.INFO
228
+ ), # <- logs only on retries
229
+ after=log_retry_error,
220
230
  )
221
231
 
222
232
 
233
+ def _openai_client_kwargs():
234
+ # Avoid double-retry at SDK layer by disabling the SDK's own retries so tenacity is the single source of truth for retry logic.
235
+ return {"max_retries": 0}
236
+
237
+
223
238
  class GPTModel(DeepEvalBaseLLM):
224
239
  def __init__(
225
240
  self,
@@ -296,11 +311,7 @@ class GPTModel(DeepEvalBaseLLM):
296
311
  # Generate functions
297
312
  ###############################################
298
313
 
299
- @retry(
300
- wait=wait_exponential_jitter(initial=1, exp_base=2, jitter=2, max=10),
301
- retry=retry_if_exception_type(retryable_exceptions),
302
- after=log_retry_error,
303
- )
314
+ @retry(**_base_retry_rules_kw)
304
315
  def generate(
305
316
  self, prompt: str, schema: Optional[BaseModel] = None
306
317
  ) -> Tuple[Union[str, Dict], float]:
@@ -359,11 +370,7 @@ class GPTModel(DeepEvalBaseLLM):
359
370
  else:
360
371
  return output, cost
361
372
 
362
- @retry(
363
- wait=wait_exponential_jitter(initial=1, exp_base=2, jitter=2, max=10),
364
- retry=retry_if_exception_type(retryable_exceptions),
365
- after=log_retry_error,
366
- )
373
+ @retry(**_base_retry_rules_kw)
367
374
  async def a_generate(
368
375
  self, prompt: str, schema: Optional[BaseModel] = None
369
376
  ) -> Tuple[Union[str, BaseModel], float]:
@@ -427,11 +434,7 @@ class GPTModel(DeepEvalBaseLLM):
427
434
  # Other generate functions
428
435
  ###############################################
429
436
 
430
- @retry(
431
- wait=wait_exponential_jitter(initial=1, exp_base=2, jitter=2, max=10),
432
- retry=retry_if_exception_type(retryable_exceptions),
433
- after=log_retry_error,
434
- )
437
+ @retry(**_base_retry_rules_kw)
435
438
  def generate_raw_response(
436
439
  self,
437
440
  prompt: str,
@@ -454,11 +457,7 @@ class GPTModel(DeepEvalBaseLLM):
454
457
 
455
458
  return completion, cost
456
459
 
457
- @retry(
458
- wait=wait_exponential_jitter(initial=1, exp_base=2, jitter=2, max=10),
459
- retry=retry_if_exception_type(retryable_exceptions),
460
- after=log_retry_error,
461
- )
460
+ @retry(**_base_retry_rules_kw)
462
461
  async def a_generate_raw_response(
463
462
  self,
464
463
  prompt: str,
@@ -481,11 +480,7 @@ class GPTModel(DeepEvalBaseLLM):
481
480
 
482
481
  return completion, cost
483
482
 
484
- @retry(
485
- wait=wait_exponential_jitter(initial=1, exp_base=2, jitter=2, max=10),
486
- retry=retry_if_exception_type(retryable_exceptions),
487
- after=log_retry_error,
488
- )
483
+ @retry(**_base_retry_rules_kw)
489
484
  def generate_samples(
490
485
  self, prompt: str, n: int, temperature: float
491
486
  ) -> Tuple[list[str], float]:
@@ -518,12 +513,13 @@ class GPTModel(DeepEvalBaseLLM):
518
513
  return self.model_name
519
514
 
520
515
  def load_model(self, async_mode: bool = False):
516
+ kwargs = {**self.kwargs, **_openai_client_kwargs()}
521
517
  if not async_mode:
522
518
  return OpenAI(
523
519
  api_key=self._openai_api_key,
524
520
  base_url=self.base_url,
525
- **self.kwargs,
521
+ **kwargs,
526
522
  )
527
523
  return AsyncOpenAI(
528
- api_key=self._openai_api_key, base_url=self.base_url, **self.kwargs
524
+ api_key=self._openai_api_key, base_url=self.base_url, **kwargs
529
525
  )
@@ -0,0 +1,280 @@
1
+ """Generic retry policy helpers for provider SDKs.
2
+
3
+ This module lets models define *what is transient* vs *non-retryable* (permanent) failure
4
+ without coupling to a specific SDK. You provide an `ErrorPolicy` describing
5
+ exception classes and special “non-retryable” error codes, such as quota-exhausted from OpenAI,
6
+ and get back a Tenacity predicate suitable for `retry_if_exception`.
7
+
8
+ Typical use:
9
+
10
+ # Import dependencies
11
+ from tenacity import retry, before_sleep_log
12
+ from deepeval.models.retry_policy import (
13
+ OPENAI_ERROR_POLICY, default_wait, default_stop, retry_predicate
14
+ )
15
+
16
+ # Define retry rule keywords
17
+ _retry_kw = dict(
18
+ wait=default_wait(),
19
+ stop=default_stop(),
20
+ retry=retry_predicate(OPENAI_ERROR_POLICY),
21
+ before_sleep=before_sleep_log(logger, logging.INFO), # <- Optional: logs only on retries
22
+ )
23
+
24
+ # Apply retry rule keywords where desired
25
+ @retry(**_retry_kw)
26
+ def call_openai(...):
27
+ ...
28
+ """
29
+
30
+ from __future__ import annotations
31
+
32
+ import logging
33
+
34
+ from deepeval.utils import read_env_int, read_env_float
35
+ from dataclasses import dataclass, field
36
+ from typing import Iterable, Mapping, Callable, Sequence, Tuple
37
+ from collections.abc import Mapping as ABCMapping
38
+ from tenacity import (
39
+ wait_exponential_jitter,
40
+ stop_after_attempt,
41
+ retry_if_exception,
42
+ )
43
+
44
+
45
+ logger = logging.getLogger(__name__)
46
+
47
+ # --------------------------
48
+ # Policy description
49
+ # --------------------------
50
+
51
+
52
+ @dataclass(frozen=True)
53
+ class ErrorPolicy:
54
+ """Describe exception classes & rules for retry classification.
55
+
56
+ Attributes:
57
+ auth_excs: Exceptions that indicate authentication/authorization problems.
58
+ These are treated as non-retryable.
59
+ rate_limit_excs: Exceptions representing rate limiting (HTTP 429).
60
+ network_excs: Exceptions for timeouts / connection issues (transient).
61
+ http_excs: Exceptions carrying an integer `status_code` (4xx, 5xx)
62
+ non_retryable_codes: Error “code” strings that should be considered permanent,
63
+ such as "insufficient_quota". Used to refine rate-limit handling.
64
+ retry_5xx: Whether to retry provider 5xx responses (defaults to True).
65
+ """
66
+
67
+ auth_excs: Tuple[type[Exception], ...]
68
+ rate_limit_excs: Tuple[type[Exception], ...]
69
+ network_excs: Tuple[type[Exception], ...]
70
+ http_excs: Tuple[type[Exception], ...]
71
+ non_retryable_codes: frozenset[str] = field(default_factory=frozenset)
72
+ retry_5xx: bool = True
73
+ message_markers: Mapping[str, Iterable[str]] = field(default_factory=dict)
74
+
75
+
76
+ # --------------------------
77
+ # Extraction helpers
78
+ # --------------------------
79
+
80
+
81
+ def extract_error_code(
82
+ e: Exception,
83
+ *,
84
+ response_attr: str = "response",
85
+ body_attr: str = "body",
86
+ code_path: Sequence[str] = ("error", "code"),
87
+ message_markers: Mapping[str, Iterable[str]] | None = None,
88
+ ) -> str:
89
+ """Best effort extraction of an error 'code' for SDK compatibility.
90
+
91
+ Order of attempts:
92
+ 1) Structured JSON via `e.response.json()` (typical HTTP error payload).
93
+ 2) A dict stored on `e.body` (some gateways/proxies use this).
94
+ 3) Message sniffing fallback, using `message_markers`.
95
+
96
+ Args:
97
+ e: The exception raised by the SDK/provider client.
98
+ response_attr: Attribute name that holds an HTTP response object.
99
+ body_attr: Attribute name that may hold a parsed payload (dict).
100
+ code_path: Path of keys to traverse to the code (e.g., ["error", "code"]).
101
+ message_markers: Mapping from canonical code -> substrings to search for.
102
+
103
+ Returns:
104
+ The code string if found, else "".
105
+ """
106
+ # 1) Structured JSON in e.response.json()
107
+ resp = getattr(e, response_attr, None)
108
+ if resp is not None:
109
+ try:
110
+ cur = resp.json()
111
+ for k in code_path:
112
+ if not isinstance(cur, ABCMapping):
113
+ cur = {}
114
+ break
115
+ cur = cur.get(k, {})
116
+ if isinstance(cur, (str, int)):
117
+ return str(cur)
118
+ except Exception:
119
+ # response.json() can raise; ignore and fall through
120
+ pass
121
+
122
+ # 2) SDK provided dict body
123
+ body = getattr(e, body_attr, None)
124
+ if isinstance(body, ABCMapping):
125
+ cur = body
126
+ for k in code_path:
127
+ if not isinstance(cur, ABCMapping):
128
+ cur = {}
129
+ break
130
+ cur = cur.get(k, {})
131
+ if isinstance(cur, (str, int)):
132
+ return str(cur)
133
+
134
+ # 3) Message sniff (hopefully this helps catch message codes that slip past the previous 2 parsers)
135
+ msg = str(e).lower()
136
+ markers = message_markers or {}
137
+ for code_key, needles in markers.items():
138
+ if any(n in msg for n in needles):
139
+ return code_key
140
+
141
+ return ""
142
+
143
+
144
+ # --------------------------
145
+ # Predicate factory
146
+ # --------------------------
147
+
148
+
149
+ def make_is_transient(
150
+ policy: ErrorPolicy,
151
+ *,
152
+ message_markers: Mapping[str, Iterable[str]] | None = None,
153
+ extra_non_retryable_codes: Iterable[str] = (),
154
+ ) -> Callable[[Exception], bool]:
155
+ """Create a Tenacity predicate: True = retry, False = surface immediately.
156
+
157
+ Semantics:
158
+ - Auth errors: non-retryable.
159
+ - Rate limit errors: retry unless the extracted code is in the non-retryable set
160
+ - Network/timeout errors: retry.
161
+ - HTTP errors with a `status_code`: retry 5xx if `policy.retry_5xx` is True.
162
+ - Everything else: treated as non-retryable.
163
+
164
+ Args:
165
+ policy: An ErrorPolicy describing error classes and rules.
166
+ message_markers: Optional override/extension for code inference via message text.
167
+ extra_non_retryable_codes: Additional code strings to treat as non-retryable.
168
+
169
+ Returns:
170
+ A callable `predicate(e) -> bool` suitable for `retry_if_exception`.
171
+ """
172
+ non_retryable = frozenset(policy.non_retryable_codes) | frozenset(
173
+ extra_non_retryable_codes
174
+ )
175
+
176
+ def _pred(e: Exception) -> bool:
177
+ if isinstance(e, policy.auth_excs):
178
+ return False
179
+
180
+ if isinstance(e, policy.rate_limit_excs):
181
+ code = extract_error_code(
182
+ e, message_markers=(message_markers or policy.message_markers)
183
+ )
184
+ return code not in non_retryable
185
+
186
+ if isinstance(e, policy.network_excs):
187
+ return True
188
+
189
+ if isinstance(e, policy.http_excs):
190
+ try:
191
+ sc = int(getattr(e, "status_code", 0))
192
+ except Exception:
193
+ sc = 0
194
+ return policy.retry_5xx and 500 <= sc < 600
195
+
196
+ return False
197
+
198
+ return _pred
199
+
200
+
201
+ # --------------------------
202
+ # Tenacity convenience
203
+ # --------------------------
204
+
205
+
206
+ def default_wait():
207
+ """Default backoff: exponential with jitter, capped.
208
+ Overridable via env:
209
+ - DEEPEVAL_RETRY_INITIAL_SECONDS (>=0)
210
+ - DEEPEVAL_RETRY_EXP_BASE (>=1)
211
+ - DEEPEVAL_RETRY_JITTER (>=0)
212
+ - DEEPEVAL_RETRY_CAP_SECONDS (>=0)
213
+ """
214
+ initial = read_env_float(
215
+ "DEEPEVAL_RETRY_INITIAL_SECONDS", 1.0, min_value=0.0
216
+ )
217
+ exp_base = read_env_float("DEEPEVAL_RETRY_EXP_BASE", 2.0, min_value=1.0)
218
+ jitter = read_env_float("DEEPEVAL_RETRY_JITTER", 2.0, min_value=0.0)
219
+ cap = read_env_float("DEEPEVAL_RETRY_CAP_SECONDS", 5.0, min_value=0.0)
220
+ return wait_exponential_jitter(
221
+ initial=initial, exp_base=exp_base, jitter=jitter, max=cap
222
+ )
223
+
224
+
225
+ def default_stop():
226
+ """Default stop condition: at most N attempts (N-1 retries).
227
+ Overridable via env:
228
+ - DEEPEVAL_RETRY_MAX_ATTEMPTS (>=1)
229
+ """
230
+ attempts = read_env_int("DEEPEVAL_RETRY_MAX_ATTEMPTS", 2, min_value=1)
231
+ return stop_after_attempt(attempts)
232
+
233
+
234
+ def retry_predicate(policy: ErrorPolicy, **kw):
235
+ """Build a Tenacity `retry=` argument from a policy.
236
+
237
+ Example:
238
+ retry=retry_predicate(OPENAI_ERROR_POLICY, extra_non_retryable_codes=["some_code"])
239
+ """
240
+ return retry_if_exception(make_is_transient(policy, **kw))
241
+
242
+
243
+ # --------------------------
244
+ # Built-in policies
245
+ # --------------------------
246
+ OPENAI_MESSAGE_MARKERS: dict[str, tuple[str, ...]] = {
247
+ "insufficient_quota": ("insufficient_quota", "exceeded your current quota"),
248
+ }
249
+
250
+ try:
251
+ from openai import (
252
+ AuthenticationError,
253
+ RateLimitError,
254
+ APIConnectionError,
255
+ APITimeoutError,
256
+ APIStatusError,
257
+ )
258
+
259
+ OPENAI_ERROR_POLICY = ErrorPolicy(
260
+ auth_excs=(AuthenticationError,),
261
+ rate_limit_excs=(RateLimitError,),
262
+ network_excs=(APIConnectionError, APITimeoutError),
263
+ http_excs=(APIStatusError,),
264
+ non_retryable_codes=frozenset({"insufficient_quota"}),
265
+ message_markers=OPENAI_MESSAGE_MARKERS,
266
+ )
267
+ except Exception: # pragma: no cover - OpenAI may not be installed in some envs
268
+ OPENAI_ERROR_POLICY = None
269
+
270
+
271
+ __all__ = [
272
+ "ErrorPolicy",
273
+ "extract_error_code",
274
+ "make_is_transient",
275
+ "default_wait",
276
+ "default_stop",
277
+ "retry_predicate",
278
+ "OPENAI_MESSAGE_MARKERS",
279
+ "OPENAI_ERROR_POLICY",
280
+ ]
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from dataclasses import dataclass, field, replace
4
- from typing import Any, Optional, Awaitable, Callable
4
+ from typing import Any, Optional, Awaitable, Callable, Generic, TypeVar
5
5
 
6
6
  from deepeval.tracing import observe
7
7
  from deepeval.prompt import Prompt
@@ -14,6 +14,8 @@ except Exception as e:
14
14
  "openai-agents is required for this integration. Please install it."
15
15
  ) from e
16
16
 
17
+ TContext = TypeVar("TContext")
18
+
17
19
 
18
20
  class _ObservedModel(Model):
19
21
  def __init__(
@@ -153,7 +155,7 @@ class _ObservedProvider(ModelProvider):
153
155
 
154
156
 
155
157
  @dataclass
156
- class DeepEvalAgent(BaseAgent[Any]):
158
+ class DeepEvalAgent(BaseAgent[TContext], Generic[TContext]):
157
159
  """
158
160
  A subclass of agents.Agent that accepts `metrics` and `metric_collection`
159
161
  and ensures the underlying model's `get_response` is wrapped with deepeval.observe.
deepeval/test_run/api.py CHANGED
@@ -50,6 +50,7 @@ class LLMApiTestCase(BaseModel):
50
50
  trace: Optional[TraceApi] = Field(None)
51
51
 
52
52
  model_config = ConfigDict(arbitrary_types_allowed=True)
53
+ # metric_collection: Optional[str] = Field(None, alias="metricCollection")
53
54
 
54
55
  def update_metric_data(self, metric_data: MetricData):
55
56
  if self.metrics_data is None:
@@ -30,6 +30,7 @@ from deepeval.tracing.otel.utils import (
30
30
  to_hex_string,
31
31
  parse_string,
32
32
  parse_list_of_strings,
33
+ post_test_run,
33
34
  )
34
35
  from deepeval.tracing import perf_epoch_bridge as peb
35
36
  from deepeval.tracing.types import TraceAttributes
@@ -80,7 +81,8 @@ class ConfidentSpanExporter(SpanExporter):
80
81
  self,
81
82
  spans: typing.Sequence[ReadableSpan],
82
83
  timeout_millis: int = 30000,
83
- api_key: Optional[str] = None, # dynamic api key
84
+ api_key: Optional[str] = None, # dynamic api key,
85
+ _test_run_id: Optional[str] = None,
84
86
  ) -> SpanExportResult:
85
87
  # build forest of spans
86
88
  forest = self._build_span_forest(spans)
@@ -223,14 +225,24 @@ class ConfidentSpanExporter(SpanExporter):
223
225
  trace_manager.add_span_to_trace(base_span_wrapper.base_span)
224
226
  # no removing span because it can be parent of other spans
225
227
 
226
- # safely end all active traces
228
+ # safely end all active traces or return them for test runs
227
229
  active_traces_keys = list(trace_manager.active_traces.keys())
228
- for trace_key in active_traces_keys:
229
- set_trace_time(trace_manager.get_trace_by_uuid(trace_key))
230
- trace_manager.end_trace(trace_key)
231
- trace_manager.clear_traces()
232
-
233
- return SpanExportResult.SUCCESS
230
+ if _test_run_id:
231
+ traces = []
232
+ for trace_key in active_traces_keys:
233
+ set_trace_time(trace_manager.get_trace_by_uuid(trace_key))
234
+ trace = trace_manager.get_trace_by_uuid(trace_key)
235
+ if trace:
236
+ traces.append(trace)
237
+ trace_manager.clear_traces()
238
+ post_test_run(traces, _test_run_id)
239
+ return SpanExportResult.SUCCESS
240
+ else:
241
+ for trace_key in active_traces_keys:
242
+ set_trace_time(trace_manager.get_trace_by_uuid(trace_key))
243
+ trace_manager.end_trace(trace_key)
244
+ trace_manager.clear_traces()
245
+ return SpanExportResult.SUCCESS
234
246
 
235
247
  def _convert_readable_span_to_base_span(
236
248
  self, span: ReadableSpan
@@ -1,5 +1,6 @@
1
1
  from typing import List, Optional, Tuple, Any
2
2
  from deepeval.tracing.types import Trace, LLMTestCase, ToolCall
3
+ from deepeval.tracing import trace_manager, BaseSpan
3
4
  from opentelemetry.sdk.trace.export import ReadableSpan
4
5
  import json
5
6
 
@@ -250,3 +251,59 @@ def parse_list_of_strings(context: List[str]) -> List[str]:
250
251
  else:
251
252
  parsed_context.append(context_str)
252
253
  return parsed_context
254
+
255
+
256
+ from deepeval.evaluate.utils import create_api_test_case
257
+ from deepeval.test_run.api import LLMApiTestCase
258
+ from deepeval.test_run.test_run import global_test_run_manager
259
+ from typing import Optional
260
+
261
+
262
+ def post_test_run(traces: List[Trace], test_run_id: Optional[str]):
263
+ # Accept single trace or list of traces
264
+ if isinstance(traces, Trace):
265
+ traces = [traces]
266
+
267
+ api_test_cases: List[LLMApiTestCase] = []
268
+
269
+ # Collect test cases from spans that have metric_collection
270
+ for trace in traces:
271
+ trace_api = trace_manager.create_trace_api(trace)
272
+
273
+ def dfs(span: BaseSpan):
274
+ if span.metric_collection:
275
+ llm_test_case = LLMTestCase(
276
+ input=str(span.input),
277
+ actual_output=(
278
+ str(span.output) if span.output is not None else None
279
+ ),
280
+ expected_output=span.expected_output,
281
+ context=span.context,
282
+ retrieval_context=span.retrieval_context,
283
+ tools_called=span.tools_called,
284
+ expected_tools=span.expected_tools,
285
+ )
286
+ api_case = create_api_test_case(
287
+ test_case=llm_test_case,
288
+ trace=trace_api,
289
+ index=None,
290
+ )
291
+ if isinstance(api_case, LLMApiTestCase):
292
+ api_case.metric_collection = span.metric_collection
293
+ api_test_cases.append(api_case)
294
+
295
+ for child in span.children or []:
296
+ dfs(child)
297
+
298
+ for root in trace.root_spans:
299
+ dfs(root)
300
+
301
+ # Prepare and post TestRun using the global test run manager
302
+ test_run_manager = global_test_run_manager
303
+ test_run_manager.create_test_run(identifier=test_run_id)
304
+ test_run = test_run_manager.get_test_run()
305
+
306
+ for case in api_test_cases:
307
+ test_run.add_test_case(case)
308
+
309
+ # return test_run_manager.post_test_run(test_run) TODO: add after test run with metric collection is implemented
@@ -15,12 +15,12 @@ Usage:
15
15
 
16
16
  from __future__ import annotations
17
17
  import time
18
- from typing import Final
18
+ from typing import Final, Union
19
19
 
20
20
  # Module globals are initialised exactly once.
21
- _anchor_perf_ns: int | None = None
22
- _anchor_wall_ns: int | None = None
23
- _offset_ns: int | None = None
21
+ _anchor_perf_ns: Union[int, None] = None
22
+ _anchor_wall_ns: Union[int, None] = None
23
+ _offset_ns: Union[int, None] = None
24
24
 
25
25
 
26
26
  def init_clock_bridge() -> None: