deepeval 3.5.4__py3-none-any.whl → 3.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +14 -0
- deepeval/constants.py +2 -1
- deepeval/dataset/dataset.py +11 -4
- deepeval/dataset/types.py +19 -11
- deepeval/dataset/utils.py +31 -3
- deepeval/evaluate/execute.py +216 -17
- deepeval/openai_agents/agent.py +115 -106
- deepeval/openai_agents/callback_handler.py +21 -30
- deepeval/openai_agents/runner.py +288 -71
- deepeval/tracing/tracing.py +1 -3
- {deepeval-3.5.4.dist-info → deepeval-3.5.5.dist-info}/METADATA +3 -1
- {deepeval-3.5.4.dist-info → deepeval-3.5.5.dist-info}/RECORD +16 -16
- {deepeval-3.5.4.dist-info → deepeval-3.5.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.5.4.dist-info → deepeval-3.5.5.dist-info}/WHEEL +0 -0
- {deepeval-3.5.4.dist-info → deepeval-3.5.5.dist-info}/entry_points.txt +0 -0
deepeval/_version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__: str = "3.5.
|
|
1
|
+
__version__: str = "3.5.5"
|
deepeval/config/settings.py
CHANGED
|
@@ -281,6 +281,7 @@ class Settings(BaseSettings):
|
|
|
281
281
|
#
|
|
282
282
|
# Telemetry and Debug
|
|
283
283
|
#
|
|
284
|
+
DEEPEVAL_DEBUG_ASYNC: Optional[bool] = None
|
|
284
285
|
DEEPEVAL_TELEMETRY_OPT_OUT: Optional[bool] = None
|
|
285
286
|
DEEPEVAL_UPDATE_WARNING_OPT_IN: Optional[bool] = None
|
|
286
287
|
DEEPEVAL_GRPC_LOGGING: Optional[bool] = None
|
|
@@ -303,6 +304,19 @@ class Settings(BaseSettings):
|
|
|
303
304
|
MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS: float = 3.05
|
|
304
305
|
MEDIA_IMAGE_READ_TIMEOUT_SECONDS: float = 10.0
|
|
305
306
|
|
|
307
|
+
#
|
|
308
|
+
# Async Task Configuration
|
|
309
|
+
#
|
|
310
|
+
|
|
311
|
+
# Maximum time allowed for a single task to complete
|
|
312
|
+
DEEPEVAL_PER_TASK_TIMEOUT_SECONDS: int = (
|
|
313
|
+
300 # Set to float('inf') to disable timeout
|
|
314
|
+
)
|
|
315
|
+
|
|
316
|
+
# Buffer time for gathering results from all tasks, added to the longest task duration
|
|
317
|
+
# Increase if many tasks are running concurrently
|
|
318
|
+
DEEPEVAL_TASK_GATHER_BUFFER_SECONDS: int = 60
|
|
319
|
+
|
|
306
320
|
##############
|
|
307
321
|
# Validators #
|
|
308
322
|
##############
|
deepeval/constants.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
from enum import Enum
|
|
2
|
+
from typing import Union
|
|
2
3
|
|
|
3
4
|
KEY_FILE: str = ".deepeval"
|
|
4
5
|
HIDDEN_DIR: str = ".deepeval"
|
|
@@ -29,7 +30,7 @@ class ProviderSlug(str, Enum):
|
|
|
29
30
|
OLLAMA = "ollama"
|
|
30
31
|
|
|
31
32
|
|
|
32
|
-
def slugify(value: str
|
|
33
|
+
def slugify(value: Union[str, ProviderSlug]) -> str:
|
|
33
34
|
return (
|
|
34
35
|
value.value
|
|
35
36
|
if isinstance(value, ProviderSlug)
|
deepeval/dataset/dataset.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from asyncio import Task
|
|
2
|
-
from typing import Iterator, List, Optional, Union, Literal
|
|
2
|
+
from typing import TYPE_CHECKING, Iterator, List, Optional, Union, Literal
|
|
3
3
|
from dataclasses import dataclass, field
|
|
4
4
|
from opentelemetry.trace import Tracer
|
|
5
5
|
from opentelemetry.context import Context, attach, detach
|
|
@@ -7,7 +7,6 @@ from rich.console import Console
|
|
|
7
7
|
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
|
|
8
8
|
import json
|
|
9
9
|
import csv
|
|
10
|
-
import webbrowser
|
|
11
10
|
import os
|
|
12
11
|
import datetime
|
|
13
12
|
import time
|
|
@@ -17,6 +16,7 @@ from opentelemetry import baggage
|
|
|
17
16
|
|
|
18
17
|
from deepeval.confident.api import Api, Endpoints, HttpMethods
|
|
19
18
|
from deepeval.dataset.utils import (
|
|
19
|
+
coerce_to_task,
|
|
20
20
|
convert_test_cases_to_goldens,
|
|
21
21
|
convert_goldens_to_test_cases,
|
|
22
22
|
convert_convo_goldens_to_convo_test_cases,
|
|
@@ -49,11 +49,18 @@ from deepeval.utils import (
|
|
|
49
49
|
from deepeval.test_run import (
|
|
50
50
|
global_test_run_manager,
|
|
51
51
|
)
|
|
52
|
-
from deepeval.dataset.types import global_evaluation_tasks
|
|
53
52
|
from deepeval.openai.utils import openai_test_case_pairs
|
|
54
53
|
from deepeval.tracing import trace_manager
|
|
55
54
|
from deepeval.tracing.tracing import EVAL_DUMMY_SPAN_NAME
|
|
56
55
|
|
|
56
|
+
if TYPE_CHECKING:
|
|
57
|
+
from deepeval.evaluate.configs import (
|
|
58
|
+
AsyncConfig,
|
|
59
|
+
DisplayConfig,
|
|
60
|
+
CacheConfig,
|
|
61
|
+
ErrorConfig,
|
|
62
|
+
)
|
|
63
|
+
|
|
57
64
|
|
|
58
65
|
valid_file_types = ["csv", "json", "jsonl"]
|
|
59
66
|
|
|
@@ -1230,7 +1237,7 @@ class EvaluationDataset:
|
|
|
1230
1237
|
)
|
|
1231
1238
|
|
|
1232
1239
|
def evaluate(self, task: Task):
|
|
1233
|
-
|
|
1240
|
+
coerce_to_task(task)
|
|
1234
1241
|
|
|
1235
1242
|
def _start_otel_test_run(self, tracer: Optional[Tracer] = None) -> Context:
|
|
1236
1243
|
_tracer = check_tracer(tracer)
|
deepeval/dataset/types.py
CHANGED
|
@@ -1,17 +1,25 @@
|
|
|
1
|
-
|
|
2
|
-
tasks: list = []
|
|
1
|
+
import asyncio
|
|
3
2
|
|
|
4
|
-
|
|
5
|
-
|
|
3
|
+
from typing import Any
|
|
4
|
+
from deepeval.dataset.utils import coerce_to_task
|
|
6
5
|
|
|
7
|
-
def get_tasks(self):
|
|
8
|
-
return self.tasks
|
|
9
6
|
|
|
10
|
-
|
|
11
|
-
return len(self.tasks)
|
|
7
|
+
class EvaluationTasks:
    """Ordered registry of scheduled evaluation awaitables.

    Every object handed to :meth:`append` is first passed through
    ``coerce_to_task`` and the result is stored in a private list.
    """

    def __init__(self):
        # Private backing list; callers only ever receive copies of it.
        self._tasks: list[asyncio.Future] = []

    def append(self, obj: Any):
        """Convert ``obj`` with ``coerce_to_task`` and remember the result."""
        scheduled = coerce_to_task(obj)
        self._tasks.append(scheduled)

    def get_tasks(self) -> list[asyncio.Future]:
        """Return a shallow copy of the tracked tasks, in insertion order."""
        return self._tasks.copy()

    def num_tasks(self):
        """Return how many tasks are currently tracked."""
        return len(self._tasks)

    def clear_tasks(self) -> None:
        """Cancel every tracked task that is not done, then forget them all."""
        for tracked in self._tasks:
            if tracked.done():
                # Finished (or already cancelled) entries are left untouched.
                continue
            tracked.cancel()
        self._tasks.clear()
|
deepeval/dataset/utils.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
|
|
1
|
+
import asyncio
|
|
2
|
+
import inspect
|
|
2
3
|
import json
|
|
3
4
|
import re
|
|
4
5
|
|
|
6
|
+
from typing import List, Optional, Any
|
|
5
7
|
from opentelemetry.trace import Tracer
|
|
6
|
-
from opentelemetry import trace
|
|
7
|
-
from opentelemetry.trace import NoOpTracerProvider
|
|
8
8
|
|
|
9
9
|
from deepeval.dataset.api import Golden
|
|
10
10
|
from deepeval.dataset.golden import ConversationalGolden
|
|
@@ -174,3 +174,31 @@ def check_tracer(tracer: Optional[Tracer] = None) -> Tracer:
|
|
|
174
174
|
)
|
|
175
175
|
|
|
176
176
|
return GLOBAL_TEST_RUN_TRACER
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def coerce_to_task(obj: Any) -> asyncio.Future[Any]:
    """Turn ``obj`` into something that is already scheduled on a loop.

    * Tasks and futures are returned unchanged.
    * A bare coroutine is scheduled with ``asyncio.create_task``.
    * Any other awaitable (an object exposing ``__await__``) is wrapped in a
      small coroutine first, because ``create_task`` only accepts coroutines.

    Raises:
        TypeError: if ``obj`` is not a Task, Future, coroutine or awaitable.
    """
    # Tasks and plain futures are already bound to a loop; hand them back as-is.
    if isinstance(obj, asyncio.Task) or asyncio.isfuture(obj):
        return obj

    if asyncio.iscoroutine(obj):
        # Bare coroutine: schedule it directly below.
        runnable = obj
    elif inspect.isawaitable(obj):
        # Generic awaitable: adapt it into a coroutine so it can be scheduled.
        async def _wrap(target):
            return await target

        runnable = _wrap(obj)
    else:
        # Nothing we know how to await.
        raise TypeError(
            f"Expected Task/Future/coroutine/awaitable, got {type(obj).__name__}"
        )

    # Go through asyncio.create_task so the result is bound to the loop and tracked.
    return asyncio.create_task(runnable)
|
deepeval/evaluate/execute.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
1
3
|
from rich.progress import (
|
|
2
4
|
Progress,
|
|
3
5
|
TextColumn,
|
|
@@ -40,7 +42,6 @@ from deepeval.tracing.api import (
|
|
|
40
42
|
BaseApiSpan,
|
|
41
43
|
)
|
|
42
44
|
from deepeval.dataset import Golden
|
|
43
|
-
from deepeval.dataset.types import global_evaluation_tasks
|
|
44
45
|
from deepeval.errors import MissingTestCaseParamsError
|
|
45
46
|
from deepeval.metrics.utils import copy_metrics
|
|
46
47
|
from deepeval.utils import (
|
|
@@ -87,6 +88,17 @@ from deepeval.evaluate.utils import (
|
|
|
87
88
|
from deepeval.utils import add_pbar, update_pbar, custom_console
|
|
88
89
|
from deepeval.openai.utils import openai_test_case_pairs
|
|
89
90
|
from deepeval.tracing.types import TestCaseMetricPair
|
|
91
|
+
from deepeval.config.settings import get_settings
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
logger = logging.getLogger(__name__)
|
|
95
|
+
settings = get_settings()
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
async def _snapshot_tasks():
    """Return the set of tasks on the running loop, excluding the caller's own task."""
    # `all_tasks` only reports tasks that belong to the currently running loop.
    others = set(asyncio.all_tasks())
    # Drop the task executing this coroutine so callers never see themselves.
    others.discard(asyncio.current_task())
    return others
|
|
90
102
|
|
|
91
103
|
|
|
92
104
|
###########################################
|
|
@@ -112,7 +124,7 @@ def execute_test_cases(
|
|
|
112
124
|
_is_assert_test: bool = False,
|
|
113
125
|
) -> List[TestResult]:
|
|
114
126
|
global_test_run_cache_manager.disable_write_cache = (
|
|
115
|
-
cache_config.write_cache
|
|
127
|
+
cache_config.write_cache is False
|
|
116
128
|
)
|
|
117
129
|
|
|
118
130
|
if test_run_manager is None:
|
|
@@ -357,7 +369,7 @@ async def a_execute_test_cases(
|
|
|
357
369
|
return await func(*args, **kwargs)
|
|
358
370
|
|
|
359
371
|
global_test_run_cache_manager.disable_write_cache = (
|
|
360
|
-
cache_config.write_cache
|
|
372
|
+
cache_config.write_cache is False
|
|
361
373
|
)
|
|
362
374
|
if test_run_manager is None:
|
|
363
375
|
test_run_manager = global_test_run_manager
|
|
@@ -1041,7 +1053,7 @@ def execute_agentic_test_cases(
|
|
|
1041
1053
|
with progress:
|
|
1042
1054
|
pbar_id = add_pbar(
|
|
1043
1055
|
progress,
|
|
1044
|
-
|
|
1056
|
+
"Running Component-Level Evals (sync)",
|
|
1045
1057
|
total=len(goldens) * 2,
|
|
1046
1058
|
)
|
|
1047
1059
|
evaluate_test_cases(progress=progress, pbar_id=pbar_id)
|
|
@@ -1551,7 +1563,7 @@ def execute_agentic_test_cases_from_loop(
|
|
|
1551
1563
|
tools_called=span.tools_called,
|
|
1552
1564
|
expected_tools=span.expected_tools,
|
|
1553
1565
|
)
|
|
1554
|
-
if span.metrics
|
|
1566
|
+
if span.metrics is None or llm_test_case is None:
|
|
1555
1567
|
return
|
|
1556
1568
|
|
|
1557
1569
|
has_task_completion = any(
|
|
@@ -1692,7 +1704,7 @@ def execute_agentic_test_cases_from_loop(
|
|
|
1692
1704
|
with progress:
|
|
1693
1705
|
pbar_id = add_pbar(
|
|
1694
1706
|
progress,
|
|
1695
|
-
|
|
1707
|
+
"Running Component-Level Evals (sync)",
|
|
1696
1708
|
total=len(goldens) * 2,
|
|
1697
1709
|
)
|
|
1698
1710
|
yield from evaluate_test_cases(
|
|
@@ -1722,6 +1734,11 @@ def a_execute_agentic_test_cases_from_loop(
|
|
|
1722
1734
|
_is_assert_test: bool = False,
|
|
1723
1735
|
) -> Iterator[TestResult]:
|
|
1724
1736
|
|
|
1737
|
+
GATHER_TIMEOUT_SECONDS = (
|
|
1738
|
+
settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
|
|
1739
|
+
+ settings.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
|
|
1740
|
+
)
|
|
1741
|
+
|
|
1725
1742
|
semaphore = asyncio.Semaphore(async_config.max_concurrent)
|
|
1726
1743
|
original_create_task = asyncio.create_task
|
|
1727
1744
|
|
|
@@ -1735,43 +1752,225 @@ def a_execute_agentic_test_cases_from_loop(
|
|
|
1735
1752
|
|
|
1736
1753
|
async def execute_callback_with_semaphore(coroutine: Awaitable):
|
|
1737
1754
|
async with semaphore:
|
|
1738
|
-
return await
|
|
1755
|
+
return await asyncio.wait_for(
|
|
1756
|
+
coroutine, timeout=settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
|
|
1757
|
+
)
|
|
1739
1758
|
|
|
1740
1759
|
def evaluate_test_cases(
|
|
1741
1760
|
progress: Optional[Progress] = None,
|
|
1742
1761
|
pbar_id: Optional[int] = None,
|
|
1743
1762
|
pbar_callback_id: Optional[int] = None,
|
|
1744
1763
|
):
|
|
1764
|
+
# Tasks we scheduled during this iterator run on this event loop.
|
|
1765
|
+
# by gathering these tasks we can avoid re-awaiting coroutines which
|
|
1766
|
+
# can cause cross loop mixups that trigger "future belongs to a different loop" errors
|
|
1767
|
+
created_tasks: list[asyncio.Task] = []
|
|
1768
|
+
task_meta: dict[asyncio.Task, dict] = {}
|
|
1769
|
+
current_golden_ctx = {"index": -1, "name": None, "input": None}
|
|
1770
|
+
|
|
1745
1771
|
def create_callback_task(coro, **kwargs):
|
|
1746
|
-
task
|
|
1772
|
+
# build a descriptive task name for tracking
|
|
1773
|
+
coro_desc = repr(coro)
|
|
1774
|
+
task_name = f"callback[{current_golden_ctx['index']}]:{coro_desc.split()[1] if ' ' in coro_desc else coro_desc}"
|
|
1775
|
+
|
|
1776
|
+
# Wrap the user coroutine in our semaphore runner and bind it to THIS loop.
|
|
1777
|
+
# Keep the resulting Task so we can gather tasks (not raw coroutines) later,
|
|
1778
|
+
# without touching tasks from other loops or already awaited coroutines.
|
|
1779
|
+
task = loop.create_task(
|
|
1780
|
+
execute_callback_with_semaphore(coro), name=task_name
|
|
1781
|
+
)
|
|
1782
|
+
|
|
1783
|
+
# record metadata for debugging
|
|
1784
|
+
MAX_META_INPUT_LENGTH = 120
|
|
1785
|
+
started = time.perf_counter()
|
|
1786
|
+
short_input = current_golden_ctx["input"]
|
|
1787
|
+
if (
|
|
1788
|
+
isinstance(short_input, str)
|
|
1789
|
+
and len(short_input) > MAX_META_INPUT_LENGTH
|
|
1790
|
+
):
|
|
1791
|
+
short_input = short_input[:MAX_META_INPUT_LENGTH] + "…"
|
|
1792
|
+
task_meta[task] = {
|
|
1793
|
+
"golden_index": current_golden_ctx["index"],
|
|
1794
|
+
"golden_name": current_golden_ctx["name"],
|
|
1795
|
+
"input": short_input,
|
|
1796
|
+
"coro": coro_desc,
|
|
1797
|
+
"started": started,
|
|
1798
|
+
}
|
|
1747
1799
|
|
|
1748
1800
|
def on_task_done(t: asyncio.Task):
|
|
1801
|
+
if settings.DEEPEVAL_DEBUG_ASYNC:
|
|
1802
|
+
# Using info level here to make it easy to spot these logs.
|
|
1803
|
+
# We are gated by DEEPEVAL_DEBUG_ASYNC
|
|
1804
|
+
meta = task_meta.get(t, {})
|
|
1805
|
+
duration = time.perf_counter() - meta.get(
|
|
1806
|
+
"started", started
|
|
1807
|
+
)
|
|
1808
|
+
|
|
1809
|
+
if t.cancelled():
|
|
1810
|
+
logger.info(
|
|
1811
|
+
"[deepeval] task CANCELLED %s after %.2fs meta=%r",
|
|
1812
|
+
t.get_name(),
|
|
1813
|
+
duration,
|
|
1814
|
+
meta,
|
|
1815
|
+
)
|
|
1816
|
+
else:
|
|
1817
|
+
exc = t.exception()
|
|
1818
|
+
if exc is not None:
|
|
1819
|
+
logger.error(
|
|
1820
|
+
"[deepeval] task ERROR %s after %.2fs meta=%r",
|
|
1821
|
+
t.get_name(),
|
|
1822
|
+
duration,
|
|
1823
|
+
meta,
|
|
1824
|
+
exc_info=(type(exc), exc, exc.__traceback__),
|
|
1825
|
+
)
|
|
1826
|
+
else:
|
|
1827
|
+
logger.info(
|
|
1828
|
+
"[deepeval] task OK %s after %.2fs meta={'golden_index': %r}",
|
|
1829
|
+
t.get_name(),
|
|
1830
|
+
duration,
|
|
1831
|
+
meta.get("golden_index"),
|
|
1832
|
+
)
|
|
1833
|
+
|
|
1749
1834
|
update_pbar(progress, pbar_callback_id)
|
|
1750
1835
|
update_pbar(progress, pbar_id)
|
|
1751
1836
|
|
|
1752
1837
|
task.add_done_callback(on_task_done)
|
|
1838
|
+
created_tasks.append(task)
|
|
1753
1839
|
return task
|
|
1754
1840
|
|
|
1755
1841
|
asyncio.create_task = create_callback_task
|
|
1842
|
+
# DEBUG
|
|
1843
|
+
# Snapshot tasks that already exist on this loop so we can detect strays
|
|
1844
|
+
baseline_tasks = loop.run_until_complete(_snapshot_tasks())
|
|
1756
1845
|
|
|
1757
1846
|
try:
|
|
1758
|
-
for golden in goldens:
|
|
1847
|
+
for index, golden in enumerate(goldens):
|
|
1848
|
+
current_golden_ctx.update(
|
|
1849
|
+
{
|
|
1850
|
+
"index": index,
|
|
1851
|
+
"name": getattr(golden, "name", None),
|
|
1852
|
+
"input": getattr(golden, "input", None),
|
|
1853
|
+
}
|
|
1854
|
+
)
|
|
1855
|
+
prev_task_length = len(created_tasks)
|
|
1759
1856
|
yield golden
|
|
1760
|
-
if
|
|
1857
|
+
# if this golden created no tasks, bump bars now
|
|
1858
|
+
if len(created_tasks) == prev_task_length:
|
|
1761
1859
|
update_pbar(progress, pbar_callback_id)
|
|
1762
1860
|
update_pbar(progress, pbar_id)
|
|
1763
1861
|
finally:
|
|
1764
1862
|
asyncio.create_task = original_create_task
|
|
1765
1863
|
|
|
1766
|
-
if
|
|
1767
|
-
loop.
|
|
1768
|
-
|
|
1769
|
-
|
|
1864
|
+
if created_tasks:
|
|
1865
|
+
# Only await tasks we created on this loop in this run.
|
|
1866
|
+
# This will prevent re-awaiting and avoids cross loop "future belongs to a different loop" errors
|
|
1867
|
+
try:
|
|
1868
|
+
loop.run_until_complete(
|
|
1869
|
+
asyncio.wait_for(
|
|
1870
|
+
asyncio.gather(*created_tasks, return_exceptions=True),
|
|
1871
|
+
timeout=GATHER_TIMEOUT_SECONDS,
|
|
1872
|
+
)
|
|
1770
1873
|
)
|
|
1771
|
-
|
|
1874
|
+
except asyncio.TimeoutError:
|
|
1875
|
+
import traceback
|
|
1876
|
+
|
|
1877
|
+
pending = [t for t in created_tasks if not t.done()]
|
|
1878
|
+
|
|
1879
|
+
# Log the elapsed time for each task that was pending
|
|
1880
|
+
for t in pending:
|
|
1881
|
+
meta = task_meta.get(t, {})
|
|
1882
|
+
start_time = meta.get("started", time.perf_counter())
|
|
1883
|
+
elapsed_time = time.perf_counter() - start_time
|
|
1884
|
+
|
|
1885
|
+
# Determine if it was a per task or gather timeout based on task's elapsed time
|
|
1886
|
+
if (
|
|
1887
|
+
elapsed_time
|
|
1888
|
+
>= settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
|
|
1889
|
+
):
|
|
1890
|
+
timeout_type = "per-task"
|
|
1891
|
+
else:
|
|
1892
|
+
timeout_type = "gather"
|
|
1893
|
+
|
|
1894
|
+
logger.warning(
|
|
1895
|
+
f"[deepeval] gather TIMEOUT after {GATHER_TIMEOUT_SECONDS}s; "
|
|
1896
|
+
f"pending={len(pending)} tasks. Timeout type: {timeout_type}. "
|
|
1897
|
+
f"To give tasks more time, consider increasing "
|
|
1898
|
+
f"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS for longer task completion time or "
|
|
1899
|
+
f"DEEPEVAL_TASK_GATHER_BUFFER_SECONDS to allow more time for gathering results."
|
|
1900
|
+
)
|
|
1901
|
+
|
|
1902
|
+
# Log pending tasks and their stack traces
|
|
1903
|
+
logger.info(
|
|
1904
|
+
" - PENDING %s elapsed_time=%.2fs meta=%s",
|
|
1905
|
+
t.get_name(),
|
|
1906
|
+
elapsed_time,
|
|
1907
|
+
meta,
|
|
1908
|
+
)
|
|
1909
|
+
if loop.get_debug() and settings.DEEPEVAL_DEBUG_ASYNC:
|
|
1910
|
+
frames = t.get_stack(limit=6)
|
|
1911
|
+
if frames:
|
|
1912
|
+
logger.info(" stack:")
|
|
1913
|
+
for fr in frames:
|
|
1914
|
+
for line in traceback.format_stack(fr):
|
|
1915
|
+
logger.info(" " + line.rstrip())
|
|
1916
|
+
|
|
1917
|
+
# Cancel and drain the tasks
|
|
1918
|
+
for t in pending:
|
|
1919
|
+
t.cancel()
|
|
1920
|
+
loop.run_until_complete(
|
|
1921
|
+
asyncio.gather(*created_tasks, return_exceptions=True)
|
|
1922
|
+
)
|
|
1923
|
+
finally:
|
|
1924
|
+
|
|
1925
|
+
# if it is already closed, we are done
|
|
1926
|
+
if loop.is_closed():
|
|
1927
|
+
return
|
|
1928
|
+
|
|
1929
|
+
try:
|
|
1930
|
+
# Find tasks that were created during this run but we didn’t track
|
|
1931
|
+
current_tasks = loop.run_until_complete(_snapshot_tasks())
|
|
1932
|
+
except RuntimeError:
|
|
1933
|
+
# this might happen if the loop is already closing
|
|
1934
|
+
# nothing we can do
|
|
1935
|
+
return
|
|
1936
|
+
|
|
1937
|
+
leftovers = [
|
|
1938
|
+
t
|
|
1939
|
+
for t in current_tasks
|
|
1940
|
+
if t not in baseline_tasks
|
|
1941
|
+
and t not in created_tasks
|
|
1942
|
+
and not t.done()
|
|
1943
|
+
]
|
|
1944
|
+
|
|
1945
|
+
if not leftovers:
|
|
1946
|
+
return
|
|
1947
|
+
|
|
1948
|
+
if settings.DEEPEVAL_DEBUG_ASYNC:
|
|
1949
|
+
logger.warning(
|
|
1950
|
+
"[deepeval] %d stray task(s) not tracked; cancelling…",
|
|
1951
|
+
len(leftovers),
|
|
1952
|
+
)
|
|
1953
|
+
for t in leftovers:
|
|
1954
|
+
meta = task_meta.get(t, {})
|
|
1955
|
+
name = t.get_name()
|
|
1956
|
+
logger.warning(" - STRAY %s meta=%s", name, meta)
|
|
1957
|
+
|
|
1958
|
+
for t in leftovers:
|
|
1959
|
+
t.cancel()
|
|
1960
|
+
|
|
1961
|
+
# Drain strays so they don’t leak into the next iteration
|
|
1962
|
+
try:
|
|
1963
|
+
loop.run_until_complete(
|
|
1964
|
+
asyncio.gather(*leftovers, return_exceptions=True)
|
|
1965
|
+
)
|
|
1966
|
+
except RuntimeError:
|
|
1967
|
+
# If the loop is closing here, just continue
|
|
1968
|
+
if settings.DEEPEVAL_DEBUG_ASYNC:
|
|
1969
|
+
logger.warning(
|
|
1970
|
+
"[deepeval] failed to drain stray tasks because loop is closing"
|
|
1971
|
+
)
|
|
1772
1972
|
|
|
1773
1973
|
# Evaluate traces
|
|
1774
|
-
asyncio.create_task = loop.create_task
|
|
1775
1974
|
if trace_manager.traces_to_evaluate:
|
|
1776
1975
|
loop.run_until_complete(
|
|
1777
1976
|
_a_evaluate_traces(
|
|
@@ -1863,7 +2062,7 @@ def a_execute_agentic_test_cases_from_loop(
|
|
|
1863
2062
|
with progress:
|
|
1864
2063
|
pbar_id = add_pbar(
|
|
1865
2064
|
progress,
|
|
1866
|
-
|
|
2065
|
+
"Running Component-Level Evals (async)",
|
|
1867
2066
|
total=len(goldens) * 2,
|
|
1868
2067
|
)
|
|
1869
2068
|
pbar_callback_id = add_pbar(
|