deepeval 3.5.4__py3-none-any.whl → 3.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +182 -18
- deepeval/config/settings.py +14 -0
- deepeval/constants.py +2 -1
- deepeval/dataset/dataset.py +11 -4
- deepeval/dataset/types.py +19 -11
- deepeval/dataset/utils.py +31 -3
- deepeval/evaluate/execute.py +226 -23
- deepeval/openai_agents/agent.py +115 -106
- deepeval/openai_agents/callback_handler.py +65 -33
- deepeval/openai_agents/runner.py +296 -75
- deepeval/scorer/scorer.py +2 -2
- deepeval/tracing/tracing.py +1 -3
- {deepeval-3.5.4.dist-info → deepeval-3.5.6.dist-info}/METADATA +3 -1
- {deepeval-3.5.4.dist-info → deepeval-3.5.6.dist-info}/RECORD +18 -18
- {deepeval-3.5.4.dist-info → deepeval-3.5.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.5.4.dist-info → deepeval-3.5.6.dist-info}/WHEEL +0 -0
- {deepeval-3.5.4.dist-info → deepeval-3.5.6.dist-info}/entry_points.txt +0 -0
deepeval/evaluate/execute.py
CHANGED

```diff
@@ -1,3 +1,5 @@
+import logging
+
 from rich.progress import (
     Progress,
     TextColumn,
@@ -40,7 +42,6 @@ from deepeval.tracing.api import (
     BaseApiSpan,
 )
 from deepeval.dataset import Golden
-from deepeval.dataset.types import global_evaluation_tasks
 from deepeval.errors import MissingTestCaseParamsError
 from deepeval.metrics.utils import copy_metrics
 from deepeval.utils import (
@@ -87,6 +88,17 @@ from deepeval.evaluate.utils import (
 from deepeval.utils import add_pbar, update_pbar, custom_console
 from deepeval.openai.utils import openai_test_case_pairs
 from deepeval.tracing.types import TestCaseMetricPair
+from deepeval.config.settings import get_settings
+
+
+logger = logging.getLogger(__name__)
+settings = get_settings()
+
+
+async def _snapshot_tasks():
+    cur = asyncio.current_task()
+    # `all_tasks` returns tasks for the current running loop only
+    return {t for t in asyncio.all_tasks() if t is not cur}
 
 
 ###########################################
@@ -112,7 +124,7 @@ def execute_test_cases(
     _is_assert_test: bool = False,
 ) -> List[TestResult]:
     global_test_run_cache_manager.disable_write_cache = (
-        cache_config.write_cache
+        cache_config.write_cache is False
     )
 
     if test_run_manager is None:
@@ -357,7 +369,7 @@ async def a_execute_test_cases(
             return await func(*args, **kwargs)
 
     global_test_run_cache_manager.disable_write_cache = (
-        cache_config.write_cache
+        cache_config.write_cache is False
     )
     if test_run_manager is None:
         test_run_manager = global_test_run_manager
```
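The two `cache_config.write_cache is False` hunks above read as a polarity fix: `disable_write_cache` is a negative flag, so copying `write_cache` into it verbatim disabled cache writes exactly when caching was requested. A minimal standalone sketch of the before/after behaviour (the helper functions are hypothetical; only the two return expressions mirror the diff, and the `None` case is an inference from the use of `is False` rather than `not`):

```python
from typing import Optional


def disable_write_cache_before(write_cache: Optional[bool]) -> bool:
    # 3.5.4 behaviour: the flag was copied as-is, so write_cache=True
    # actually *disabled* writing to the test-run cache.
    return bool(write_cache)


def disable_write_cache_after(write_cache: Optional[bool]) -> bool:
    # 3.5.6 behaviour: the flag is the negation, so writes are only
    # disabled when write_cache is explicitly False.
    return write_cache is False


assert disable_write_cache_before(True) is True   # inverted: writes off
assert disable_write_cache_after(True) is False   # writes stay on
assert disable_write_cache_after(False) is True   # writes off, as requested
assert disable_write_cache_after(None) is False   # unset value keeps writes on
```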
```diff
@@ -1041,7 +1053,7 @@ def execute_agentic_test_cases(
     with progress:
         pbar_id = add_pbar(
             progress,
-
+            "Running Component-Level Evals (sync)",
             total=len(goldens) * 2,
         )
         evaluate_test_cases(progress=progress, pbar_id=pbar_id)
@@ -1207,12 +1219,16 @@ async def _a_execute_agentic_test_case(
 
     test_case = LLMTestCase(
         input=golden.input,
-        actual_output=
-
-
-
-
-
+        actual_output=(
+            str(current_trace.output)
+            if current_trace.output is not None
+            else None
+        ),
+        expected_output=current_trace.expected_output,
+        context=current_trace.context,
+        retrieval_context=current_trace.retrieval_context,
+        tools_called=current_trace.tools_called,
+        expected_tools=current_trace.expected_tools,
         additional_metadata=golden.additional_metadata,
         comments=golden.comments,
         name=golden.name,
@@ -1551,7 +1567,7 @@ def execute_agentic_test_cases_from_loop(
                 tools_called=span.tools_called,
                 expected_tools=span.expected_tools,
             )
-            if span.metrics
+            if span.metrics is None or llm_test_case is None:
                 return
 
             has_task_completion = any(
@@ -1692,7 +1708,7 @@ def execute_agentic_test_cases_from_loop(
     with progress:
         pbar_id = add_pbar(
             progress,
-
+            "Running Component-Level Evals (sync)",
             total=len(goldens) * 2,
         )
         yield from evaluate_test_cases(
@@ -1722,6 +1738,11 @@ def a_execute_agentic_test_cases_from_loop(
     _is_assert_test: bool = False,
 ) -> Iterator[TestResult]:
 
+    GATHER_TIMEOUT_SECONDS = (
+        settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+        + settings.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
+    )
+
     semaphore = asyncio.Semaphore(async_config.max_concurrent)
     original_create_task = asyncio.create_task
 
@@ -1735,43 +1756,225 @@ def a_execute_agentic_test_cases_from_loop(
 
     async def execute_callback_with_semaphore(coroutine: Awaitable):
         async with semaphore:
-            return await
+            return await asyncio.wait_for(
+                coroutine, timeout=settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+            )
 
     def evaluate_test_cases(
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
         pbar_callback_id: Optional[int] = None,
    ):
+        # Tasks we scheduled during this iterator run on this event loop.
+        # by gathering these tasks we can avoid re-awaiting coroutines which
+        # can cause cross loop mixups that trigger "future belongs to a different loop" errors
+        created_tasks: list[asyncio.Task] = []
+        task_meta: dict[asyncio.Task, dict] = {}
+        current_golden_ctx = {"index": -1, "name": None, "input": None}
+
         def create_callback_task(coro, **kwargs):
-            task
+            # build a descriptive task name for tracking
+            coro_desc = repr(coro)
+            task_name = f"callback[{current_golden_ctx['index']}]:{coro_desc.split()[1] if ' ' in coro_desc else coro_desc}"
+
+            # Wrap the user coroutine in our semaphore runner and bind it to THIS loop.
+            # Keep the resulting Task so we can gather tasks (not raw coroutines) later,
+            # without touching tasks from other loops or already awaited coroutines.
+            task = loop.create_task(
+                execute_callback_with_semaphore(coro), name=task_name
+            )
+
+            # record metadata for debugging
+            MAX_META_INPUT_LENGTH = 120
+            started = time.perf_counter()
+            short_input = current_golden_ctx["input"]
+            if (
+                isinstance(short_input, str)
+                and len(short_input) > MAX_META_INPUT_LENGTH
+            ):
+                short_input = short_input[:MAX_META_INPUT_LENGTH] + "…"
+            task_meta[task] = {
+                "golden_index": current_golden_ctx["index"],
+                "golden_name": current_golden_ctx["name"],
+                "input": short_input,
+                "coro": coro_desc,
+                "started": started,
+            }
 
             def on_task_done(t: asyncio.Task):
+                if settings.DEEPEVAL_DEBUG_ASYNC:
+                    # Using info level here to make it easy to spot these logs.
+                    # We are gated by DEEPEVAL_DEBUG_ASYNC
+                    meta = task_meta.get(t, {})
+                    duration = time.perf_counter() - meta.get(
+                        "started", started
+                    )
+
+                    if t.cancelled():
+                        logger.info(
+                            "[deepeval] task CANCELLED %s after %.2fs meta=%r",
+                            t.get_name(),
+                            duration,
+                            meta,
+                        )
+                    else:
+                        exc = t.exception()
+                        if exc is not None:
+                            logger.error(
+                                "[deepeval] task ERROR %s after %.2fs meta=%r",
+                                t.get_name(),
+                                duration,
+                                meta,
+                                exc_info=(type(exc), exc, exc.__traceback__),
+                            )
+                        else:
+                            logger.info(
+                                "[deepeval] task OK %s after %.2fs meta={'golden_index': %r}",
+                                t.get_name(),
+                                duration,
+                                meta.get("golden_index"),
+                            )
+
                 update_pbar(progress, pbar_callback_id)
                 update_pbar(progress, pbar_id)
 
             task.add_done_callback(on_task_done)
+            created_tasks.append(task)
             return task
 
         asyncio.create_task = create_callback_task
+        # DEBUG
+        # Snapshot tasks that already exist on this loop so we can detect strays
+        baseline_tasks = loop.run_until_complete(_snapshot_tasks())
 
         try:
-            for golden in goldens:
+            for index, golden in enumerate(goldens):
+                current_golden_ctx.update(
+                    {
+                        "index": index,
+                        "name": getattr(golden, "name", None),
+                        "input": getattr(golden, "input", None),
+                    }
+                )
+                prev_task_length = len(created_tasks)
                 yield golden
-                if
+                # if this golden created no tasks, bump bars now
+                if len(created_tasks) == prev_task_length:
                     update_pbar(progress, pbar_callback_id)
                     update_pbar(progress, pbar_id)
         finally:
             asyncio.create_task = original_create_task
 
-        if
-        loop.
-
-
+        if created_tasks:
+            # Only await tasks we created on this loop in this run.
+            # This will prevent re-awaiting and avoids cross loop "future belongs to a different loop" errors
+            try:
+                loop.run_until_complete(
+                    asyncio.wait_for(
+                        asyncio.gather(*created_tasks, return_exceptions=True),
+                        timeout=GATHER_TIMEOUT_SECONDS,
+                    )
                 )
-
+            except asyncio.TimeoutError:
+                import traceback
+
+                pending = [t for t in created_tasks if not t.done()]
+
+                # Log the elapsed time for each task that was pending
+                for t in pending:
+                    meta = task_meta.get(t, {})
+                    start_time = meta.get("started", time.perf_counter())
+                    elapsed_time = time.perf_counter() - start_time
+
+                    # Determine if it was a per task or gather timeout based on task's elapsed time
+                    if (
+                        elapsed_time
+                        >= settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+                    ):
+                        timeout_type = "per-task"
+                    else:
+                        timeout_type = "gather"
+
+                    logger.warning(
+                        f"[deepeval] gather TIMEOUT after {GATHER_TIMEOUT_SECONDS}s; "
+                        f"pending={len(pending)} tasks. Timeout type: {timeout_type}. "
+                        f"To give tasks more time, consider increasing "
+                        f"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS for longer task completion time or "
+                        f"DEEPEVAL_TASK_GATHER_BUFFER_SECONDS to allow more time for gathering results."
+                    )
+
+                    # Log pending tasks and their stack traces
+                    logger.info(
+                        " - PENDING %s elapsed_time=%.2fs meta=%s",
+                        t.get_name(),
+                        elapsed_time,
+                        meta,
+                    )
+                    if loop.get_debug() and settings.DEEPEVAL_DEBUG_ASYNC:
+                        frames = t.get_stack(limit=6)
+                        if frames:
+                            logger.info(" stack:")
+                            for fr in frames:
+                                for line in traceback.format_stack(fr):
+                                    logger.info(" " + line.rstrip())
+
+                # Cancel and drain the tasks
+                for t in pending:
+                    t.cancel()
+                loop.run_until_complete(
+                    asyncio.gather(*created_tasks, return_exceptions=True)
+                )
+            finally:
+
+                # if it is already closed, we are done
+                if loop.is_closed():
+                    return
+
+                try:
+                    # Find tasks that were created during this run but we didn’t track
+                    current_tasks = loop.run_until_complete(_snapshot_tasks())
+                except RuntimeError:
+                    # this might happen if the loop is already closing
+                    # nothing we can do
+                    return
+
+                leftovers = [
+                    t
+                    for t in current_tasks
+                    if t not in baseline_tasks
+                    and t not in created_tasks
+                    and not t.done()
+                ]
+
+                if not leftovers:
+                    return
+
+                if settings.DEEPEVAL_DEBUG_ASYNC:
+                    logger.warning(
+                        "[deepeval] %d stray task(s) not tracked; cancelling…",
+                        len(leftovers),
+                    )
+                    for t in leftovers:
+                        meta = task_meta.get(t, {})
+                        name = t.get_name()
+                        logger.warning(" - STRAY %s meta=%s", name, meta)
+
+                for t in leftovers:
+                    t.cancel()
+
+                # Drain strays so they don’t leak into the next iteration
+                try:
+                    loop.run_until_complete(
+                        asyncio.gather(*leftovers, return_exceptions=True)
+                    )
+                except RuntimeError:
+                    # If the loop is closing here, just continue
+                    if settings.DEEPEVAL_DEBUG_ASYNC:
+                        logger.warning(
+                            "[deepeval] failed to drain stray tasks because loop is closing"
+                        )
 
         # Evaluate traces
-        asyncio.create_task = loop.create_task
         if trace_manager.traces_to_evaluate:
             loop.run_until_complete(
                 _a_evaluate_traces(
@@ -1863,7 +2066,7 @@ def a_execute_agentic_test_cases_from_loop(
     with progress:
         pbar_id = add_pbar(
             progress,
-
+            "Running Component-Level Evals (async)",
             total=len(goldens) * 2,
         )
         pbar_callback_id = add_pbar(
```
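The bulk of the execute.py change is the task bookkeeping in `a_execute_agentic_test_cases_from_loop`: each callback is wrapped in a per-task `asyncio.wait_for`, the resulting tasks are tracked per event loop, and the final gather gets its own ceiling of per-task timeout plus a buffer. A minimal standalone sketch of that timeout layering follows; the job, the concurrency limit, and the numeric values are illustrative stand-ins for `DEEPEVAL_PER_TASK_TIMEOUT_SECONDS` and `DEEPEVAL_TASK_GATHER_BUFFER_SECONDS` (supplied by the new entries in `deepeval/config/settings.py`):

```python
import asyncio

PER_TASK_TIMEOUT_SECONDS = 5.0  # stands in for DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
GATHER_BUFFER_SECONDS = 2.0     # stands in for DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
GATHER_TIMEOUT_SECONDS = PER_TASK_TIMEOUT_SECONDS + GATHER_BUFFER_SECONDS


async def slow_job(i: int) -> str:
    # Illustrative workload; the real callbacks run component-level evals.
    await asyncio.sleep(i)
    return f"job-{i} done"


async def main() -> None:
    semaphore = asyncio.Semaphore(2)

    async def run_with_semaphore(coro):
        # Per-task ceiling: a single hung callback is cancelled after
        # PER_TASK_TIMEOUT_SECONDS instead of stalling the whole run.
        async with semaphore:
            return await asyncio.wait_for(coro, timeout=PER_TASK_TIMEOUT_SECONDS)

    tasks = [asyncio.create_task(run_with_semaphore(slow_job(i))) for i in range(4)]

    try:
        # Gather-level ceiling: per-task timeout plus a small buffer, so the
        # outer wait never expires before the slowest individual task could.
        results = await asyncio.wait_for(
            asyncio.gather(*tasks, return_exceptions=True),
            timeout=GATHER_TIMEOUT_SECONDS,
        )
        print(results)
    except asyncio.TimeoutError:
        pending = [t for t in tasks if not t.done()]
        print(f"timed out with {len(pending)} task(s) still pending; cancelling")
        for t in pending:
            t.cancel()
        # Drain cancelled tasks so nothing leaks into later iterations.
        await asyncio.gather(*tasks, return_exceptions=True)


if __name__ == "__main__":
    asyncio.run(main())
```

The real implementation additionally records per-task metadata (golden index, truncated input, start time) and logs it when `DEEPEVAL_DEBUG_ASYNC` is enabled, which is what the `task_meta` / `on_task_done` machinery in the diff above is for.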
deepeval/openai_agents/agent.py
CHANGED

```diff
@@ -1,14 +1,20 @@
 from __future__ import annotations
 
 from dataclasses import dataclass, field, replace
-from typing import Any, Optional, Awaitable, Callable, Generic, TypeVar
+from typing import Any, Optional, Awaitable, Callable, Generic, TypeVar, List
 
 from deepeval.tracing import observe
 from deepeval.prompt import Prompt
+from deepeval.tracing.tracing import Observer
+from deepeval.metrics import BaseMetric
+from deepeval.tracing.utils import make_json_serializable
+from deepeval.tracing.types import LlmSpan
+from deepeval.tracing.context import current_span_context
 
 try:
     from agents.agent import Agent as BaseAgent
     from agents.models.interface import Model, ModelProvider
+    from openai.types.responses import ResponseCompletedEvent
 except Exception as e:
     raise RuntimeError(
         "openai-agents is required for this integration. Please install it."
@@ -21,17 +27,15 @@ class _ObservedModel(Model):
     def __init__(
         self,
         inner: Model,
-
-
-
-        deepeval_prompt: Optional[Any] = None,
+        llm_metric_collection: str = None,
+        llm_metrics: List[BaseMetric] = None,
+        confident_prompt: Prompt = None,
     ) -> None:
         self._inner = inner
-        self.
-        self.
-        self.
+        self._llm_metric_collection = llm_metric_collection
+        self._llm_metrics = llm_metrics
+        self._confident_prompt = confident_prompt
 
-    # Delegate attributes not overridden
     def __getattr__(self, name: str) -> Any:
         return getattr(self._inner, name)
 
@@ -59,29 +63,48 @@ class _ObservedModel(Model):
         previous_response_id,
         conversation_id,
         prompt,
+        **kwargs,
     ):
         model_name = self._get_model_name()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with Observer(
+            span_type="llm",
+            func_name="LLM",
+            function_kwargs={
+                "system_instructions": system_instructions,
+                "input": input,
+                "model_settings": model_settings,
+                "tools": tools,
+                "output_schema": output_schema,
+                "handoffs": handoffs,
+                # "tracing": tracing, # not important for llm spans
+                # "previous_response_id": previous_response_id, # not important for llm spans
+                # "conversation_id": conversation_id, # not important for llm spans
+                "prompt": prompt,
+                **kwargs,
+            },
+            observe_kwargs={"model": model_name},
+            metrics=self._llm_metrics,
+            metric_collection=self._llm_metric_collection,
+        ) as observer:
+            result = await self._inner.get_response(
+                system_instructions,
+                input,
+                model_settings,
+                tools,
+                output_schema,
+                handoffs,
+                tracing,
+                previous_response_id=previous_response_id,
+                conversation_id=conversation_id,
+                prompt=prompt,
+                **kwargs,
+            )
+            llm_span: LlmSpan = current_span_context.get()
+            llm_span.prompt = self._confident_prompt
+
+            observer.result = make_json_serializable(result.output)
+
+            return result
 
     def stream_response(
         self,
@@ -96,91 +119,77 @@ class _ObservedModel(Model):
         previous_response_id,
         conversation_id,
         prompt,
+        **kwargs,
     ):
-
-        # wrapped = observe(
-        #     metrics=self._metrics,
-        #     metric_collection=self._metric_collection,
-        #     type="llm",
-        #     model=model_name,
-        # )(self._inner.stream_response)
-        # return wrapped(
-        #     system_instructions,
-        #     input,
-        #     model_settings,
-        #     tools,
-        #     output_schema,
-        #     handoffs,
-        #     tracing,
-        #     previous_response_id=previous_response_id,
-        #     conversation_id=conversation_id,
-        #     prompt=prompt,
-        # )
-        return self._inner.stream_response(
-            system_instructions,
-            input,
-            model_settings,
-            tools,
-            output_schema,
-            handoffs,
-            tracing,
-            previous_response_id=previous_response_id,
-            conversation_id=conversation_id,
-            prompt=prompt,
-        )
-
-
-class _ObservedProvider(ModelProvider):
-    def __init__(
-        self,
-        base: ModelProvider,
-        *,
-        metrics: Optional[list[Any]] = None,
-        metric_collection: Optional[str] = None,
-        deepeval_prompt: Optional[Any] = None,
-    ) -> None:
-        self._base = base
-        self._metrics = metrics
-        self._metric_collection = metric_collection
-        self._deepeval_prompt = deepeval_prompt
+        model_name = self._get_model_name()
 
-
-
-
-
-
-
-
-
+        async def _gen():
+            observer = Observer(
+                span_type="llm",
+                func_name="LLM",
+                function_kwargs={
+                    "system_instructions": system_instructions,
+                    "input": input,
+                    "model_settings": model_settings,
+                    "tools": tools,
+                    "output_schema": output_schema,
+                    "handoffs": handoffs,
+                    # "tracing": tracing,
+                    # "previous_response_id": previous_response_id,
+                    # "conversation_id": conversation_id,
+                    "prompt": prompt,
+                    **kwargs,
+                },
+                observe_kwargs={"model": model_name},
+                metrics=self._llm_metrics,
+                metric_collection=self._llm_metric_collection,
+            )
+            observer.__enter__()
+
+            llm_span: LlmSpan = current_span_context.get()
+            llm_span.prompt = self._confident_prompt
+
+            try:
+                async for event in self._inner.stream_response(
+                    system_instructions,
+                    input,
+                    model_settings,
+                    tools,
+                    output_schema,
+                    handoffs,
+                    tracing,
+                    previous_response_id=previous_response_id,
+                    conversation_id=conversation_id,
+                    prompt=prompt,
+                ):
+
+                    if isinstance(event, ResponseCompletedEvent):
+                        observer.result = (
+                            event.response.output_text
+                        ) # TODO: support other response types
+
+                    yield event
+
+                observer.__exit__(None, None, None)
+            except Exception as e:
+                observer.__exit__(type(e), e, e.__traceback__)
+                raise
+            finally:
+
+                observer.__exit__(None, None, None)
+
+        return _gen()
 
 
 @dataclass
 class DeepEvalAgent(BaseAgent[TContext], Generic[TContext]):
     """
-    A subclass of agents.Agent
-    and ensures the underlying model's `get_response` is wrapped with deepeval.observe.
+    A subclass of agents.Agent.
     """
 
-
-
-
+    llm_metric_collection: str = None
+    llm_metrics: List[BaseMetric] = None
+    confident_prompt: Prompt = None
 
     def __post_init__(self):
         super().__post_init__()
-        # If a direct Model instance is set on the agent, wrap it here.
-        if self.model is not None and not isinstance(self.model, str):
-            try:
-                from agents.models.interface import (
-                    Model as _Model,
-                ) # local import for safety
-
-                if isinstance(self.model, _Model):
-                    self.model = _ObservedModel(
-                        self.model,
-                        metrics=self.metrics,
-                        metric_collection=self.metric_collection,
-                        deepeval_prompt=self.deepeval_prompt,
-                    )
-            except Exception:
-                # If we can't import or wrap, silently skip.
-                pass
```