deepeval 3.6.7__py3-none-any.whl → 3.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/errors.py +20 -2
- deepeval/evaluate/execute.py +725 -217
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +13 -3
- deepeval/integrations/crewai/__init__.py +2 -1
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +8 -8
- deepeval/metrics/mcp/mcp_task_completion.py +7 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/task_completion/task_completion.py +1 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/openai/extractors.py +61 -16
- deepeval/openai/patch.py +8 -12
- deepeval/openai/types.py +1 -1
- deepeval/openai/utils.py +108 -1
- deepeval/prompt/prompt.py +1 -0
- deepeval/prompt/utils.py +43 -14
- deepeval/synthesizer/synthesizer.py +11 -10
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/test_run.py +190 -207
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +23 -4
- deepeval/tracing/trace_context.py +53 -38
- deepeval/tracing/tracing.py +23 -0
- deepeval/tracing/types.py +16 -14
- deepeval/utils.py +21 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/RECORD +75 -53
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- deepeval/tracing/message_types/__init__.py +0 -10
- deepeval/tracing/message_types/base.py +0 -6
- deepeval/tracing/message_types/messages.py +0 -14
- deepeval/tracing/message_types/tools.py +0 -18
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/evaluate/execute.py
CHANGED
@@ -43,15 +43,19 @@ from deepeval.tracing.api import (
 )
 from deepeval.dataset import Golden
 from deepeval.contextvars import set_current_golden, reset_current_golden
-from deepeval.errors import MissingTestCaseParamsError
+from deepeval.errors import MissingTestCaseParamsError, DeepEvalError
 from deepeval.metrics.utils import copy_metrics
-from deepeval.utils import
+from deepeval.utils import (
+    get_or_create_event_loop,
+    shorten,
+    len_medium,
+    format_error_text,
+)
 from deepeval.telemetry import capture_evaluation_run
 from deepeval.metrics import (
     BaseMetric,
     BaseConversationalMetric,
     BaseMultimodalMetric,
-    TaskCompletionMetric,
 )
 from deepeval.metrics.indicator import (
     measure_metrics_with_indicator,
@@ -82,10 +86,13 @@ from deepeval.evaluate.utils import (
     create_metric_data,
     create_test_result,
     count_metrics_in_trace,
+    count_total_metrics_for_trace,
+    count_metrics_in_span_subtree,
     extract_trace_test_results,
 )
 from deepeval.utils import add_pbar, update_pbar, custom_console
-from deepeval.tracing.types import TestCaseMetricPair
+from deepeval.tracing.types import TestCaseMetricPair, TraceSpanStatus
+from deepeval.tracing.api import TraceSpanApiStatus
 from deepeval.config.settings import get_settings
 from deepeval.test_run import TEMP_FILE_PATH
 from deepeval.confident.api import is_confident
@@ -97,6 +104,108 @@ from deepeval.test_run.hyperparameters import (
 logger = logging.getLogger(__name__)
 
 
+def _skip_metrics_for_error(
+    span: Optional[BaseSpan] = None,
+    trace: Optional[Trace] = None,
+) -> bool:
+    # trace failure: skip everything under this trace
+    if trace is not None and trace.status == TraceSpanStatus.ERRORED:
+        return True
+    # span failure: skip this span's metrics
+    if span is not None and span.status == TraceSpanStatus.ERRORED:
+        return True
+    return False
+
+
+def _trace_error(current_trace: Trace) -> Optional[str]:
+    def _first_err(s: BaseSpan) -> Optional[str]:
+        if s.status == TraceSpanStatus.ERRORED and s.error:
+            return s.error
+        for c in s.children or []:
+            e = _first_err(c)
+            if e:
+                return e
+        return None
+
+    for root in current_trace.root_spans or []:
+        e = _first_err(root)
+        if e:
+            return e
+    return None
+
+
+def _get_trace_by_uuid_anywhere(trace_uuid: str):
+    """
+    Resolver for a trace UUID across the manager's state.
+
+    First tries the manager's indexed lookup, which covers active/in-flight traces,
+    then does a linear scan of the full `trace_manager.traces` list, which covers
+    traces that were recorded/closed earlier or not yet indexed. Returns
+    the concrete Trace object or None if not found.
+    """
+    tr = trace_manager.get_trace_by_uuid(trace_uuid)
+    if tr:
+        return tr
+    for tr in trace_manager.traces:
+        if tr.uuid == trace_uuid:
+            return tr
+    return None
+
+
+def _pick_root_for_marking(trace):
+    """
+    Choose the most appropriate root span to annotate on error/cancel.
+
+    Heuristic:
+    - Prefer the most recent open root, which will have no `end_time` since this is the
+      span currently in flight.
+    - If none are open, use the last root span if it exists.
+    - If the trace has no roots, return None.
+
+    This favors marking the active root in multi root traces while remaining
+    stable for already closed traces.
+    """
+    open_roots = [rs for rs in trace.root_spans if rs.end_time is None]
+    return (
+        open_roots[-1]
+        if open_roots
+        else (trace.root_spans[-1] if trace.root_spans else None)
+    )
+
+
+def _resolve_trace_and_root_for_task(t: asyncio.Task):
+    """
+    Resolve trace and root for a completed task using the weak binding map.
+
+    Steps:
+    1. Look up the task in `trace_manager.task_bindings` to get the
+       bound `trace_uuid` and, if available, `root_span_uuid`.
+    2. Resolve the Trace with `_get_trace_by_uuid_anywhere`.
+    3. If a bound root UUID exists, try to find that exact root on the trace.
+    4. Otherwise, fall back to `_pick_root_for_marking(trace)`.
+
+    Returns a trace / root tuple. Either may be `None` when no binding is
+    present. This function is used by `on_task_done` to robustly mark error/cancel
+    states without assuming a single root trace or a root that is still open.
+    """
+    binding = trace_manager.task_bindings.get(t) or {}
+    trace_uuid = binding.get("trace_uuid")
+    root_span_uuid = binding.get("root_span_uuid")
+
+    trace = _get_trace_by_uuid_anywhere(trace_uuid) if trace_uuid else None
+    root = None
+
+    if trace and root_span_uuid:
+        root = next(
+            (rs for rs in trace.root_spans if rs.uuid == root_span_uuid), None
+        )
+
+    if trace and root is None:
+        root = _pick_root_for_marking(trace)
+
+    return trace, root
+
+
 async def _snapshot_tasks():
     cur = asyncio.current_task()
     # `all_tasks` returns tasks for the current running loop only
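Illustrative note (not part of the diff): the docstrings above describe a root-picking heuristic for annotating errored or cancelled traces. The standalone sketch below restates that heuristic with made-up FakeSpan/FakeTrace stand-ins rather than deepeval's real BaseSpan/Trace types, purely to show the intended behavior.

# Hypothetical stand-in types; only the fields the heuristic reads are modeled.
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class FakeSpan:
    uuid: str
    end_time: Optional[float] = None  # None means the span is still open


@dataclass
class FakeTrace:
    root_spans: List[FakeSpan] = field(default_factory=list)


def pick_root_for_marking(trace: FakeTrace) -> Optional[FakeSpan]:
    # Prefer the most recent open root; otherwise the last root; otherwise None.
    open_roots = [rs for rs in trace.root_spans if rs.end_time is None]
    if open_roots:
        return open_roots[-1]
    return trace.root_spans[-1] if trace.root_spans else None


trace = FakeTrace(
    root_spans=[FakeSpan("a", end_time=1.0), FakeSpan("b"), FakeSpan("c", end_time=2.0)]
)
assert pick_root_for_marking(trace).uuid == "b"  # "b" is the only still-open root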
@@ -115,6 +224,20 @@ def _gather_timeout() -> float:
     )
 
 
+def filter_duplicate_results(
+    main_result: TestResult, results: List[TestResult]
+) -> List[TestResult]:
+    return [
+        result
+        for result in results
+        if not (
+            (result.input == main_result.input)
+            and (result.actual_output == main_result.actual_output)
+            and (result.metrics_data == main_result.metrics_data)
+        )
+    ]
+
+
 ###########################################
 ### E2E Evals #############################
 ###########################################
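Illustrative note (not part of the diff): `filter_duplicate_results` drops trace-level results that repeat the main test result on input, actual output, and metrics data. The sketch below mimics that behavior with a simplified stand-in for TestResult (a namedtuple with only the three compared fields), which is an assumption for illustration, not deepeval's actual class.

from collections import namedtuple

Result = namedtuple("Result", ["input", "actual_output", "metrics_data"])

main = Result("hi", "hello", None)
others = [Result("hi", "hello", None), Result("bye", "goodbye", None)]

# Results identical to the main result on all three fields are filtered out,
# so only the genuinely distinct trace-level result survives.
unique = [
    r
    for r in others
    if not (
        r.input == main.input
        and r.actual_output == main.actual_output
        and r.metrics_data == main.metrics_data
    )
]
assert unique == [Result("bye", "goodbye", None)]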
@@ -380,7 +503,10 @@ async def a_execute_test_cases(
 
     async def execute_with_semaphore(func: Callable, *args, **kwargs):
         async with semaphore:
-            return await
+            return await asyncio.wait_for(
+                func(*args, **kwargs),
+                timeout=_per_task_timeout(),
+            )
 
     global_test_run_cache_manager.disable_write_cache = (
         cache_config.write_cache is False
@@ -499,7 +625,20 @@ async def a_execute_test_cases(
                     tasks.append(asyncio.create_task(task))
 
                     await asyncio.sleep(async_config.throttle_value)
-
+
+            try:
+                await asyncio.wait_for(
+                    asyncio.gather(*tasks),
+                    timeout=_gather_timeout(),
+                )
+            except asyncio.TimeoutError:
+                # Cancel any still-pending tasks and drain them
+                for t in tasks:
+                    if not t.done():
+                        t.cancel()
+                await asyncio.gather(*tasks, return_exceptions=True)
+                raise
+
     else:
         for test_case in test_cases:
             with capture_evaluation_run("test case"):
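Illustrative note (not part of the diff): several hunks in this release wrap the semaphore-guarded calls in a per-task asyncio.wait_for and the batch-level asyncio.gather in a second wait_for, cancelling and draining leftover tasks on timeout. The self-contained sketch below shows the same pattern; the timeout values and the worker coroutine are placeholders, not deepeval's _per_task_timeout()/_gather_timeout() settings.

import asyncio


async def worker(i: int) -> int:
    await asyncio.sleep(0.01 * i)
    return i


async def run_batch() -> list:
    semaphore = asyncio.Semaphore(2)

    async def with_semaphore(coro):
        async with semaphore:
            # per-task timeout
            return await asyncio.wait_for(coro, timeout=5.0)

    tasks = [asyncio.create_task(with_semaphore(worker(i))) for i in range(5)]
    try:
        # batch-level timeout over the whole gather
        return await asyncio.wait_for(asyncio.gather(*tasks), timeout=30.0)
    except asyncio.TimeoutError:
        # cancel any still-pending tasks and drain them before propagating
        for t in tasks:
            if not t.done():
                t.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)
        raise


print(asyncio.run(run_batch()))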
@@ -572,7 +711,19 @@ async def a_execute_test_cases(
                 tasks.append(asyncio.create_task(task))
 
                 await asyncio.sleep(async_config.throttle_value)
-
+
+        try:
+            await asyncio.wait_for(
+                asyncio.gather(*tasks),
+                timeout=_gather_timeout(),
+            )
+        except asyncio.TimeoutError:
+            # Cancel any still-pending tasks and drain them
+            for t in tasks:
+                if not t.done():
+                    t.cancel()
+            await asyncio.gather(*tasks, return_exceptions=True)
+            raise
 
     return test_results
 
@@ -847,6 +998,7 @@ def execute_agentic_test_cases(
                     _progress=progress,
                     _pbar_callback_id=pbar_tags_id,
                 ):
+
                     if asyncio.iscoroutinefunction(observed_callback):
                         loop = get_or_create_event_loop()
                         coro = observed_callback(golden.input)
@@ -898,10 +1050,11 @@ def execute_agentic_test_cases(
         pbar_eval_id: Optional[int] = None,
     ):
         # Create API Span
-        metrics: List[BaseMetric] = span.metrics
+        metrics: List[BaseMetric] = list(span.metrics or [])
         api_span: BaseApiSpan = (
             trace_manager._convert_span_to_api_span(span)
         )
+
         if isinstance(span, AgentSpan):
             trace_api.agent_spans.append(api_span)
         elif isinstance(span, LlmSpan):
@@ -914,14 +1067,27 @@ def execute_agentic_test_cases(
         else:
             trace_api.base_spans.append(api_span)
 
+        # Skip errored trace/span
+        if _skip_metrics_for_error(span=span, trace=current_trace):
+            api_span.status = TraceSpanApiStatus.ERRORED
+            api_span.error = span.error or _trace_error(
+                current_trace
+            )
+            if progress and pbar_eval_id is not None:
+                update_pbar(
+                    progress,
+                    pbar_eval_id,
+                    advance=count_metrics_in_span_subtree(span),
+                )
+            return
+
         for child in span.children:
             dfs(child, progress, pbar_eval_id)
 
-        if span.metrics
+        if not span.metrics:
             return
-
-
-        for metric in span.metrics
+        requires_trace = any(
+            metric.requires_trace for metric in span.metrics
         )
 
         llm_test_case = None
@@ -939,18 +1105,30 @@ def execute_agentic_test_cases(
                 tools_called=span.tools_called,
                 expected_tools=span.expected_tools,
             )
-        if llm_test_case is None and not has_task_completion:
-            raise ValueError(
-                "Unable to run metrics on span without LLMTestCase. Are you sure you called `update_current_span()`?"
-            )
 
         # add trace if task completion
-        if
+        if requires_trace:
             if llm_test_case is None:
                 llm_test_case = LLMTestCase(input="None")
             llm_test_case._trace_dict = (
                 trace_manager.create_nested_spans_dict(span)
             )
+        else:
+            if llm_test_case is None:
+                api_span.status = TraceSpanApiStatus.ERRORED
+                api_span.error = format_error_text(
+                    DeepEvalError(
+                        "Span has metrics but no LLMTestCase. "
+                        "Are you sure you called `update_current_span()`?"
+                    )
+                )
+                if progress and pbar_eval_id is not None:
+                    update_pbar(
+                        progress,
+                        pbar_eval_id,
+                        advance=count_metrics_in_span_subtree(span),
+                    )
+                return
 
         # Preparing metric calculation
         api_span.metrics_data = []
@@ -989,72 +1167,111 @@ def execute_agentic_test_cases(
 
                 start_time = time.perf_counter()
 
+                skip_metrics_for_this_golden = False
                 # Handle trace-level metrics
-                if current_trace
-
-
-
-
-
-
-
-                    llm_test_case = LLMTestCase(
-                        input=str(current_trace.input),
-                        actual_output=(
-                            str(current_trace.output)
-                            if current_trace.output is not None
-                            else None
+                if _skip_metrics_for_error(trace=current_trace):
+                    trace_api.status = TraceSpanApiStatus.ERRORED
+                    if progress and pbar_eval_id is not None:
+                        update_pbar(
+                            progress,
+                            pbar_eval_id,
+                            advance=count_total_metrics_for_trace(
+                                current_trace
                         ),
-                        expected_output=current_trace.expected_output,
-                        context=current_trace.context,
-                        retrieval_context=current_trace.retrieval_context,
-                        tools_called=current_trace.tools_called,
-                        expected_tools=current_trace.expected_tools,
                     )
-
-
-
+                else:
+                    if current_trace.metrics:
+                        requires_trace = any(
+                            metric.requires_trace
+                            for metric in current_trace.metrics
                         )
 
-
-                        if
-                        llm_test_case = LLMTestCase(
-
-
-
+                        llm_test_case = None
+                        if current_trace.input:
+                            llm_test_case = LLMTestCase(
+                                input=str(current_trace.input),
+                                actual_output=(
+                                    str(current_trace.output)
+                                    if current_trace.output is not None
+                                    else None
+                                ),
+                                expected_output=current_trace.expected_output,
+                                context=current_trace.context,
+                                retrieval_context=current_trace.retrieval_context,
+                                tools_called=current_trace.tools_called,
+                                expected_tools=current_trace.expected_tools,
                             )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                        if requires_trace:
+                            if llm_test_case is None:
+                                llm_test_case = LLMTestCase(input="None")
+                            llm_test_case._trace_dict = (
+                                trace_manager.create_nested_spans_dict(
+                                    current_trace.root_spans[0]
+                                )
+                            )
+                        else:
+                            if llm_test_case is None:
+                                current_trace.status = TraceSpanStatus.ERRORED
+                                trace_api.status = TraceSpanApiStatus.ERRORED
+                                if current_trace.root_spans:
+                                    current_trace.root_spans[0].status = (
+                                        TraceSpanStatus.ERRORED
+                                    )
+                                    current_trace.root_spans[0].error = (
+                                        format_error_text(
+                                            DeepEvalError(
+                                                "Trace has metrics but no LLMTestCase (missing input/output). "
+                                                "Are you sure you called `update_current_trace()`?"
+                                            )
+                                        )
+                                    )
+                                if progress and pbar_eval_id is not None:
+                                    update_pbar(
+                                        progress,
+                                        pbar_eval_id,
+                                        advance=count_total_metrics_for_trace(
+                                            current_trace
+                                        ),
+                                    )
+                                skip_metrics_for_this_golden = True
+
+                        if not skip_metrics_for_this_golden:
+                            for metric in current_trace.metrics:
+                                metric.skipped = False
+                                metric.error = None
+                                if display_config.verbose_mode is not None:
+                                    metric.verbose_mode = (
+                                        display_config.verbose_mode
+                                    )
+
+                            trace_api.metrics_data = []
+                            for metric in current_trace.metrics:
+                                res = _execute_metric(
+                                    metric=metric,
+                                    test_case=llm_test_case,
+                                    show_metric_indicator=show_metric_indicator,
+                                    in_component=True,
+                                    error_config=error_config,
+                                )
+                                if res == "skip":
+                                    continue
+
+                                if not metric.skipped:
+                                    metric_data = create_metric_data(metric)
+                                    trace_api.metrics_data.append(metric_data)
+                                    api_test_case.update_metric_data(
+                                        metric_data
+                                    )
+                                    api_test_case.update_status(
+                                        metric_data.success
+                                    )
+                                    update_pbar(progress, pbar_eval_id)
+
+                    # Then handle span-level metrics
+                    dfs(current_trace.root_spans[0], progress, pbar_eval_id)
 
-                # Then handle span-level metrics
-                dfs(current_trace.root_spans[0], progress, pbar_eval_id)
                 end_time = time.perf_counter()
                 run_duration = end_time - start_time
-
                 # Update test run
                 api_test_case.update_run_duration(run_duration)
                 test_run_manager.update_test_run(api_test_case, test_case)
@@ -1102,7 +1319,10 @@ async def a_execute_agentic_test_cases(
 
     async def execute_with_semaphore(func: Callable, *args, **kwargs):
         async with semaphore:
-            return await
+            return await asyncio.wait_for(
+                func(*args, **kwargs),
+                timeout=_per_task_timeout(),
+            )
 
     test_run_manager = global_test_run_manager
     test_run_manager.save_to_disk = cache_config.write_cache
@@ -1149,7 +1369,19 @@ async def a_execute_agentic_test_cases(
                     tasks.append(asyncio.create_task(task))
                     await asyncio.sleep(async_config.throttle_value)
 
-
+            try:
+                await asyncio.wait_for(
+                    asyncio.gather(*tasks),
+                    timeout=_gather_timeout(),
+                )
+            except asyncio.TimeoutError:
+                # Cancel any still-pending tasks and drain them
+                for t in tasks:
+                    if not t.done():
+                        t.cancel()
+                await asyncio.gather(*tasks, return_exceptions=True)
+                raise
+
     else:
         for golden in goldens:
             with capture_evaluation_run("golden"):
@@ -1266,7 +1498,7 @@ async def _a_execute_agentic_test_case(
         )
 
     await _a_execute_trace_test_case(
-        trace=
+        trace=current_trace,
         trace_api=trace_api,
         api_test_case=api_test_case,
         ignore_errors=ignore_errors,
@@ -1278,9 +1510,10 @@ async def _a_execute_agentic_test_case(
         _use_bar_indicator=_use_bar_indicator,
     )
 
-    async def dfs(span: BaseSpan):
+    async def dfs(trace: Trace, span: BaseSpan):
         await _a_execute_span_test_case(
             span=span,
+            current_trace=trace,
             trace_api=trace_api,
             api_test_case=api_test_case,
             ignore_errors=ignore_errors,
@@ -1292,36 +1525,58 @@ async def _a_execute_agentic_test_case(
             test_run_manager=test_run_manager,
             _use_bar_indicator=_use_bar_indicator,
         )
-
+
+        if _skip_metrics_for_error(span=span, trace=trace):
+            return
+
+        child_tasks = [
+            asyncio.create_task(dfs(trace, child)) for child in span.children
+        ]
         if child_tasks:
-
+            try:
+                await asyncio.wait_for(
+                    asyncio.gather(*child_tasks),
+                    timeout=_gather_timeout(),
+                )
+            except asyncio.TimeoutError:
+                for t in child_tasks:
+                    if not t.done():
+                        t.cancel()
+                await asyncio.gather(*child_tasks, return_exceptions=True)
+                raise
 
     test_start_time = time.perf_counter()
-
-
-
-
-
-
-
-
-
-
-
+
+    if not _skip_metrics_for_error(trace=current_trace):
+        if current_trace and current_trace.root_spans:
+            await dfs(current_trace, current_trace.root_spans[0])
+        else:
+            if (
+                logger.isEnabledFor(logging.DEBUG)
+                and get_settings().DEEPEVAL_VERBOSE_MODE
+            ):
+                logger.debug(
+                    "Skipping DFS: empty trace or no root spans (trace=%s)",
+                    current_trace.uuid if current_trace else None,
+                )
 
     test_end_time = time.perf_counter()
     run_duration = test_end_time - test_start_time
 
     api_test_case.update_run_duration(run_duration)
     test_run_manager.update_test_run(api_test_case, test_case)
-
-
+    main_result = create_test_result(api_test_case)
+    trace_results = extract_trace_test_results(trace_api)
+    unique_trace_results = filter_duplicate_results(main_result, trace_results)
+    test_results.append(main_result)
+    test_results.extend(unique_trace_results)
 
     update_pbar(progress, pbar_id)
 
 
 async def _a_execute_span_test_case(
     span: BaseSpan,
+    current_trace: Trace,
     trace_api: TraceApi,
     api_test_case: LLMApiTestCase,
     ignore_errors: bool,
@@ -1346,12 +1601,22 @@ async def _a_execute_span_test_case(
     else:
         trace_api.base_spans.append(api_span)
 
-    if span
+    if _skip_metrics_for_error(span=span, trace=current_trace):
+        api_span.status = TraceSpanApiStatus.ERRORED
+        api_span.error = span.error or _trace_error(current_trace)
+        if progress and pbar_eval_id is not None:
+            update_pbar(
+                progress,
+                pbar_eval_id,
+                advance=count_metrics_in_span_subtree(span),
+            )
         return
 
-
-
-
+    metrics: List[BaseMetric] = list(span.metrics or [])
+    if not metrics:
+        return
+
+    requires_trace = any(metric.requires_trace for metric in metrics)
 
     llm_test_case = None
     if span.input:
@@ -1364,17 +1629,29 @@ async def _a_execute_span_test_case(
             tools_called=span.tools_called,
             expected_tools=span.expected_tools,
         )
-
-
-
-
+
+    if not requires_trace:
+        if llm_test_case is None:
+            api_span.status = TraceSpanApiStatus.ERRORED
+            api_span.error = format_error_text(
+                DeepEvalError(
+                    "Span has metrics but no LLMTestCase. "
+                    "Are you sure you called `update_current_span()`?"
+                )
+            )
+            if progress and pbar_eval_id is not None:
+                update_pbar(
+                    progress,
+                    pbar_eval_id,
+                    advance=count_metrics_in_span_subtree(span),
+                )
+            return
 
     show_metrics_indicator = show_indicator and not _use_bar_indicator
-    metrics: List[BaseMetric] = span.metrics
     test_case: Optional[LLMTestCase] = llm_test_case
 
     # add trace if task completion
-    if
+    if requires_trace:
         if test_case is None:
             test_case = LLMTestCase(input="None")
         test_case._trace_dict = trace_manager.create_nested_spans_dict(span)
@@ -1418,12 +1695,22 @@ async def _a_execute_trace_test_case(
     pbar_eval_id: Optional[int],
     _use_bar_indicator: bool,
 ):
-
+
+    if _skip_metrics_for_error(trace=trace):
+        trace_api.status = TraceSpanApiStatus.ERRORED
+        if progress and pbar_eval_id is not None:
+            update_pbar(
+                progress,
+                pbar_eval_id,
+                advance=count_total_metrics_for_trace(trace),
+            )
         return
 
-
-
-
+    metrics: List[BaseMetric] = list(trace.metrics or [])
+    if not metrics:
+        return
+
+    requires_trace = any(metric.requires_trace for metric in metrics)
 
     llm_test_case = None
     if trace.input:
@@ -1438,17 +1725,32 @@ async def _a_execute_trace_test_case(
             tools_called=trace.tools_called,
             expected_tools=trace.expected_tools,
         )
-
-
-
-
+
+    if not requires_trace:
+        if llm_test_case is None:
+            trace.status = TraceSpanStatus.ERRORED
+            trace_api.status = TraceSpanApiStatus.ERRORED
+            if trace.root_spans:
+                trace.root_spans[0].status = TraceSpanStatus.ERRORED
+                trace.root_spans[0].error = format_error_text(
+                    DeepEvalError(
+                        "Trace has metrics but no LLMTestCase (missing input/output). "
+                        "Are you sure you called `update_current_trace()`?"
+                    )
+                )
+            if progress and pbar_eval_id is not None:
+                update_pbar(
+                    progress,
+                    pbar_eval_id,
+                    advance=count_total_metrics_for_trace(trace),
+                )
+            return
 
     show_metrics_indicator = show_indicator and not _use_bar_indicator
-    metrics: List[BaseMetric] = trace.metrics
     test_case: Optional[LLMTestCase] = llm_test_case
 
     # add trace if task completion
-    if
+    if requires_trace:
         if test_case is None:
             test_case = LLMTestCase(input="None")
         test_case._trace_dict = trace_manager.create_nested_spans_dict(
@@ -1578,11 +1880,12 @@ def execute_agentic_test_cases_from_loop(
         pbar_eval_id: Optional[int] = None,
     ):
         # Create API Span
-        metrics: List[BaseMetric] = span.metrics
+        metrics: List[BaseMetric] = list(span.metrics or [])
 
         api_span: BaseApiSpan = (
             trace_manager._convert_span_to_api_span(span)
         )
+
         if isinstance(span, AgentSpan):
             trace_api.agent_spans.append(api_span)
         elif isinstance(span, LlmSpan):
@@ -1595,9 +1898,30 @@ def execute_agentic_test_cases_from_loop(
         else:
             trace_api.base_spans.append(api_span)
 
+        # Skip errored trace/span
+        if _skip_metrics_for_error(span=span, trace=current_trace):
+            api_span.status = TraceSpanApiStatus.ERRORED
+            api_span.error = span.error or _trace_error(
+                current_trace
+            )
+            if progress and pbar_eval_id is not None:
+                update_pbar(
+                    progress,
+                    pbar_eval_id,
+                    advance=count_metrics_in_span_subtree(span),
+                )
+            return
+
         for child in span.children:
             dfs(child, progress, pbar_eval_id)
 
+        if not span.metrics:
+            return
+
+        requires_trace = any(
+            metric.requires_trace for metric in metrics
+        )
+
         llm_test_case = None
         if span.input is not None:
             llm_test_case = LLMTestCase(
@@ -1613,20 +1937,29 @@ def execute_agentic_test_cases_from_loop(
                 tools_called=span.tools_called,
                 expected_tools=span.expected_tools,
             )
-        if span.metrics is None or llm_test_case is None:
-            return
-
-        has_task_completion = any(
-            isinstance(metric, TaskCompletionMetric)
-            for metric in metrics
-        )
 
-        if
+        if requires_trace:
             if llm_test_case is None:
                 llm_test_case = LLMTestCase(input="None")
             llm_test_case._trace_dict = (
                 trace_manager.create_nested_spans_dict(span)
             )
+        else:
+            if llm_test_case is None:
+                api_span.status = TraceSpanApiStatus.ERRORED
+                api_span.error = format_error_text(
+                    DeepEvalError(
+                        "Span has metrics but no LLMTestCase. "
+                        "Are you sure you called `update_current_span()`?"
+                    )
+                )
+                if progress and pbar_eval_id is not None:
+                    update_pbar(
+                        progress,
+                        pbar_eval_id,
+                        advance=count_metrics_in_span_subtree(span),
+                    )
+                return
 
         # Preparing metric calculation
         api_span.metrics_data = []
@@ -1670,77 +2003,123 @@ def execute_agentic_test_cases_from_loop(
                 start_time = time.perf_counter()
 
                 # Handle trace-level metrics
-
-
-
-
-
-
-
-
-
-                        input=str(current_trace.input),
-                        actual_output=(
-                            str(current_trace.output)
-                            if current_trace.output is not None
-                            else None
+                skip_metrics_for_this_golden = False
+                if _skip_metrics_for_error(trace=current_trace):
+                    trace_api.status = TraceSpanApiStatus.ERRORED
+                    if progress and pbar_eval_id is not None:
+                        update_pbar(
+                            progress,
+                            pbar_eval_id,
+                            advance=count_total_metrics_for_trace(
+                                current_trace
                         ),
-                        expected_output=current_trace.expected_output,
-                        context=current_trace.context,
-                        retrieval_context=current_trace.retrieval_context,
-                        tools_called=current_trace.tools_called,
-                        expected_tools=current_trace.expected_tools,
                     )
-
-
-
+                else:
+                    if current_trace.metrics:
+                        requires_trace = any(
+                            metric.requires_trace
+                            for metric in current_trace.metrics
                         )
 
-
-                        if
-                        llm_test_case = LLMTestCase(
-
-
-
+                        llm_test_case = None
+                        if current_trace.input:
+                            llm_test_case = LLMTestCase(
+                                input=str(current_trace.input),
+                                actual_output=(
+                                    str(current_trace.output)
+                                    if current_trace.output is not None
+                                    else None
+                                ),
+                                expected_output=current_trace.expected_output,
+                                context=current_trace.context,
+                                retrieval_context=current_trace.retrieval_context,
+                                tools_called=current_trace.tools_called,
+                                expected_tools=current_trace.expected_tools,
                             )
-                        )
-
-                        for metric in current_trace.metrics:
-                            metric.skipped = False
-                            metric.error = None
-                            if display_config.verbose_mode is not None:
-                                metric.verbose_mode = display_config.verbose_mode
-
-                        trace_api.metrics_data = []
-                        for metric in current_trace.metrics:
-                            res = _execute_metric(
-                                metric=metric,
-                                test_case=llm_test_case,
-                                show_metric_indicator=show_metric_indicator,
-                                in_component=True,
-                                error_config=error_config,
-                            )
-                            if res == "skip":
-                                continue
-
-                            if not metric.skipped:
-                                metric_data = create_metric_data(metric)
-                                trace_api.metrics_data.append(metric_data)
-                                api_test_case.update_metric_data(metric_data)
-                                api_test_case.update_status(metric_data.success)
-                                update_pbar(progress, pbar_eval_id)
-
-                    # Then handle span-level metrics
-                    dfs(current_trace.root_spans[0], progress, pbar_eval_id)
-                    end_time = time.perf_counter()
-                    run_duration = end_time - start_time
 
-
-
-
-
+                        if requires_trace:
+                            if llm_test_case is None:
+                                llm_test_case = LLMTestCase(input="None")
+                            llm_test_case._trace_dict = (
+                                trace_manager.create_nested_spans_dict(
+                                    current_trace.root_spans[0]
+                                )
+                            )
+                        else:
+                            if llm_test_case is None:
+                                current_trace.status = TraceSpanStatus.ERRORED
+                                trace_api.status = TraceSpanApiStatus.ERRORED
+                                if current_trace.root_spans:
+                                    current_trace.root_spans[0].status = (
+                                        TraceSpanStatus.ERRORED
+                                    )
+                                    current_trace.root_spans[0].error = (
+                                        format_error_text(
+                                            DeepEvalError(
+                                                "Trace has metrics but no LLMTestCase (missing input/output). "
+                                                "Are you sure you called `update_current_trace()`?"
+                                            )
+                                        )
+                                    )
+                                if progress and pbar_eval_id is not None:
+                                    update_pbar(
+                                        progress,
+                                        pbar_eval_id,
+                                        advance=count_total_metrics_for_trace(
+                                            current_trace
+                                        ),
+                                    )
+                                skip_metrics_for_this_golden = True
+
+                        if not skip_metrics_for_this_golden:
+                            for metric in current_trace.metrics:
+                                metric.skipped = False
+                                metric.error = None
+                                if display_config.verbose_mode is not None:
+                                    metric.verbose_mode = (
+                                        display_config.verbose_mode
+                                    )
+
+                            trace_api.metrics_data = []
+                            for metric in current_trace.metrics:
+                                res = _execute_metric(
+                                    metric=metric,
+                                    test_case=llm_test_case,
+                                    show_metric_indicator=show_metric_indicator,
+                                    in_component=True,
+                                    error_config=error_config,
+                                )
+                                if res == "skip":
+                                    continue
+
+                                if not metric.skipped:
+                                    metric_data = create_metric_data(metric)
+                                    trace_api.metrics_data.append(metric_data)
+                                    api_test_case.update_metric_data(
+                                        metric_data
+                                    )
+                                    api_test_case.update_status(
+                                        metric_data.success
+                                    )
+                                    update_pbar(progress, pbar_eval_id)
+
+                    # Then handle span-level metrics
+                    dfs(current_trace.root_spans[0], progress, pbar_eval_id)
+
+                end_time = time.perf_counter()
+                run_duration = end_time - start_time
+                # Update test run
+                api_test_case.update_run_duration(run_duration)
+                test_run_manager.update_test_run(api_test_case, test_case)
+                main_result = create_test_result(api_test_case)
+                trace_results = extract_trace_test_results(trace_api)
+                unique_trace_results = filter_duplicate_results(
+                    main_result, trace_results
+                )
+                test_results.append(main_result)
+                test_results.extend(unique_trace_results)
 
-
+                update_pbar(progress, pbar_id)
 
     try:
         if display_config.show_indicator and _use_bar_indicator:
@@ -1841,39 +2220,137 @@ def a_execute_agentic_test_cases_from_loop(
    }
 
    def on_task_done(t: asyncio.Task):
+        cancelled = False
+        exc = None
+        trace = None
+        root = None
+        resolved_trace_from_task = False
+        resolved_root_from_task = False
+
+        # Task.exception() raises CancelledError if task was cancelled
+        try:
+            exc = t.exception()
+        except asyncio.CancelledError:
+            cancelled = True
+            exc = None
+
+        meta = task_meta.get(t, {})
+        golden_index = meta.get("golden_index")
+
+        if golden_index is not None and 0 <= golden_index < len(
+            goldens
+        ):
+            golden = goldens[golden_index]
+
+            def _mark_trace_error(trace, root, msg: str):
+                now = time.perf_counter()
+                trace.status = TraceSpanStatus.ERRORED
+                # Close the trace so the API layer has a proper endTime
+                if trace.end_time is None:
+                    trace.end_time = now
+                if root:
+                    root.status = TraceSpanStatus.ERRORED
+                    root.error = msg
+                    if root.end_time is None:
+                        root.end_time = now
+
+            if exc is not None:
+                msg = format_error_text(exc)
+                trace, root = _resolve_trace_and_root_for_task(t)
+                resolved_trace_from_task = bool(trace)
+                resolved_root_from_task = bool(root)
+                if trace:
+                    _mark_trace_error(trace, root, msg)
+                else:
+                    for (
+                        trace
+                    ) in trace_manager.integration_traces_to_evaluate:
+                        if (
+                            trace_manager.trace_uuid_to_golden.get(
+                                trace.uuid
+                            )
+                            is golden
+                        ):
+                            root = _pick_root_for_marking(trace)
+                            _mark_trace_error(trace, root, msg)
+                            break
+
+            elif cancelled or t.cancelled():
+                cancel_exc = DeepEvalError(
+                    "Task was cancelled (likely due to timeout)."
+                )
+                msg = format_error_text(cancel_exc)
+                trace, root = _resolve_trace_and_root_for_task(t)
+                resolved_trace_from_task = bool(trace)
+                resolved_root_from_task = bool(root)
+                if trace:
+                    _mark_trace_error(trace, root, msg)
+                else:
+                    for (
+                        trace
+                    ) in trace_manager.integration_traces_to_evaluate:
+                        if (
+                            trace_manager.trace_uuid_to_golden.get(
+                                trace.uuid
+                            )
+                            is golden
+                        ):
+                            root = _pick_root_for_marking(trace)
+                            _mark_trace_error(trace, root, msg)
+                            break
+
        if get_settings().DEEPEVAL_DEBUG_ASYNC:
            # Using info level here to make it easy to spot these logs.
-
-            meta = task_meta.get(t, {})
+            golden_name = meta.get("golden_name")
            duration = time.perf_counter() - meta.get(
                "started", started
            )
 
-            if
+            if cancelled or exc is not None:
+                if not resolved_trace_from_task:
+                    logger.warning(
+                        "[deepeval] on_task_done: no binding for task; falling back to golden->trace. task=%s golden=%r",
+                        t.get_name(),
+                        golden_name,
+                    )
+                elif not resolved_root_from_task:
+                    logger.warning(
+                        "[deepeval] on_task_done: bound trace found but no bound root; using heuristic. task=%s trace=%s",
+                        t.get_name(),
+                        trace.uuid,
+                    )
+
+            if cancelled:
                logger.info(
                    "[deepeval] task CANCELLED %s after %.2fs meta=%r",
                    t.get_name(),
                    duration,
                    meta,
                )
+            elif exc is not None:
+                logger.error(
+                    "[deepeval] task ERROR %s after %.2fs meta=%r",
+                    t.get_name(),
+                    duration,
+                    meta,
+                    exc_info=(
+                        type(exc),
+                        exc,
+                        getattr(exc, "__traceback__", None),
+                    ),
+                )
            else:
-
-
-
-
-
-
-                    meta,
-                    exc_info=(type(exc), exc, exc.__traceback__),
-                )
-            else:
-                logger.info(
-                    "[deepeval] task OK %s after %.2fs meta={'golden_index': %r}",
-                    t.get_name(),
-                    duration,
-                    meta.get("golden_index"),
-                )
+                logger.info(
+                    "[deepeval] task OK %s after %.2fs meta={'golden_index': %r}",
+                    t.get_name(),
+                    duration,
+                    meta.get("golden_index"),
+                )
 
+        try:
+            trace_manager.task_bindings.pop(t, None)
+        except Exception:
+            pass
        update_pbar(progress, pbar_callback_id)
        update_pbar(progress, pbar_id)
 
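Illustrative note (not part of the diff): the reworked on_task_done callback above has to distinguish cancelled, errored, and successful tasks, and asyncio's Task.exception() raises CancelledError for cancelled tasks rather than returning it. The minimal sketch below shows that pattern in isolation; the task names and print statements are placeholders, not deepeval code.

import asyncio


def on_done(t: asyncio.Task) -> None:
    cancelled = False
    exc = None
    try:
        # Raises CancelledError when the task was cancelled
        exc = t.exception()
    except asyncio.CancelledError:
        cancelled = True
    if cancelled or t.cancelled():
        print(f"{t.get_name()}: cancelled")
    elif exc is not None:
        print(f"{t.get_name()}: failed with {exc!r}")
    else:
        print(f"{t.get_name()}: ok, result={t.result()!r}")


async def main() -> None:
    ok = asyncio.create_task(asyncio.sleep(0.01, result=1), name="ok")
    slow = asyncio.create_task(asyncio.sleep(10), name="slow")
    for t in (ok, slow):
        t.add_done_callback(on_done)
    slow.cancel()
    await asyncio.gather(ok, slow, return_exceptions=True)


asyncio.run(main())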
@@ -1918,6 +2395,7 @@ def a_execute_agentic_test_cases_from_loop(
                     timeout=_gather_timeout(),
                 )
             )
+
         except asyncio.TimeoutError:
             import traceback
 
@@ -1987,10 +2465,11 @@ def a_execute_agentic_test_cases_from_loop(
         ]
 
         if get_settings().DEEPEVAL_DEBUG_ASYNC:
-
-
-
-
+            if len(leftovers) > 0:
+                logger.warning(
+                    "[deepeval] %d stray task(s) not tracked; cancelling...",
+                    len(leftovers),
+                )
             for t in leftovers:
                 meta = task_meta.get(t, {})
                 name = t.get_name()
@@ -2130,7 +2609,10 @@ async def _a_evaluate_traces(
 
     async def execute_evals_with_semaphore(func: Callable, *args, **kwargs):
         async with semaphore:
-            return await
+            return await asyncio.wait_for(
+                func(*args, **kwargs),
+                timeout=_per_task_timeout(),
+            )
 
     eval_tasks = []
     # Here, we will work off a fixed-set copy to avoid surprises from potential
@@ -2173,7 +2655,18 @@ async def _a_evaluate_traces(
             )
             eval_tasks.append(asyncio.create_task(task))
             await asyncio.sleep(throttle_value)
-
+
+    try:
+        await asyncio.wait_for(
+            asyncio.gather(*eval_tasks),
+            timeout=_gather_timeout(),
+        )
+    except asyncio.TimeoutError:
+        for t in eval_tasks:
+            if not t.done():
+                t.cancel()
+        await asyncio.gather(*eval_tasks, return_exceptions=True)
+        raise
 
 
 async def _evaluate_test_case_pairs(
@@ -2196,7 +2689,10 @@ async def _evaluate_test_case_pairs(
 
     async def execute_with_semaphore(func: Callable, *args, **kwargs):
         async with semaphore:
-            return await
+            return await asyncio.wait_for(
+                func(*args, **kwargs),
+                timeout=_per_task_timeout(),
+            )
 
     tasks = []
     for count, test_case_pair in enumerate(test_case_pairs):
@@ -2229,7 +2725,19 @@ async def _evaluate_test_case_pairs(
         )
         tasks.append(asyncio.create_task(task))
         await asyncio.sleep(throttle_value)
-
+
+    try:
+        await asyncio.wait_for(
+            asyncio.gather(*tasks),
+            timeout=_gather_timeout(),
+        )
+    except asyncio.TimeoutError:
+        # Cancel any still-pending tasks and drain them
+        for t in tasks:
+            if not t.done():
+                t.cancel()
+        await asyncio.gather(*tasks, return_exceptions=True)
+        raise
 
 
 def _execute_metric(
@@ -2251,7 +2759,7 @@ def _execute_metric(
                     return "skip"
                 else:
                     if error_config.ignore_errors:
-                        metric.error =
+                        metric.error = format_error_text(e)
                         metric.success = False
                     else:
                         raise
@@ -2263,19 +2771,19 @@ def _execute_metric(
                     return "skip"
                 else:
                     if error_config.ignore_errors:
-                        metric.error =
+                        metric.error = format_error_text(e)
                         metric.success = False
                     else:
                         raise
             except Exception as e:
                 if error_config.ignore_errors:
-                    metric.error =
+                    metric.error = format_error_text(e)
                     metric.success = False
                 else:
                     raise
     except Exception as e:
         if error_config.ignore_errors:
-            metric.error =
+            metric.error = format_error_text(e)
            metric.success = False
        else:
            raise