deepeval 3.6.7__py3-none-any.whl → 3.6.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +104 -36
- deepeval/config/utils.py +5 -0
- deepeval/dataset/dataset.py +162 -30
- deepeval/dataset/utils.py +41 -13
- deepeval/errors.py +20 -2
- deepeval/evaluate/execute.py +1662 -688
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +13 -3
- deepeval/integrations/crewai/__init__.py +2 -1
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +8 -8
- deepeval/metrics/indicator.py +21 -1
- deepeval/metrics/mcp/mcp_task_completion.py +7 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/task_completion/task_completion.py +1 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/amazon_bedrock_model.py +20 -17
- deepeval/models/llms/openai_model.py +10 -1
- deepeval/models/retry_policy.py +103 -20
- deepeval/openai/extractors.py +61 -16
- deepeval/openai/patch.py +8 -12
- deepeval/openai/types.py +1 -1
- deepeval/openai/utils.py +108 -1
- deepeval/prompt/prompt.py +1 -0
- deepeval/prompt/utils.py +43 -14
- deepeval/simulator/conversation_simulator.py +25 -18
- deepeval/synthesizer/chunking/context_generator.py +9 -1
- deepeval/synthesizer/synthesizer.py +11 -10
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/test_run.py +190 -207
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +23 -4
- deepeval/tracing/trace_context.py +53 -38
- deepeval/tracing/tracing.py +23 -0
- deepeval/tracing/types.py +16 -14
- deepeval/utils.py +21 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/METADATA +1 -1
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/RECORD +85 -63
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- deepeval/tracing/message_types/__init__.py +0 -10
- deepeval/tracing/message_types/base.py +0 -6
- deepeval/tracing/message_types/messages.py +0 -14
- deepeval/tracing/message_types/tools.py +0 -18
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/WHEEL +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/entry_points.txt +0 -0
deepeval/evaluate/execute.py
CHANGED
@@ -1,3 +1,4 @@
+import inspect
 import logging
 
 from rich.progress import (
@@ -43,9 +44,14 @@ from deepeval.tracing.api import (
 )
 from deepeval.dataset import Golden
 from deepeval.contextvars import set_current_golden, reset_current_golden
-from deepeval.errors import MissingTestCaseParamsError
+from deepeval.errors import MissingTestCaseParamsError, DeepEvalError
 from deepeval.metrics.utils import copy_metrics
-from deepeval.utils import
+from deepeval.utils import (
+    get_or_create_event_loop,
+    shorten,
+    len_medium,
+    format_error_text,
+)
 from deepeval.telemetry import capture_evaluation_run
 from deepeval.metrics import (
     BaseMetric,
@@ -56,6 +62,11 @@ from deepeval.metrics import (
 from deepeval.metrics.indicator import (
     measure_metrics_with_indicator,
 )
+from deepeval.models.retry_policy import (
+    set_outer_deadline,
+    reset_outer_deadline,
+    run_sync_with_timeout,
+)
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
@@ -82,10 +93,13 @@ from deepeval.evaluate.utils import (
     create_metric_data,
     create_test_result,
     count_metrics_in_trace,
+    count_total_metrics_for_trace,
+    count_metrics_in_span_subtree,
     extract_trace_test_results,
 )
 from deepeval.utils import add_pbar, update_pbar, custom_console
-from deepeval.tracing.types import TestCaseMetricPair
+from deepeval.tracing.types import TestCaseMetricPair, TraceSpanStatus
+from deepeval.tracing.api import TraceSpanApiStatus
 from deepeval.config.settings import get_settings
 from deepeval.test_run import TEMP_FILE_PATH
 from deepeval.confident.api import is_confident
@@ -97,6 +111,108 @@ from deepeval.test_run.hyperparameters import (
 logger = logging.getLogger(__name__)
 
 
+def _skip_metrics_for_error(
+    span: Optional[BaseSpan] = None,
+    trace: Optional[Trace] = None,
+) -> bool:
+    # trace failure: skip everything under this trace
+    if trace is not None and trace.status == TraceSpanStatus.ERRORED:
+        return True
+    # span failure: skip this span’s metrics
+    if span is not None and span.status == TraceSpanStatus.ERRORED:
+        return True
+    return False
+
+
+def _trace_error(current_trace: Trace) -> Optional[str]:
+    def _first_err(s: BaseSpan) -> Optional[str]:
+        if s.status == TraceSpanStatus.ERRORED and s.error:
+            return s.error
+        for c in s.children or []:
+            e = _first_err(c)
+            if e:
+                return e
+        return None
+
+    for root in current_trace.root_spans or []:
+        e = _first_err(root)
+        if e:
+            return e
+    return None
+
+
+def _get_trace_by_uuid_anywhere(trace_uuid: str):
+    """
+    Resolver for a trace UUID across the manager's state.
+
+    First tries the manager's indexed lookup, which (covers active/in-flight traces,
+    then does a linear scan of the full `trace_manager.traces` list, which covers
+    traces that were recorded/closed earlier or not yet indexed. Returns
+    the concrete Trace object or None if not found.
+    """
+    tr = trace_manager.get_trace_by_uuid(trace_uuid)
+    if tr:
+        return tr
+    for tr in trace_manager.traces:
+        if tr.uuid == trace_uuid:
+            return tr
+    return None
+
+
+def _pick_root_for_marking(trace):
+    """
+    Choose the most appropriate root span to annotate on error/cancel.
+
+    Heuristic:
+    - Prefer the most recent open root, which will have no `end_time` since this is the
+      span currently in flight.
+    - If none are open, use the last root span if it exists.
+    - If the trace has no roots, return None.
+
+    This favors marking the active root in multi root traces while remaining
+    stable for already closed traces.
+    """
+    open_roots = [rs for rs in trace.root_spans if rs.end_time is None]
+    return (
+        open_roots[-1]
+        if open_roots
+        else (trace.root_spans[-1] if trace.root_spans else None)
+    )
+
+
+def _resolve_trace_and_root_for_task(t: asyncio.Task):
+    """
+    Resolve trace and root for a completed task using the weak binding map.
+
+    Steps:
+    1. Look up the task in `trace_manager.task_bindings` to get the
+       bound `trace_uuid` and, if available, `root_span_uuid`.
+    2. Resolve the Trace with `_get_trace_by_uuid_anywhere`.
+    3. If a bound root UUID exists, try to find that exact root on the trace.
+    4. Otherwise, fall back to `_pick_root_for_marking(trace)`.
+
+    Returns a trace / root tuple. Either may be `None` when no binding is
+    present. This function is used by `on_task_done` to robustly mark error/cancel
+    states without assuming a single root trace or a root that is still open.
+    """
+    binding = trace_manager.task_bindings.get(t) or {}
+    trace_uuid = binding.get("trace_uuid")
+    root_span_uuid = binding.get("root_span_uuid")
+
+    trace = _get_trace_by_uuid_anywhere(trace_uuid) if trace_uuid else None
+    root = None
+
+    if trace and root_span_uuid:
+        root = next(
+            (rs for rs in trace.root_spans if rs.uuid == root_span_uuid), None
+        )
+
+    if trace and root is None:
+        root = _pick_root_for_marking(trace)
+
+    return trace, root
+
+
 async def _snapshot_tasks():
     cur = asyncio.current_task()
     # `all_tasks` returns tasks for the current running loop only
@@ -115,6 +231,32 @@ def _gather_timeout() -> float:
     )
 
 
+def filter_duplicate_results(
+    main_result: TestResult, results: List[TestResult]
+) -> List[TestResult]:
+    return [
+        result
+        for result in results
+        if not (
+            (result.input == main_result.input)
+            and (result.actual_output == main_result.actual_output)
+            and (result.metrics_data == main_result.metrics_data)
+        )
+    ]
+
+
+async def _await_with_outer_deadline(obj, *args, timeout: float, **kwargs):
+    token = set_outer_deadline(timeout)
+    try:
+        if inspect.isawaitable(obj):
+            coro = obj
+        else:
+            coro = obj(*args, **kwargs)
+        return await asyncio.wait_for(coro, timeout=timeout)
+    finally:
+        reset_outer_deadline(token)
+
+
 ###########################################
 ### E2E Evals #############################
 ###########################################
@@ -146,6 +288,13 @@ def execute_test_cases(
 
     test_run_manager.save_to_disk = cache_config.write_cache
     test_run = test_run_manager.get_test_run(identifier=identifier)
+    if test_run is None:
+        # ensure we have a test_run ( in case it couldn't be loaded from disk )
+        test_run_manager.create_test_run(identifier=identifier)
+        test_run = test_run_manager.get_test_run(identifier=identifier)
+
+    # capture once for inner closures
+    hyperparameters = test_run.hyperparameters if test_run is not None else None
 
     if display_config.verbose_mode is not None:
         for metric in metrics:
@@ -166,176 +315,228 @@ def execute_test_cases(
     test_results: List[TestResult] = []
 
     def evaluate_test_cases(
-        progress: Optional[Progress] = None, pbar_id: Optional[
+        progress: Optional[Progress] = None, pbar_id: Optional[int] = None
     ):
         llm_test_case_count = -1
+        mllm_test_case_count = -1
         conversational_test_case_count = -1
         show_metric_indicator = (
             display_config.show_indicator and not _use_bar_indicator
         )
         for i, test_case in enumerate(test_cases):
+            # skip what we know we won't run
+            if isinstance(test_case, LLMTestCase):
+                if not llm_metrics:
+                    update_pbar(progress, pbar_id)
+                    continue
+                per_case_total = len(llm_metrics)
+            elif isinstance(test_case, MLLMTestCase):
+                if not mllm_metrics:
+                    update_pbar(progress, pbar_id)
+                    continue
+                per_case_total = len(mllm_metrics)
+            elif isinstance(test_case, ConversationalTestCase):
+                if not conversational_metrics:
+                    update_pbar(progress, pbar_id)
+                    continue
+                per_case_total = len(conversational_metrics)
+
             pbar_test_case_id = add_pbar(
                 progress,
                 f"    🎯 Evaluating test case #{i}",
-                total=
+                total=per_case_total,
             )
-            with capture_evaluation_run("test case"):
-                for metric in metrics:
-                    metric.error = None  # Reset metric error
-
-                if isinstance(test_case, LLMTestCase):
-                    if len(llm_metrics) == 0:
-                        continue
-
-                    llm_test_case_count += 1
-                    cached_test_case = None
-                    if cache_config.use_cache:
-                        cached_test_case = (
-                            global_test_run_cache_manager.get_cached_test_case(
-                                test_case, test_run.hyperparameters
-                            )
-                        )
-
-                    ##### Metric Calculation #####
-                    api_test_case: LLMApiTestCase = create_api_test_case(
-                        test_case=test_case, index=llm_test_case_count
-                    )
-                    new_cached_test_case: CachedTestCase = CachedTestCase()
-
-                    test_start_time = time.perf_counter()
-                    read_all_metrics_from_cache = True
-                    for metric in llm_metrics:
-                        metric_data = None
-                        if cached_test_case is not None:
-                            cached_metric_data = Cache.get_metric_data(
-                                metric, cached_test_case
-                            )
-                            if cached_metric_data:
-                                metric_data = cached_metric_data.metric_data
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                                updated_cached_metric_data = CachedMetricData(
-                                    metric_data=cache_metric_data,
-                                    metric_configuration=Cache.create_metric_configuration(
-                                        metric
-                                    ),
-                                )
-                            new_cached_test_case.cached_metrics_data.append(
-                                updated_cached_metric_data
-                            )
-                        update_pbar(progress, pbar_test_case_id)
-
-                    test_end_time = time.perf_counter()
-                    if read_all_metrics_from_cache:
-                        run_duration = 0
-                    else:
-                        run_duration = test_end_time - test_start_time
-                    api_test_case.update_run_duration(run_duration)
-
-                    ### Update Test Run ###
-                    test_run_manager.update_test_run(api_test_case, test_case)
-
-                    ### Cache Test Run ###
-                    global_test_run_cache_manager.cache_test_case(
-                        test_case,
-                        new_cached_test_case,
-                        test_run.hyperparameters,
-                    )
-                    global_test_run_cache_manager.cache_test_case(
-                        test_case,
-                        new_cached_test_case,
-                        test_run.hyperparameters,
-                        to_temp=True,
+            metrics_for_case = (
+                llm_metrics
+                if isinstance(test_case, LLMTestCase)
+                else (
+                    mllm_metrics
+                    if isinstance(test_case, MLLMTestCase)
+                    else conversational_metrics
+                )
+            )
+            api_test_case = create_api_test_case(
+                test_case=test_case,
+                index=(
+                    llm_test_case_count + 1
+                    if isinstance(test_case, LLMTestCase)
+                    else (
+                        mllm_test_case_count + 1
+                        if isinstance(test_case, MLLMTestCase)
+                        else conversational_test_case_count + 1
                     )
+                ),
+            )
+            emitted = [False] * len(metrics_for_case)
+            index_of = {id(m): i for i, m in enumerate(metrics_for_case)}
+            current_index = -1
+            start_time = time.perf_counter()
+            deadline_timeout = _per_task_timeout()
+            deadline_token = set_outer_deadline(deadline_timeout)
+            new_cached_test_case: CachedTestCase = None
+            try:
 
-
-
-
+                def _run_case():
+                    nonlocal new_cached_test_case, current_index, llm_test_case_count, mllm_test_case_count, conversational_test_case_count
+                    with capture_evaluation_run("test case"):
+                        for metric in metrics:
+                            metric.error = None  # Reset metric error
+
+                        if isinstance(test_case, LLMTestCase):
+                            llm_test_case_count += 1
+                            cached_test_case = None
+                            if cache_config.use_cache:
+                                cached_test_case = global_test_run_cache_manager.get_cached_test_case(
+                                    test_case, hyperparameters
+                                )
+
+                            ##### Metric Calculation #####
+                            new_cached_test_case = CachedTestCase()
+
+                            for metric in llm_metrics:
+                                current_index = index_of[id(metric)]
+                                metric_data = None
+                                if cached_test_case is not None:
+                                    cached_metric_data = Cache.get_metric_data(
+                                        metric, cached_test_case
+                                    )
+                                    if cached_metric_data:
+                                        metric_data = (
+                                            cached_metric_data.metric_data
+                                        )
+
+                                if metric_data is None:
+                                    res = _execute_metric(
+                                        metric=metric,
+                                        test_case=test_case,
+                                        show_metric_indicator=show_metric_indicator,
+                                        in_component=False,
+                                        error_config=error_config,
+                                    )
+                                    if res == "skip":
+                                        continue
+                                    metric_data = create_metric_data(metric)
+
+                                # here, we will check for an additional property on the flattened test cases to see if updating is necessary
+                                api_test_case.update_metric_data(metric_data)
+                                emitted[current_index] = True
+                                if metric.error is None:
+                                    cache_metric_data = deepcopy(metric_data)
+                                    cache_metric_data.evaluation_cost = 0  # Cached metrics will have evaluation cost as 0, not None.
+                                    updated_cached_metric_data = CachedMetricData(
+                                        metric_data=cache_metric_data,
+                                        metric_configuration=Cache.create_metric_configuration(
+                                            metric
+                                        ),
+                                    )
+                                    new_cached_test_case.cached_metrics_data.append(
+                                        updated_cached_metric_data
+                                    )
+                                update_pbar(progress, pbar_test_case_id)
+
+                        # No caching and not sending test cases to Confident AI for multimodal metrics yet
+                        elif isinstance(test_case, MLLMTestCase):
+                            mllm_test_case_count += 1
+                            for metric in mllm_metrics:
+                                current_index = index_of[id(metric)]
+                                res = _execute_metric(
+                                    metric=metric,
+                                    test_case=test_case,
+                                    show_metric_indicator=show_metric_indicator,
+                                    in_component=False,
+                                    error_config=error_config,
+                                )
+                                if res == "skip":
+                                    continue
+
+                                metric_data = create_metric_data(metric)
+                                api_test_case.update_metric_data(metric_data)
+                                emitted[current_index] = True
+                                update_pbar(progress, pbar_test_case_id)
+
+                        # No caching for conversational metrics yet
+                        elif isinstance(test_case, ConversationalTestCase):
+                            conversational_test_case_count += 1
+                            for metric in conversational_metrics:
+                                current_index = index_of[id(metric)]
+                                res = _execute_metric(
+                                    metric=metric,
+                                    test_case=test_case,
+                                    show_metric_indicator=show_metric_indicator,
+                                    in_component=False,
+                                    error_config=error_config,
+                                )
+                                if res == "skip":
+                                    continue
+
+                                metric_data = create_metric_data(metric)
+                                api_test_case.update_metric_data(metric_data)
+                                emitted[current_index] = True
+                                update_pbar(progress, pbar_test_case_id)
+
+                run_sync_with_timeout(_run_case, deadline_timeout)
+            except (asyncio.TimeoutError, TimeoutError):
+                msg = (
+                    f"Timed out after {deadline_timeout:.2f}s while evaluating metric. "
+                    "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+                    "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+                )
+                for i, m in enumerate(metrics_for_case):
+                    if getattr(m, "skipped", False):
                         continue
-
-
-
-                    )
-                    test_start_time = time.perf_counter()
-                    for metric in mllm_metrics:
-                        res = _execute_metric(
-                            metric=metric,
-                            test_case=test_case,
-                            show_metric_indicator=show_metric_indicator,
-                            in_component=False,
-                            error_config=error_config,
-                        )
-                        if res == "skip":
-                            continue
-
-                        metric_data = create_metric_data(metric)
-                        api_test_case.update_metric_data(metric_data)
-                        update_pbar(progress, pbar_test_case_id)
-
-                    test_end_time = time.perf_counter()
-                    if len(mllm_metrics) > 0:
-                        run_duration = test_end_time - test_start_time
-                        api_test_case.update_run_duration(run_duration)
-
-                    ### Update Test Run ###
-                    test_run_manager.update_test_run(api_test_case, test_case)
-
-                # No caching for conversational metrics yet
-                elif isinstance(test_case, ConversationalTestCase):
-                    if len(metrics) == 0:
+                    # already finished or errored? leave it
+                    if getattr(m, "success", None) is not None or getattr(
+                        m, "error", None
+                    ):
                         continue
+                    if i == current_index:
+                        m.success = False
+                        m.error = msg
+                    elif i > current_index:
+                        m.success = False
+                        m.error = "Skipped due to case timeout."
+
+                if not error_config.ignore_errors:
+                    raise
 
-
-
-
-
-
+            finally:
+                try:
+                    if (
+                        isinstance(test_case, LLMTestCase)
+                        and new_cached_test_case is not None
+                    ):
+                        ### Cache Test Run ###
+                        global_test_run_cache_manager.cache_test_case(
+                            test_case,
+                            new_cached_test_case,
+                            hyperparameters,
                        )
-
-
-
-
-
-                            metric=metric,
-                            test_case=test_case,
-                            show_metric_indicator=show_metric_indicator,
-                            in_component=False,
-                            error_config=error_config,
+                        global_test_run_cache_manager.cache_test_case(
+                            test_case,
+                            new_cached_test_case,
+                            hyperparameters,
+                            to_temp=True,
                        )
-                        if res == "skip":
-                            continue
-
-                        metric_data = create_metric_data(metric)
-                        api_test_case.update_metric_data(metric_data)
-                        update_pbar(progress, pbar_test_case_id)
 
-
-
-
+                    # Attach MetricData for *all* metrics (finished or synthesized)
+                    for i, m in enumerate(metrics_for_case):
+                        if getattr(m, "skipped", False):
+                            continue
+                        if not emitted[i]:
+                            api_test_case.update_metric_data(
+                                create_metric_data(m)
+                            )
 
-
+                    elapsed = time.perf_counter() - start_time
+                    api_test_case.update_run_duration(
+                        elapsed if elapsed >= 0 else deadline_timeout
+                    )
                    test_run_manager.update_test_run(api_test_case, test_case)
-
-
-
-
+                    test_results.append(create_test_result(api_test_case))
+                    update_pbar(progress, pbar_id)
+                finally:
+                    reset_outer_deadline(deadline_token)
 
     if display_config.show_indicator and _use_bar_indicator:
         progress = Progress(
@@ -380,7 +581,10 @@ async def a_execute_test_cases(
 
     async def execute_with_semaphore(func: Callable, *args, **kwargs):
         async with semaphore:
-
+            timeout = _per_task_timeout()
+            return await _await_with_outer_deadline(
+                func, *args, timeout=timeout, **kwargs
+            )
 
     global_test_run_cache_manager.disable_write_cache = (
         cache_config.write_cache is False
@@ -483,7 +687,7 @@ async def a_execute_test_cases(
 
                         task = execute_with_semaphore(
                             func=_a_execute_conversational_test_cases,
-                            metrics=copy_metrics(
+                            metrics=copy_metrics(conversational_metrics),
                            test_case=test_case,
                            test_run_manager=test_run_manager,
                            test_results=test_results,
@@ -499,7 +703,22 @@ async def a_execute_test_cases(
                        tasks.append(asyncio.create_task(task))
 
                await asyncio.sleep(async_config.throttle_value)
-
+
+            try:
+                await asyncio.wait_for(
+                    asyncio.gather(*tasks),
+                    timeout=_gather_timeout(),
+                )
+            except (asyncio.TimeoutError, TimeoutError):
+                for t in tasks:
+                    if not t.done():
+                        t.cancel()
+                await asyncio.gather(*tasks, return_exceptions=True)
+                logging.getLogger("deepeval").error(
+                    "Gather timed out after %.1fs. Some metrics may be marked as timed out.",
+                    _gather_timeout(),
+                )
+
    else:
        for test_case in test_cases:
            with capture_evaluation_run("test case"):
@@ -572,7 +791,19 @@ async def a_execute_test_cases(
                    tasks.append(asyncio.create_task(task))
 
            await asyncio.sleep(async_config.throttle_value)
-
+
+        try:
+            await asyncio.wait_for(
+                asyncio.gather(*tasks),
+                timeout=_gather_timeout(),
+            )
+        except (asyncio.TimeoutError, TimeoutError):
+            # Cancel any still-pending tasks and drain them
+            for t in tasks:
+                if not t.done():
+                    t.cancel()
+            await asyncio.gather(*tasks, return_exceptions=True)
+            raise
 
    return test_results
 
@@ -593,6 +824,7 @@ async def _a_execute_llm_test_cases(
    progress: Optional[Progress] = None,
    pbar_id: Optional[int] = None,
 ):
+    logger.info("in _a_execute_llm_test_cases")
    pbar_test_case_id = add_pbar(
        progress,
        f"    🎯 Evaluating test case #{count}",
@@ -616,64 +848,85 @@ async def _a_execute_llm_test_cases(
    api_test_case = create_api_test_case(
        test_case=test_case, index=count if not _is_assert_test else None
    )
-
-
-
-        metrics=metrics,
-        test_case=test_case,
-        cached_test_case=cached_test_case,
-        skip_on_missing_params=skip_on_missing_params,
-        ignore_errors=ignore_errors,
-        show_indicator=show_metrics_indicator,
-        pbar_eval_id=pbar_test_case_id,
-        progress=progress,
-    )
+    try:
+        new_cached_test_case: CachedTestCase = CachedTestCase()
+        test_start_time = time.perf_counter()
 
-
-
-
+        await measure_metrics_with_indicator(
+            metrics=metrics,
+            test_case=test_case,
+            cached_test_case=cached_test_case,
+            skip_on_missing_params=skip_on_missing_params,
+            ignore_errors=ignore_errors,
+            show_indicator=show_metrics_indicator,
+            pbar_eval_id=pbar_test_case_id,
+            progress=progress,
+        )
+    except asyncio.CancelledError:
+        msg = (
+            "Timed out/cancelled while evaluating metric. "
+            "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+            "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+        )
+        for m in metrics:
+            if getattr(m, "skipped", False):
+                continue
+            # If the task never finished and didn't set a terminal state, mark it now
+            if getattr(m, "success", None) is None and not getattr(
+                m, "error", None
+            ):
+                m.success = False
+                m.error = msg
+        if not ignore_errors:
+            raise
+    finally:
+        for metric in metrics:
+            if metric.skipped:
+                continue
 
-
-
+            metric_data = create_metric_data(metric)
+            api_test_case.update_metric_data(metric_data)
 
-
-
-
-
-
-
-
-
-
-
-
-
+            if metric.error is None:
+                cache_metric_data = deepcopy(metric_data)
+                cache_metric_data.evaluation_cost = (
+                    0  # Create new copy and save 0 for cost
+                )
+                updated_cached_metric_data = CachedMetricData(
+                    metric_data=cache_metric_data,
+                    metric_configuration=Cache.create_metric_configuration(
+                        metric
+                    ),
+                )
+                new_cached_test_case.cached_metrics_data.append(
+                    updated_cached_metric_data
+                )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        test_end_time = time.perf_counter()
+        run_duration = test_end_time - test_start_time
+        # Quick hack to check if all metrics were from cache
+        if run_duration < 1:
+            run_duration = 0
+        api_test_case.update_run_duration(run_duration)
+
+        ### Update Test Run ###
+        test_run_manager.update_test_run(api_test_case, test_case)
+
+        ### Cache Test Run ###
+        global_test_run_cache_manager.cache_test_case(
+            test_case,
+            new_cached_test_case,
+            test_run.hyperparameters,
+        )
+        global_test_run_cache_manager.cache_test_case(
+            test_case,
+            new_cached_test_case,
+            test_run.hyperparameters,
+            to_temp=True,
+        )
 
-
-
+        test_results.append(create_test_result(api_test_case))
+        update_pbar(progress, pbar_id)
 
 
 async def _a_execute_mllm_test_cases(
@@ -705,31 +958,50 @@ async def _a_execute_mllm_test_cases(
        test_case=test_case, index=count if not _is_assert_test else None
    )
    test_start_time = time.perf_counter()
-
-
-
-
-
-
-
-
-
-
-
-
-
+    try:
+        await measure_metrics_with_indicator(
+            metrics=metrics,
+            test_case=test_case,
+            cached_test_case=None,
+            skip_on_missing_params=skip_on_missing_params,
+            ignore_errors=ignore_errors,
+            show_indicator=show_metrics_indicator,
+            pbar_eval_id=pbar_test_case_id,
+            progress=progress,
+        )
+    except asyncio.CancelledError:
+        msg = (
+            "Timed out/cancelled while evaluating metric. "
+            "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+            "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+        )
+        for m in metrics:
+            if getattr(m, "skipped", False):
+                continue
+            # If the task never finished and didn't set a terminal state, mark it now
+            if getattr(m, "success", None) is None and not getattr(
+                m, "error", None
+            ):
+                m.success = False
+                m.error = msg
+        if not ignore_errors:
+            raise
+    finally:
+        for metric in metrics:
+            if metric.skipped:
+                continue
 
-
-
+            metric_data = create_metric_data(metric)
+            api_test_case.update_metric_data(metric_data)
 
-
-
-
+        test_end_time = time.perf_counter()
+        run_duration = test_end_time - test_start_time
+        api_test_case.update_run_duration(run_duration)
 
-
-
-
-
+        ### Update Test Run ###
+        test_run_manager.update_test_run(api_test_case, test_case)
+        test_results.append(create_test_result(api_test_case))
+        update_pbar(progress, pbar_id)
 
 
 async def _a_execute_conversational_test_cases(
@@ -764,33 +1036,55 @@ async def _a_execute_conversational_test_cases(
    )
 
    test_start_time = time.perf_counter()
-    await measure_metrics_with_indicator(
-        metrics=metrics,
-        test_case=test_case,
-        cached_test_case=None,
-        skip_on_missing_params=skip_on_missing_params,
-        ignore_errors=ignore_errors,
-        show_indicator=show_metrics_indicator,
-        pbar_eval_id=pbar_test_case_id,
-        progress=progress,
-    )
-    for metric in metrics:
-        if metric.skipped:
-            continue
 
-
-
+    try:
+        await measure_metrics_with_indicator(
+            metrics=metrics,
+            test_case=test_case,
+            cached_test_case=None,
+            skip_on_missing_params=skip_on_missing_params,
+            ignore_errors=ignore_errors,
+            show_indicator=show_metrics_indicator,
+            pbar_eval_id=pbar_test_case_id,
+            progress=progress,
+        )
 
-
-
-
-
+    except asyncio.CancelledError:
+        msg = (
+            "Timed out/cancelled while evaluating metric. "
+            "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+            "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+        )
+        for m in metrics:
+            if getattr(m, "skipped", False):
+                continue
+            # If the task never finished and didn't set a terminal state, mark it now
+            if getattr(m, "success", None) is None and not getattr(
+                m, "error", None
+            ):
+                m.success = False
+                m.error = msg
+        if not ignore_errors:
+            raise
+
+    finally:
+        for metric in metrics:
+            if metric.skipped:
+                continue
+
+            metric_data = create_metric_data(metric)
+            api_test_case.update_metric_data(metric_data)
+
+        test_end_time = time.perf_counter()
+        if len(metrics) > 0:
+            run_duration = test_end_time - test_start_time
+            api_test_case.update_run_duration(run_duration)
 
-
-
+        ### Update Test Run ###
+        test_run_manager.update_test_run(api_test_case, test_case)
 
-
-
+        test_results.append(create_test_result(api_test_case))
+        update_pbar(progress, pbar_id)
 
 
 ###########################################
@@ -814,7 +1108,11 @@ def execute_agentic_test_cases(
    test_run_manager = global_test_run_manager
 
    test_run_manager.save_to_disk = cache_config.write_cache
-    test_run_manager.get_test_run(identifier=identifier)
+    test_run = test_run_manager.get_test_run(identifier=identifier)
+    if test_run is None:
+        # Create if not found
+        test_run_manager.create_test_run(identifier=identifier)
+        test_run = test_run_manager.get_test_run(identifier=identifier)
 
    local_trace_manager = trace_manager
    local_trace_manager.evaluating = True
@@ -824,244 +1122,526 @@ def execute_agentic_test_cases(
        progress: Optional[Progress] = None,
        pbar_id: Optional[int] = None,
    ):
-        count =
+        count = -1
        show_metric_indicator = (
            display_config.show_indicator and not _use_bar_indicator
        )
 
        for golden in goldens:
-
-            count += 1
-            total_tags = count_observe_decorators_in_module(
-                observed_callback
-            )
-            pbar_tags_id = add_pbar(
-                progress,
-                f"    ⚡ Invoking observed callback (#{count})",
-                total=total_tags,
-            )
-
-            with Observer(
-                "custom",
-                func_name="Test Wrapper",
-                _progress=progress,
-                _pbar_callback_id=pbar_tags_id,
-            ):
-                if asyncio.iscoroutinefunction(observed_callback):
-                    loop = get_or_create_event_loop()
-                    coro = observed_callback(golden.input)
-                    loop.run_until_complete(
-                        asyncio.wait_for(
-                            coro,
-                            timeout=_per_task_timeout(),
-                        )
-                    )
-                else:
-                    observed_callback(golden.input)
-            current_trace: Trace = current_trace_context.get()
+            count += 1
 
-
-
+            pbar_case_increments = (
+                0  # tracks how many times we advance `pbar_id` for this golden
+            )
+            emitted_trace = set()
+            current_trace: Optional[Trace] = None
+            trace_api = None
+            api_test_case = None
+            test_case = None
+
+            def _run_golden():
+                nonlocal current_trace, trace_api, api_test_case, test_case, pbar_case_increments
+                # keep the evaluation context inside the timed function
+                with capture_evaluation_run("golden"):
+                    total_tags = count_observe_decorators_in_module(
+                        observed_callback
+                    )
+                    pbar_tags_id = add_pbar(
+                        progress,
+                        f"    ⚡ Invoking observed callback (#{count})",
+                        total=total_tags,
+                    )
 
-
-
+                    with Observer(
+                        "custom",
+                        func_name="Test Wrapper",
+                        _progress=progress,
+                        _pbar_callback_id=pbar_tags_id,
+                    ):
+                        if asyncio.iscoroutinefunction(observed_callback):
+                            loop = get_or_create_event_loop()
+                            coro = observed_callback(golden.input)
+                            loop.run_until_complete(
+                                _await_with_outer_deadline(
+                                    coro,
+                                    timeout=_per_task_timeout(),
+                                )
+                            )
+                        else:
+                            observed_callback(golden.input)
 
-
-
-                input=golden.input,
-                actual_output=(
-                    str(current_trace.output)
-                    if current_trace.output is not None
-                    else None
-                ),
-                expected_output=current_trace.expected_output,
-                context=current_trace.context,
-                retrieval_context=current_trace.retrieval_context,
-                additional_metadata=golden.additional_metadata,
-                tools_called=current_trace.tools_called,
-                expected_tools=current_trace.expected_tools,
-                comments=golden.comments,
-                name=golden.name,
-                _dataset_alias=golden._dataset_alias,
-                _dataset_id=golden._dataset_id,
-            )
-            api_test_case = create_api_test_case(
-                test_case=test_case,
-                trace=trace_api,
-                index=count if not _is_assert_test else None,
-            )
+                    # we have a trace now
+                    current_trace = current_trace_context.get()
 
-
-
-
-
-
-
-
-
-
-
+                    update_pbar(progress, pbar_tags_id, advance=total_tags)
+                    update_pbar(progress, pbar_id)
+                    pbar_case_increments += 1
+
+                    # Create empty trace api for llm api test case
+                    trace_api = create_api_trace(current_trace, golden)
+
+                    # Build the test case and api test case
+                    test_case = LLMTestCase(
+                        input=golden.input,
+                        actual_output=(
+                            str(current_trace.output)
+                            if current_trace
+                            and current_trace.output is not None
+                            else None
+                        ),
+                        expected_output=(
+                            current_trace.expected_output
+                            if current_trace
+                            else None
+                        ),
+                        context=(
+                            current_trace.context if current_trace else None
+                        ),
+                        retrieval_context=(
+                            current_trace.retrieval_context
+                            if current_trace
+                            else None
+                        ),
+                        additional_metadata=golden.additional_metadata,
+                        tools_called=(
+                            current_trace.tools_called
+                            if current_trace
+                            else None
+                        ),
+                        expected_tools=(
+                            current_trace.expected_tools
+                            if current_trace
+                            else None
+                        ),
+                        comments=golden.comments,
+                        name=golden.name,
+                        _dataset_alias=golden._dataset_alias,
+                        _dataset_id=golden._dataset_id,
                    )
-
-
-
-
-                        log_prompt(span, test_run_manager)
-                    elif isinstance(span, RetrieverSpan):
-                        trace_api.retriever_spans.append(api_span)
-                    elif isinstance(span, ToolSpan):
-                        trace_api.tool_spans.append(api_span)
-                    else:
-                        trace_api.base_spans.append(api_span)
-
-                    for child in span.children:
-                        dfs(child, progress, pbar_eval_id)
-
-                    if span.metrics is None:
-                        return
-                    has_task_completion = any(
-                        isinstance(metric, TaskCompletionMetric)
-                        for metric in span.metrics
+                    api_test_case = create_api_test_case(
+                        test_case=test_case,
+                        trace=trace_api,
+                        index=count if not _is_assert_test else None,
                    )
 
-
-
-
-
-
-
-
-
-                        )
-                        expected_output=span.expected_output,
-                        context=span.context,
-                        retrieval_context=span.retrieval_context,
-                        tools_called=span.tools_called,
-                        expected_tools=span.expected_tools,
+                    # DFS and trace metric evaluation
+                    def dfs(
+                        span: BaseSpan,
+                        progress: Optional[Progress] = None,
+                        pbar_eval_id: Optional[int] = None,
+                    ):
+                        metrics: List[BaseMetric] = list(span.metrics or [])
+                        api_span: BaseApiSpan = (
+                            trace_manager._convert_span_to_api_span(span)
                        )
-
-
+
+                        if isinstance(span, AgentSpan):
+                            trace_api.agent_spans.append(api_span)
+                        elif isinstance(span, LlmSpan):
+                            trace_api.llm_spans.append(api_span)
+                            log_prompt(span, test_run_manager)
+                        elif isinstance(span, RetrieverSpan):
+                            trace_api.retriever_spans.append(api_span)
+                        elif isinstance(span, ToolSpan):
+                            trace_api.tool_spans.append(api_span)
+                        else:
+                            trace_api.base_spans.append(api_span)
+
+                        if _skip_metrics_for_error(
+                            span=span, trace=current_trace
+                        ):
+                            api_span.status = TraceSpanApiStatus.ERRORED
+                            api_span.error = span.error or _trace_error(
+                                current_trace
+                            )
+                            if progress and pbar_eval_id is not None:
+                                update_pbar(
+                                    progress,
+                                    pbar_eval_id,
+                                    advance=count_metrics_in_span_subtree(span),
+                                )
+                            return
+
+                        # evaluate children first
+                        for child in span.children:
+                            dfs(child, progress, pbar_eval_id)
+
+                        # If there are no metrics, then there is nothing to do on this span.
+                        if not metrics:
+                            return
+
+                        has_task_completion = any(
+                            isinstance(metric, TaskCompletionMetric)
+                            for metric in metrics
                        )
 
-
-
-                            llm_test_case = LLMTestCase(input="None")
-                        llm_test_case._trace_dict = (
-                            trace_manager.create_nested_spans_dict(span)
+                        requires_trace = any(
+                            getattr(metric, "requires_trace", False)
+                            for metric in metrics
                        )
 
-
-
-
-
-
-
-
+                        llm_test_case = None
+                        if span.input is not None:
+                            llm_test_case = LLMTestCase(
+                                input=str(span.input),
+                                actual_output=(
+                                    str(span.output)
+                                    if span.output is not None
+                                    else None
+                                ),
+                                expected_output=span.expected_output,
+                                context=span.context,
+                                retrieval_context=span.retrieval_context,
+                                tools_called=span.tools_called,
+                                expected_tools=span.expected_tools,
+                            )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                        # If any metric needs a trace tree or a completion verdict, attach the trace
+                        if has_task_completion or requires_trace:
+                            if llm_test_case is None:
+                                llm_test_case = LLMTestCase(input="None")
+                            llm_test_case._trace_dict = (
+                                trace_manager.create_nested_spans_dict(span)
+                            )
+                        else:
+                            # Without a test case we cannot evaluate span metrics
+                            if llm_test_case is None:
+                                api_span.status = TraceSpanApiStatus.ERRORED
+                                api_span.error = format_error_text(
+                                    DeepEvalError(
+                                        "Span has metrics but no LLMTestCase. "
+                                        "Are you sure you called `update_current_span()`?"
+                                    )
+                                )
+                                if progress and pbar_eval_id is not None:
+                                    update_pbar(
+                                        progress,
+                                        pbar_eval_id,
+                                        advance=count_metrics_in_span_subtree(
+                                            span
+                                        ),
+                                    )
+                                return
+
+                        # Preparing metric calculation
+                        api_span.metrics_data = []
+                        for metric in metrics:
+                            metric.skipped = False
+                            metric.error = None
+                            if display_config.verbose_mode is not None:
+                                metric.verbose_mode = (
+                                    display_config.verbose_mode
+                                )
+
+                        # Metric calculation
+                        for metric in metrics:
+                            res = _execute_metric(
+                                metric=metric,
+                                test_case=llm_test_case,
+                                show_metric_indicator=show_metric_indicator,
+                                in_component=True,
+                                error_config=error_config,
+                            )
+                            if res == "skip":
+                                continue
+                            metric_data = create_metric_data(metric)
+                            api_span.metrics_data.append(metric_data)
+                            api_test_case.update_status(metric_data.success)
+                            update_pbar(progress, pbar_eval_id)
 
-
-
-
-
-
-
-
-
+                    trace_level_metrics_count = (
+                        len(current_trace.metrics)
+                        if current_trace and current_trace.metrics
+                        else 0
+                    )
+                    pbar_eval_id = add_pbar(
+                        progress,
+                        f"    🎯 Evaluating component(s) (#{count})",
+                        total=count_metrics_in_trace(trace=current_trace)
+                        + trace_level_metrics_count,
+                    )
+
+                    start_time = time.perf_counter()
+
+                    skip_metrics_for_this_golden = False
+                    if _skip_metrics_for_error(trace=current_trace):
+                        trace_api.status = TraceSpanApiStatus.ERRORED
+                        if progress and pbar_eval_id is not None:
+                            update_pbar(
+                                progress,
+                                pbar_eval_id,
+                                advance=count_total_metrics_for_trace(
+                                    current_trace
+                                ),
+                            )
+                    else:
+                        if current_trace and current_trace.metrics:
+                            has_task_completion = any(
+                                isinstance(metric, TaskCompletionMetric)
+                                for metric in current_trace.metrics
+                            )
+                            requires_trace = any(
+                                getattr(metric, "requires_trace", False)
+                                for metric in current_trace.metrics
+                            )
+                            llm_test_case = None
+                            if current_trace.input:
+                                llm_test_case = LLMTestCase(
+                                    input=str(current_trace.input),
+                                    actual_output=(
+                                        str(current_trace.output)
+                                        if current_trace.output is not None
+                                        else None
+                                    ),
+                                    expected_output=current_trace.expected_output,
+                                    context=current_trace.context,
+                                    retrieval_context=current_trace.retrieval_context,
+                                    tools_called=current_trace.tools_called,
+                                    expected_tools=current_trace.expected_tools,
+                                )
+                            if has_task_completion or requires_trace:
+                                if llm_test_case is None:
+                                    llm_test_case = LLMTestCase(input="None")
+                                llm_test_case._trace_dict = (
+                                    trace_manager.create_nested_spans_dict(
+                                        current_trace.root_spans[0]
+                                    )
+                                )
+                            else:
+                                if llm_test_case is None:
+                                    current_trace.status = (
+                                        TraceSpanStatus.ERRORED
+                                    )
+                                    trace_api.status = (
+                                        TraceSpanApiStatus.ERRORED
+                                    )
+                                    if current_trace.root_spans:
+                                        current_trace.root_spans[0].status = (
+                                            TraceSpanStatus.ERRORED
+                                        )
+                                        current_trace.root_spans[0].error = (
+                                            format_error_text(
+                                                DeepEvalError(
+                                                    "Trace has metrics but no LLMTestCase (missing input/output). "
+                                                    "Are you sure you called `update_current_trace()`?"
+                                                )
+                                            )
+                                        )
+                                    if progress and pbar_eval_id is not None:
+                                        update_pbar(
+                                            progress,
+                                            pbar_eval_id,
+                                            advance=count_total_metrics_for_trace(
+                                                current_trace
+                                            ),
+                                        )
+                                    skip_metrics_for_this_golden = True
+
+                            if not skip_metrics_for_this_golden:
+                                for metric in current_trace.metrics:
+                                    metric.skipped = False
+                                    metric.error = None
+                                    if display_config.verbose_mode is not None:
+                                        metric.verbose_mode = (
+                                            display_config.verbose_mode
+                                        )
+
+                                trace_api.metrics_data = []
+                                for metric in current_trace.metrics:
+                                    res = _execute_metric(
+                                        metric=metric,
+                                        test_case=llm_test_case,
+                                        show_metric_indicator=show_metric_indicator,
+                                        in_component=True,
+                                        error_config=error_config,
+                                    )
+                                    if res == "skip":
+                                        continue
+
+                                    if not metric.skipped:
+                                        metric_data = create_metric_data(metric)
+                                        trace_api.metrics_data.append(
+                                            metric_data
+                                        )
+                                        api_test_case.update_metric_data(
+                                            metric_data
+                                        )
+                                        api_test_case.update_status(
+                                            metric_data.success
+                                        )
+                                        emitted_trace.add(id(metric))
+                                    update_pbar(progress, pbar_eval_id)
+
+                    # handle span metrics
+                    dfs(
+                        current_trace.root_spans[0],
+                        progress,
+                        pbar_eval_id,
+                    )
+
+                    # TODO: Do I need this block, or is it duplicated in finally?
+                    end_time = time.perf_counter()
+                    run_duration = end_time - start_time
+                    api_test_case.update_run_duration(run_duration)
+                    test_run_manager.update_test_run(api_test_case, test_case)
+                    test_results.append(create_test_result(api_test_case))
+                    test_results.extend(extract_trace_test_results(trace_api))
+                    update_pbar(progress, pbar_id)
+                    pbar_case_increments += 1
+
+            # run the golden with a timeout
+            start_time = time.perf_counter()
+            deadline = _per_task_timeout()
+
+            try:
+                run_sync_with_timeout(_run_golden, deadline)
+            except (asyncio.TimeoutError, TimeoutError):
+                # mark any not yet finished trace level and span level metrics as timed out.
+                msg = (
+                    f"Timed out after {deadline:.2f}s while executing agentic test case. "
+                    "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+                    "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
                )
 
-
+                if current_trace is not None:
+                    # Trace-level metrics
+                    if getattr(current_trace, "metrics", None):
+                        for m in current_trace.metrics:
+                            if getattr(m, "skipped", False):
+                                continue
+                            # if already has a terminal state, leave it alone
+                            if getattr(
+                                m, "success", None
+                            ) is not None or getattr(m, "error", None):
+                                continue
+                            m.success = False
+                            m.error = msg
+
+                    # span level metrics, walk the tree
+                    def _walk(span):
+                        for child in getattr(span, "children", []) or []:
+                            _walk(child)
+                        for m in list(getattr(span, "metrics", []) or []):
+                            if getattr(m, "skipped", False):
+                                continue
+                            if getattr(
+                                m, "success", None
+                            ) is not None or getattr(m, "error", None):
+                                continue
+                            m.success = False
+                            m.error = msg
 
-
-
-                has_task_completion = any(
-                    isinstance(metric, TaskCompletionMetric)
-                    for metric in current_trace.metrics
-                )
+                    for root in getattr(current_trace, "root_spans", []) or []:
+                        _walk(root)
 
-
-
-
-
-
+            # raise if we are not ignoring errors
+            if not error_config.ignore_errors:
+                raise

+            finally:
+                try:
+                    # Ensure we have an api_test_case to attach results to.
+                    if api_test_case is None:
+                        # build a minimal test_case
+                        if test_case is None:
+                            out = (
                                str(current_trace.output)
-                                if
+                                if (
+                                    current_trace is not None
+                                    and current_trace.output is not None
+                                )
                                else None
-                            ),
-                            expected_output=current_trace.expected_output,
-                            context=current_trace.context,
-                            retrieval_context=current_trace.retrieval_context,
-                            tools_called=current_trace.tools_called,
-                            expected_tools=current_trace.expected_tools,
-                        )
-                if llm_test_case is None and not has_task_completion:
-                    raise ValueError(
-                        "Unable to run metrics on trace without LLMTestCase. Are you sure you called `update_current_trace()`?"
-                    )
-
-                if has_task_completion:
-                    if llm_test_case is None:
-                        llm_test_case = LLMTestCase(input="None")
-                    llm_test_case._trace_dict = (
-                        trace_manager.create_nested_spans_dict(
-                            current_trace.root_spans[0]
                            )
-
+                            test_case = LLMTestCase(
+                                input=golden.input,
+                                actual_output=out,
+                                expected_output=(
+                                    current_trace.expected_output
+                                    if current_trace
+                                    else None
+                                ),
+                                context=(
+                                    current_trace.context
+                                    if current_trace
+                                    else None
+                                ),
+                                retrieval_context=(
+                                    current_trace.retrieval_context
+                                    if current_trace
+                                    else None
+                                ),
+                                additional_metadata=golden.additional_metadata,
+                                tools_called=(
+                                    current_trace.tools_called
|
|
1576
|
+
if current_trace
|
|
1577
|
+
else None
|
|
1578
|
+
),
|
|
1579
|
+
expected_tools=(
|
|
1580
|
+
current_trace.expected_tools
|
|
1581
|
+
if current_trace
|
|
1582
|
+
else None
|
|
1583
|
+
),
|
|
1584
|
+
comments=golden.comments,
|
|
1585
|
+
name=golden.name,
|
|
1586
|
+
_dataset_alias=golden._dataset_alias,
|
|
1587
|
+
_dataset_id=golden._dataset_id,
|
|
1588
|
+
)
|
|
1027
1589
|
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
if display_config.verbose_mode is not None:
|
|
1032
|
-
metric.verbose_mode = display_config.verbose_mode
|
|
1590
|
+
# Create a trace API if we have a trace
|
|
1591
|
+
if trace_api is None and current_trace is not None:
|
|
1592
|
+
trace_api = create_api_trace(current_trace, golden)
|
|
1033
1593
|
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
test_case=llm_test_case,
|
|
1039
|
-
show_metric_indicator=show_metric_indicator,
|
|
1040
|
-
in_component=True,
|
|
1041
|
-
error_config=error_config,
|
|
1594
|
+
api_test_case = create_api_test_case(
|
|
1595
|
+
test_case=test_case,
|
|
1596
|
+
trace=trace_api,
|
|
1597
|
+
index=count if not _is_assert_test else None,
|
|
1042
1598
|
)
|
|
1043
|
-
if res == "skip":
|
|
1044
|
-
continue
|
|
1045
1599
|
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1600
|
+
if test_run is not None:
|
|
1601
|
+
test_run_manager.set_test_run(test_run)
|
|
1602
|
+
|
|
1603
|
+
if api_test_case.success is None:
|
|
1604
|
+
api_test_case.update_status(False)
|
|
1605
|
+
|
|
1606
|
+
# try to update metric data
|
|
1607
|
+
if current_trace is not None:
|
|
1608
|
+
if current_trace.metrics:
|
|
1609
|
+
for m in current_trace.metrics:
|
|
1610
|
+
if getattr(m, "skipped", False):
|
|
1611
|
+
continue
|
|
1612
|
+
if id(m) in emitted_trace:
|
|
1613
|
+
continue
|
|
1614
|
+
api_test_case.update_metric_data(
|
|
1615
|
+
create_metric_data(m)
|
|
1616
|
+
)
|
|
1617
|
+
|
|
1618
|
+
# Finalize duration and persist
|
|
1619
|
+
elapsed = time.perf_counter() - start_time
|
|
1620
|
+
api_test_case.update_run_duration(
|
|
1621
|
+
elapsed if elapsed >= 0 else deadline
|
|
1622
|
+
)
|
|
1623
|
+
|
|
1624
|
+
if (
|
|
1625
|
+
api_test_case.metrics_data == []
|
|
1626
|
+
and api_test_case.trace is None
|
|
1627
|
+
):
|
|
1628
|
+
api_test_case.metrics_data = None
|
|
1052
1629
|
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1630
|
+
test_run_manager.update_test_run(api_test_case, test_case)
|
|
1631
|
+
test_results.append(create_test_result(api_test_case))
|
|
1632
|
+
|
|
1633
|
+
if trace_api is not None:
|
|
1634
|
+
test_results.extend(
|
|
1635
|
+
extract_trace_test_results(trace_api)
|
|
1636
|
+
)
|
|
1057
1637
|
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
test_results.append(create_test_result(api_test_case))
|
|
1062
|
-
test_results.extend(extract_trace_test_results(trace_api))
|
|
1638
|
+
missing = 2 - pbar_case_increments
|
|
1639
|
+
if missing > 0:
|
|
1640
|
+
update_pbar(progress, pbar_id, advance=missing)
|
|
1063
1641
|
|
|
1064
|
-
|
|
1642
|
+
finally:
|
|
1643
|
+
# nothing to clean here, but keep symmetry with other paths
|
|
1644
|
+
pass
|
|
1065
1645
|
|
|
1066
1646
|
if display_config.show_indicator and _use_bar_indicator:
|
|
1067
1647
|
progress = Progress(
|
|
@@ -1102,7 +1682,10 @@ async def a_execute_agentic_test_cases(

 async def execute_with_semaphore(func: Callable, *args, **kwargs):
 async with semaphore:
-
+timeout = _per_task_timeout()
+return await _await_with_outer_deadline(
+func, *args, timeout=timeout, **kwargs
+)

 test_run_manager = global_test_run_manager
 test_run_manager.save_to_disk = cache_config.write_cache
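Editorial note, not part of the package diff: the `execute_with_semaphore` change above routes every semaphore-guarded awaitable through a per-task deadline via deepeval's internal `_per_task_timeout()` / `_await_with_outer_deadline()` helpers. A minimal sketch of that general pattern using only the standard library follows; `PER_TASK_TIMEOUT` and `run_with_semaphore` are illustrative names, not deepeval APIs.

```python
import asyncio

PER_TASK_TIMEOUT = 30.0  # stand-in for a configurable per-task deadline


async def run_with_semaphore(semaphore: asyncio.Semaphore, coro_factory, *args):
    # Limit concurrency with the semaphore, then bound the awaited work with a
    # deadline; wait_for cancels the inner coroutine once the timeout elapses.
    async with semaphore:
        return await asyncio.wait_for(coro_factory(*args), timeout=PER_TASK_TIMEOUT)


async def main():
    sem = asyncio.Semaphore(2)  # at most two test cases in flight

    async def fake_test_case(i: int) -> int:
        await asyncio.sleep(0.1)
        return i

    results = await asyncio.gather(
        *(run_with_semaphore(sem, fake_test_case, i) for i in range(5))
    )
    print(results)


if __name__ == "__main__":
    asyncio.run(main())
```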
@@ -1149,7 +1732,19 @@ async def a_execute_agentic_test_cases(
 tasks.append(asyncio.create_task(task))
 await asyncio.sleep(async_config.throttle_value)

-
+try:
+await asyncio.wait_for(
+asyncio.gather(*tasks),
+timeout=_gather_timeout(),
+)
+except (asyncio.TimeoutError, TimeoutError):
+# Cancel any still-pending tasks and drain them
+for t in tasks:
+if not t.done():
+t.cancel()
+await asyncio.gather(*tasks, return_exceptions=True)
+raise
+
 else:
 for golden in goldens:
 with capture_evaluation_run("golden"):
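Editorial note, not part of the package diff: the hunk above wraps the task gather in an overall timeout and, on expiry, cancels whatever is still pending and drains it with `return_exceptions=True` before re-raising. A self-contained sketch of that cancel-and-drain pattern, with illustrative names only:

```python
import asyncio


async def gather_with_deadline(coros, timeout: float):
    tasks = [asyncio.create_task(c) for c in coros]
    try:
        return await asyncio.wait_for(asyncio.gather(*tasks), timeout=timeout)
    except (asyncio.TimeoutError, TimeoutError):
        # Cancel whatever is still running, then drain so the cancellations are
        # observed and no "exception was never retrieved" warnings leak out.
        for t in tasks:
            if not t.done():
                t.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)
        raise


async def main():
    async def slow(delay: float) -> float:
        await asyncio.sleep(delay)
        return delay

    try:
        print(await gather_with_deadline([slow(0.01), slow(10)], timeout=0.1))
    except (asyncio.TimeoutError, TimeoutError):
        print("timed out; pending work was cancelled and drained")


if __name__ == "__main__":
    asyncio.run(main())
```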
@@ -1194,93 +1789,89 @@ async def _a_execute_agentic_test_case(
 progress: Optional[Progress] = None,
 pbar_id: Optional[int] = None,
 ):
-
-
-
-
-
-
-
+test_start_time = time.perf_counter()
+current_trace = None
+trace_api = None
+test_case = None
+api_test_case = None
+try:
+if observed_callback:
+total_tags = count_observe_decorators_in_module(observed_callback)
+pbar_tags_id = add_pbar(
+progress,
+f" ⚡ Invoking observed callback (#{count})",
+total=total_tags,
+)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# Call callback and extract trace
+with Observer(
+"custom",
+func_name="Test Wrapper",
+_progress=progress,
+_pbar_callback_id=pbar_tags_id,
+):
+# get current_trace right away, we need it even if cancelled
+current_trace: Trace = current_trace_context.get()
+if asyncio.iscoroutinefunction(observed_callback):
+await _await_with_outer_deadline(
+observed_callback,
+golden.input,
+timeout=_per_task_timeout(),
+)
+else:
+observed_callback(golden.input)

-
-
+update_pbar(progress, pbar_tags_id, advance=total_tags)
+update_pbar(progress, pbar_id)

-
-
+elif trace:
+current_trace = trace

-
-current_trace.metrics = trace_metrics
+trace_level_metrics_count = 0

-
-
+if trace_metrics:
+current_trace.metrics = trace_metrics

-
-
-)
+# run evals through DFS
+trace_api = create_api_trace(trace=current_trace, golden=golden)

-
-
-
-total=count_metrics_in_trace(trace=current_trace)
-+ trace_level_metrics_count,
-)
+trace_level_metrics_count = (
+len(current_trace.metrics) if current_trace.metrics else 0
+)

-
-
-
-
-
-
-),
-expected_output=current_trace.expected_output,
-context=current_trace.context,
-retrieval_context=current_trace.retrieval_context,
-tools_called=current_trace.tools_called,
-expected_tools=current_trace.expected_tools,
-additional_metadata=golden.additional_metadata,
-comments=golden.comments,
-name=golden.name,
-_dataset_alias=golden._dataset_alias,
-_dataset_id=golden._dataset_id,
-)
-api_test_case = create_api_test_case(
-test_case=test_case,
-trace=trace_api,
-index=count if not _is_assert_test else None,
-)
+pbar_eval_id = add_pbar(
+progress,
+f" 🎯 Evaluating component(s) (#{count})",
+total=count_metrics_in_trace(trace=current_trace)
++ trace_level_metrics_count,
+)

-
-
-
-
-
-
-
-
-
-
-
-
+test_case = LLMTestCase(
+input=golden.input,
+actual_output=(
+str(current_trace.output)
+if current_trace.output is not None
+else None
+),
+expected_output=current_trace.expected_output,
+context=current_trace.context,
+retrieval_context=current_trace.retrieval_context,
+tools_called=current_trace.tools_called,
+expected_tools=current_trace.expected_tools,
+additional_metadata=golden.additional_metadata,
+comments=golden.comments,
+name=golden.name,
+_dataset_alias=golden._dataset_alias,
+_dataset_id=golden._dataset_id,
+)
+api_test_case = create_api_test_case(
+test_case=test_case,
+trace=trace_api,
+index=count if not _is_assert_test else None,
+)

-
-
-span=span,
+await _a_execute_trace_test_case(
+trace=current_trace,
 trace_api=trace_api,
 api_test_case=api_test_case,
 ignore_errors=ignore_errors,
@@ -1289,39 +1880,155 @@ async def _a_execute_agentic_test_case(
 verbose_mode=verbose_mode,
 progress=progress,
 pbar_eval_id=pbar_eval_id,
-test_run_manager=test_run_manager,
 _use_bar_indicator=_use_bar_indicator,
 )
-child_tasks = [dfs(child) for child in span.children]
-if child_tasks:
-await asyncio.gather(*child_tasks)

-
-
-
-
-
-
-
-
-
-
-
+async def dfs(trace: Trace, span: BaseSpan):
+await _a_execute_span_test_case(
+span=span,
+current_trace=trace,
+trace_api=trace_api,
+api_test_case=api_test_case,
+ignore_errors=ignore_errors,
+skip_on_missing_params=skip_on_missing_params,
+show_indicator=show_indicator,
+verbose_mode=verbose_mode,
+progress=progress,
+pbar_eval_id=pbar_eval_id,
+test_run_manager=test_run_manager,
+_use_bar_indicator=_use_bar_indicator,
 )

-
-
+if _skip_metrics_for_error(span=span, trace=trace):
+return
+
+child_tasks = [
+asyncio.create_task(dfs(trace, child))
+for child in span.children
+]
+if child_tasks:
+try:
+await asyncio.wait_for(
+asyncio.gather(*child_tasks),
+timeout=_gather_timeout(),
+)
+except (asyncio.TimeoutError, TimeoutError):
+for t in child_tasks:
+if not t.done():
+t.cancel()
+await asyncio.gather(*child_tasks, return_exceptions=True)
+raise
+
+if not _skip_metrics_for_error(trace=current_trace):
+if current_trace and current_trace.root_spans:
+await dfs(current_trace, current_trace.root_spans[0])
+else:
+if (
+logger.isEnabledFor(logging.DEBUG)
+and get_settings().DEEPEVAL_VERBOSE_MODE
+):
+logger.debug(
+"Skipping DFS: empty trace or no root spans (trace=%s)",
+current_trace.uuid if current_trace else None,
+)
+except asyncio.CancelledError:
+# mark any unfinished metrics as cancelled
+cancel_msg = (
+"Timed out/cancelled while evaluating agentic test case. "
+"Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+"DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+)
+
+if trace_metrics:
+for m in trace_metrics:
+if getattr(m, "skipped", False):
+continue
+if getattr(m, "success", None) is None and not getattr(
+m, "error", None
+):
+m.success = False
+m.error = cancel_msg
+
+if trace is not None and trace.metrics:
+for m in trace.metrics:
+if getattr(m, "skipped", False):
+continue
+if getattr(m, "success", None) is None and not getattr(
+m, "error", None
+):
+m.success = False
+m.error = cancel_msg
+if not ignore_errors:
+raise
+finally:
+try:
+if api_test_case is None:
+if test_case is None:
+test_case = LLMTestCase(
+input=golden.input,
+actual_output=None,
+expected_output=None,
+context=None,
+retrieval_context=None,
+additional_metadata=golden.additional_metadata,
+tools_called=None,
+expected_tools=None,
+comments=golden.comments,
+name=golden.name,
+_dataset_alias=golden._dataset_alias,
+_dataset_id=golden._dataset_id,
+)
+if trace is not None and trace_api is None:
+trace_api = create_api_trace(trace, golden)

-
-
-
-
+api_test_case = create_api_test_case(
+test_case=test_case,
+trace=trace_api,
+index=(count if not _is_assert_test else None),
+)

-
+# attach MetricData for any trace metrics we marked above
+if trace_metrics:
+for m in trace_metrics:
+if getattr(m, "skipped", False):
+continue
+api_test_case.update_metric_data(create_metric_data(m))
+
+# If nothing set success yet, mark the case failed
+if api_test_case.success is None:
+api_test_case.update_status(False)
+
+# test_run_manager.update_test_run returns early if api_test_case.metrics_data is an empty list.
+# Set it to None to ensure the test_case is added
+if api_test_case.metrics_data == [] and api_test_case.trace is None:
+api_test_case.metrics_data = None
+
+# Duration & persist
+test_end_time = time.perf_counter()
+run_duration = test_end_time - test_start_time
+api_test_case.update_run_duration(run_duration)
+test_run_manager.update_test_run(api_test_case, test_case)
+
+# Build results and de-duplicate against trace results
+main_result = create_test_result(api_test_case)
+trace_results = (
+extract_trace_test_results(trace_api)
+if trace_api is not None
+else []
+)
+unique_trace_results = filter_duplicate_results(
+main_result, trace_results
+)
+test_results.append(main_result)
+test_results.extend(unique_trace_results)
+update_pbar(progress, pbar_id)
+finally:
+pass


 async def _a_execute_span_test_case(
 span: BaseSpan,
+current_trace: Trace,
 trace_api: TraceApi,
 api_test_case: LLMApiTestCase,
 ignore_errors: bool,
@@ -1346,12 +2053,22 @@ async def _a_execute_span_test_case(
 else:
 trace_api.base_spans.append(api_span)

-if span
+if _skip_metrics_for_error(span=span, trace=current_trace):
+api_span.status = TraceSpanApiStatus.ERRORED
+api_span.error = span.error or _trace_error(current_trace)
+if progress and pbar_eval_id is not None:
+update_pbar(
+progress,
+pbar_eval_id,
+advance=count_metrics_in_span_subtree(span),
+)
+return
+
+metrics: List[BaseMetric] = list(span.metrics or [])
+if not metrics:
 return

-
-isinstance(metric, TaskCompletionMetric) for metric in span.metrics
-)
+requires_trace = any(metric.requires_trace for metric in metrics)

 llm_test_case = None
 if span.input:
@@ -1364,17 +2081,29 @@ async def _a_execute_span_test_case(
 tools_called=span.tools_called,
 expected_tools=span.expected_tools,
 )
-
-
-
-
+
+if not requires_trace:
+if llm_test_case is None:
+api_span.status = TraceSpanApiStatus.ERRORED
+api_span.error = format_error_text(
+DeepEvalError(
+"Span has metrics but no LLMTestCase. "
+"Are you sure you called `update_current_span()`?"
+)
+)
+if progress and pbar_eval_id is not None:
+update_pbar(
+progress,
+pbar_eval_id,
+advance=count_metrics_in_span_subtree(span),
+)
+return

 show_metrics_indicator = show_indicator and not _use_bar_indicator
-metrics: List[BaseMetric] = span.metrics
 test_case: Optional[LLMTestCase] = llm_test_case

 # add trace if task completion
-if
+if requires_trace:
 if test_case is None:
 test_case = LLMTestCase(input="None")
 test_case._trace_dict = trace_manager.create_nested_spans_dict(span)
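Editorial note, not part of the package diff: the two span hunks above hinge on a single flag, whether any metric on the span has `requires_trace` set, to decide between evaluating against the nested trace dict and requiring a span-level `LLMTestCase`. A reduced sketch of that decision with stand-in classes (these are not deepeval's `BaseSpan`/`BaseMetric` types):

```python
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class Metric:
    name: str
    requires_trace: bool = False


@dataclass
class Span:
    metrics: List[Metric] = field(default_factory=list)
    input: Optional[str] = None
    error: Optional[str] = None


def plan_span_evaluation(span: Span) -> str:
    metrics = list(span.metrics or [])
    if not metrics:
        return "nothing to evaluate"
    if any(m.requires_trace for m in metrics):
        # trace-based metrics can run even without a span-level test case
        return "evaluate against the nested trace dict"
    if span.input is None:
        span.error = "Span has metrics but no LLMTestCase."
        return "mark span errored and skip its metrics"
    return "evaluate against an LLMTestCase built from the span"


print(plan_span_evaluation(Span(metrics=[Metric("tool use", requires_trace=True)])))
```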
@@ -1418,12 +2147,22 @@ async def _a_execute_trace_test_case(
 pbar_eval_id: Optional[int],
 _use_bar_indicator: bool,
 ):
-
+
+if _skip_metrics_for_error(trace=trace):
+trace_api.status = TraceSpanApiStatus.ERRORED
+if progress and pbar_eval_id is not None:
+update_pbar(
+progress,
+pbar_eval_id,
+advance=count_total_metrics_for_trace(trace),
+)
 return

-
-
-
+metrics: List[BaseMetric] = list(trace.metrics or [])
+if not metrics:
+return
+
+requires_trace = any(metric.requires_trace for metric in metrics)

 llm_test_case = None
 if trace.input:
@@ -1438,17 +2177,32 @@ async def _a_execute_trace_test_case(
 tools_called=trace.tools_called,
 expected_tools=trace.expected_tools,
 )
-
-
-
-
+
+if not requires_trace:
+if llm_test_case is None:
+trace.status = TraceSpanStatus.ERRORED
+trace_api.status = TraceSpanApiStatus.ERRORED
+if trace.root_spans:
+trace.root_spans[0].status = TraceSpanStatus.ERRORED
+trace.root_spans[0].error = format_error_text(
+DeepEvalError(
+"Trace has metrics but no LLMTestCase (missing input/output). "
+"Are you sure you called `update_current_trace()`?"
+)
+)
+if progress and pbar_eval_id is not None:
+update_pbar(
+progress,
+pbar_eval_id,
+advance=count_total_metrics_for_trace(trace),
+)
+return

 show_metrics_indicator = show_indicator and not _use_bar_indicator
-metrics: List[BaseMetric] = trace.metrics
 test_case: Optional[LLMTestCase] = llm_test_case

 # add trace if task completion
-if
+if requires_trace:
 if test_case is None:
 test_case = LLMTestCase(input="None")
 test_case._trace_dict = trace_manager.create_nested_spans_dict(
@@ -1578,11 +2332,12 @@ def execute_agentic_test_cases_from_loop(
 pbar_eval_id: Optional[int] = None,
 ):
 # Create API Span
-metrics: List[BaseMetric] = span.metrics
+metrics: List[BaseMetric] = list(span.metrics or [])

 api_span: BaseApiSpan = (
 trace_manager._convert_span_to_api_span(span)
 )
+
 if isinstance(span, AgentSpan):
 trace_api.agent_spans.append(api_span)
 elif isinstance(span, LlmSpan):
@@ -1595,9 +2350,30 @@ def execute_agentic_test_cases_from_loop(
 else:
 trace_api.base_spans.append(api_span)

+# Skip errored trace/span
+if _skip_metrics_for_error(span=span, trace=current_trace):
+api_span.status = TraceSpanApiStatus.ERRORED
+api_span.error = span.error or _trace_error(
+current_trace
+)
+if progress and pbar_eval_id is not None:
+update_pbar(
+progress,
+pbar_eval_id,
+advance=count_metrics_in_span_subtree(span),
+)
+return
+
 for child in span.children:
 dfs(child, progress, pbar_eval_id)

+if not span.metrics:
+return
+
+requires_trace = any(
+metric.requires_trace for metric in metrics
+)
+
 llm_test_case = None
 if span.input is not None:
 llm_test_case = LLMTestCase(
@@ -1613,20 +2389,29 @@ def execute_agentic_test_cases_from_loop(
 tools_called=span.tools_called,
 expected_tools=span.expected_tools,
 )
-if span.metrics is None or llm_test_case is None:
-return

-
-isinstance(metric, TaskCompletionMetric)
-for metric in metrics
-)
-
-if has_task_completion:
+if requires_trace:
 if llm_test_case is None:
 llm_test_case = LLMTestCase(input="None")
 llm_test_case._trace_dict = (
 trace_manager.create_nested_spans_dict(span)
 )
+else:
+if llm_test_case is None:
+api_span.status = TraceSpanApiStatus.ERRORED
+api_span.error = format_error_text(
+DeepEvalError(
+"Span has metrics but no LLMTestCase. "
+"Are you sure you called `update_current_span()`?"
+)
+)
+if progress and pbar_eval_id is not None:
+update_pbar(
+progress,
+pbar_eval_id,
+advance=count_metrics_in_span_subtree(span),
+)
+return

 # Preparing metric calculation
 api_span.metrics_data = []
@@ -1670,77 +2455,123 @@ def execute_agentic_test_cases_from_loop(
 start_time = time.perf_counter()

 # Handle trace-level metrics
-
-
-
-
-
-
-
-
-
-input=str(current_trace.input),
-actual_output=(
-str(current_trace.output)
-if current_trace.output is not None
-else None
+skip_metrics_for_this_golden = False
+if _skip_metrics_for_error(trace=current_trace):
+trace_api.status = TraceSpanApiStatus.ERRORED
+if progress and pbar_eval_id is not None:
+update_pbar(
+progress,
+pbar_eval_id,
+advance=count_total_metrics_for_trace(
+current_trace
 ),
-expected_output=current_trace.expected_output,
-context=current_trace.context,
-retrieval_context=current_trace.retrieval_context,
-tools_called=current_trace.tools_called,
-expected_tools=current_trace.expected_tools,
 )
-
-
-
+else:
+if current_trace.metrics:
+requires_trace = any(
+metric.requires_trace
+for metric in current_trace.metrics
 )

-
-if
-llm_test_case = LLMTestCase(
-
-
-
+llm_test_case = None
+if current_trace.input:
+llm_test_case = LLMTestCase(
+input=str(current_trace.input),
+actual_output=(
+str(current_trace.output)
+if current_trace.output is not None
+else None
+),
+expected_output=current_trace.expected_output,
+context=current_trace.context,
+retrieval_context=current_trace.retrieval_context,
+tools_called=current_trace.tools_called,
+expected_tools=current_trace.expected_tools,
 )
-)
-
-for metric in current_trace.metrics:
-metric.skipped = False
-metric.error = None
-if display_config.verbose_mode is not None:
-metric.verbose_mode = display_config.verbose_mode
-
-trace_api.metrics_data = []
-for metric in current_trace.metrics:
-res = _execute_metric(
-metric=metric,
-test_case=llm_test_case,
-show_metric_indicator=show_metric_indicator,
-in_component=True,
-error_config=error_config,
-)
-if res == "skip":
-continue
-
-if not metric.skipped:
-metric_data = create_metric_data(metric)
-trace_api.metrics_data.append(metric_data)
-api_test_case.update_metric_data(metric_data)
-api_test_case.update_status(metric_data.success)
-update_pbar(progress, pbar_eval_id)
-
-# Then handle span-level metrics
-dfs(current_trace.root_spans[0], progress, pbar_eval_id)
-end_time = time.perf_counter()
-run_duration = end_time - start_time

-
-
-
-
+if requires_trace:
+if llm_test_case is None:
+llm_test_case = LLMTestCase(input="None")
+llm_test_case._trace_dict = (
+trace_manager.create_nested_spans_dict(
+current_trace.root_spans[0]
+)
+)
+else:
+if llm_test_case is None:
+current_trace.status = TraceSpanStatus.ERRORED
+trace_api.status = TraceSpanApiStatus.ERRORED
+if current_trace.root_spans:
+current_trace.root_spans[0].status = (
+TraceSpanStatus.ERRORED
+)
+current_trace.root_spans[0].error = (
+format_error_text(
+DeepEvalError(
+"Trace has metrics but no LLMTestCase (missing input/output). "
+"Are you sure you called `update_current_trace()`?"
+)
+)
+)
+if progress and pbar_eval_id is not None:
+update_pbar(
+progress,
+pbar_eval_id,
+advance=count_total_metrics_for_trace(
+current_trace
+),
+)
+skip_metrics_for_this_golden = True
+
+if not skip_metrics_for_this_golden:
+for metric in current_trace.metrics:
+metric.skipped = False
+metric.error = None
+if display_config.verbose_mode is not None:
+metric.verbose_mode = (
+display_config.verbose_mode
+)
+
+trace_api.metrics_data = []
+for metric in current_trace.metrics:
+res = _execute_metric(
+metric=metric,
+test_case=llm_test_case,
+show_metric_indicator=show_metric_indicator,
+in_component=True,
+error_config=error_config,
+)
+if res == "skip":
+continue
+
+if not metric.skipped:
+metric_data = create_metric_data(metric)
+trace_api.metrics_data.append(metric_data)
+api_test_case.update_metric_data(
+metric_data
+)
+api_test_case.update_status(
+metric_data.success
+)
+update_pbar(progress, pbar_eval_id)
+
+# Then handle span-level metrics
+dfs(current_trace.root_spans[0], progress, pbar_eval_id)
+
+end_time = time.perf_counter()
+run_duration = end_time - start_time
+# Update test run
+api_test_case.update_run_duration(run_duration)
+test_run_manager.update_test_run(api_test_case, test_case)
+main_result = create_test_result(api_test_case)
+trace_results = extract_trace_test_results(trace_api)
+unique_trace_results = filter_duplicate_results(
+main_result, trace_results
+)
+test_results.append(main_result)
+test_results.extend(unique_trace_results)

-
+update_pbar(progress, pbar_id)

 try:
 if display_config.show_indicator and _use_bar_indicator:
@@ -1798,9 +2629,8 @@ def a_execute_agentic_test_cases_from_loop(

 async def execute_callback_with_semaphore(coroutine: Awaitable):
 async with semaphore:
-
-
-)
+timeout = _per_task_timeout()
+return await _await_with_outer_deadline(coroutine, timeout=timeout)

 def evaluate_test_cases(
 progress: Optional[Progress] = None,
@@ -1841,39 +2671,146 @@ def a_execute_agentic_test_cases_from_loop(
 }

 def on_task_done(t: asyncio.Task):
+cancelled = False
+exc = None
+trace = None
+root = None
+resolved_trace_from_task = False
+resolved_root_from_task = False
+
+# Task.exception() raises CancelledError if task was cancelled
+try:
+exc = t.exception()
+except asyncio.CancelledError:
+cancelled = True
+exc = None
+
+meta = task_meta.get(t, {})
+golden_index = meta.get("golden_index")
+
+if golden_index is not None and 0 <= golden_index < len(
+goldens
+):
+golden = goldens[golden_index]
+
+def _mark_trace_error(trace, root, msg: str):
+now = time.perf_counter()
+trace.status = TraceSpanStatus.ERRORED
+# Close the trace so the API layer has a proper endTime
+if trace.end_time is None:
+trace.end_time = now
+if root:
+root.status = TraceSpanStatus.ERRORED
+root.error = msg
+if root.end_time is None:
+root.end_time = now
+
+if exc is not None:
+msg = format_error_text(exc)
+trace, root = _resolve_trace_and_root_for_task(t)
+resolved_trace_from_task = bool(trace)
+resolved_root_from_task = bool(root)
+if trace:
+_mark_trace_error(trace, root, msg)
+else:
+for (
+trace
+) in trace_manager.integration_traces_to_evaluate:
+if (
+trace_manager.trace_uuid_to_golden.get(
+trace.uuid
+)
+is golden
+):
+root = _pick_root_for_marking(trace)
+_mark_trace_error(trace, root, msg)
+break
+
+elif cancelled or t.cancelled():
+cancel_exc = DeepEvalError(
+"Task was cancelled (likely due to timeout)."
+)
+msg = format_error_text(cancel_exc)
+trace, root = _resolve_trace_and_root_for_task(t)
+resolved_trace_from_task = bool(trace)
+resolved_root_from_task = bool(root)
+if trace:
+_mark_trace_error(trace, root, msg)
+else:
+for (
+trace
+) in trace_manager.integration_traces_to_evaluate:
+if (
+trace_manager.trace_uuid_to_golden.get(
+trace.uuid
+)
+is golden
+):
+root = _pick_root_for_marking(trace)
+_mark_trace_error(trace, root, msg)
+break
+
 if get_settings().DEEPEVAL_DEBUG_ASYNC:
 # Using info level here to make it easy to spot these logs.
-
-meta = task_meta.get(t, {})
+golden_name = meta.get("golden_name")
 duration = time.perf_counter() - meta.get(
 "started", started
 )

-if
+if cancelled or exc is not None:
+if not resolved_trace_from_task:
+logger.warning(
+"[deepeval] on_task_done: no binding for task; falling back to golden->trace. task=%s golden=%r",
+t.get_name(),
+golden_name,
+)
+elif not resolved_root_from_task:
+logger.warning(
+"[deepeval] on_task_done: bound trace found but no bound root; using heuristic. task=%s trace=%s",
+t.get_name(),
+trace.uuid,
+)
+
+if cancelled:
 logger.info(
 "[deepeval] task CANCELLED %s after %.2fs meta=%r",
 t.get_name(),
 duration,
 meta,
 )
-
-
-
-
-
-
-
-
-
-
-else:
-logger.info(
-"[deepeval] task OK %s after %.2fs meta={'golden_index': %r}",
-t.get_name(),
-duration,
-meta.get("golden_index"),
+elif exc is not None:
+
+show_trace = bool(
+get_settings().DEEPEVAL_LOG_STACK_TRACES
+)
+exc_info = (
+(
+type(exc),
+exc,
+getattr(exc, "__traceback__", None),
 )
+if show_trace
+else None
+)
+logger.error(
+"[deepeval] task ERROR %s after %.2fs meta=%r",
+t.get_name(),
+duration,
+meta,
+exc_info=exc_info,
+)
+else:
+logger.info(
+"[deepeval] task OK %s after %.2fs meta={'golden_index': %r}",
+t.get_name(),
+duration,
+meta.get("golden_index"),
+)

+try:
+trace_manager.task_bindings.pop(t, None)
+except Exception:
+pass
 update_pbar(progress, pbar_callback_id)
 update_pbar(progress, pbar_id)

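Editorial note, not part of the package diff: the reworked `on_task_done` above depends on the fact that `asyncio.Task.exception()` raises `CancelledError` when the task was cancelled, so the outcome has to be read behind a guard before deciding whether to log a cancellation, an error, or a success. A self-contained sketch of that guard (illustrative names, no deepeval internals):

```python
import asyncio


def on_task_done(t: asyncio.Task) -> None:
    cancelled = False
    exc = None
    try:
        exc = t.exception()  # raises CancelledError for cancelled tasks
    except asyncio.CancelledError:
        cancelled = True
    if cancelled:
        print(f"{t.get_name()}: cancelled")
    elif exc is not None:
        print(f"{t.get_name()}: failed with {exc!r}")
    else:
        print(f"{t.get_name()}: ok -> {t.result()!r}")


async def main():
    async def work(i: int) -> int:
        await asyncio.sleep(0.05 * i)
        if i == 2:
            raise ValueError("boom")
        return i

    tasks = [asyncio.create_task(work(i), name=f"task-{i}") for i in range(3)]
    for t in tasks:
        t.add_done_callback(on_task_done)
    await asyncio.gather(*tasks, return_exceptions=True)


if __name__ == "__main__":
    asyncio.run(main())
```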
@@ -1918,7 +2855,8 @@ def a_execute_agentic_test_cases_from_loop(
 timeout=_gather_timeout(),
 )
 )
-
+
+except (asyncio.TimeoutError, TimeoutError):
 import traceback

 pending = [t for t in created_tasks if not t.done()]
@@ -1987,10 +2925,11 @@ def a_execute_agentic_test_cases_from_loop(
 ]

 if get_settings().DEEPEVAL_DEBUG_ASYNC:
-
-
-
-
+if len(leftovers) > 0:
+logger.warning(
+"[deepeval] %d stray task(s) not tracked; cancelling...",
+len(leftovers),
+)
 for t in leftovers:
 meta = task_meta.get(t, {})
 name = t.get_name()
@@ -2130,7 +3069,10 @@ async def _a_evaluate_traces(

 async def execute_evals_with_semaphore(func: Callable, *args, **kwargs):
 async with semaphore:
-
+timeout = _per_task_timeout()
+return await _await_with_outer_deadline(
+func, *args, timeout=timeout, **kwargs
+)

 eval_tasks = []
 # Here, we will work off a fixed-set copy to avoid surprises from potential
@@ -2173,7 +3115,18 @@ async def _a_evaluate_traces(
 )
 eval_tasks.append(asyncio.create_task(task))
 await asyncio.sleep(throttle_value)
-
+
+try:
+await asyncio.wait_for(
+asyncio.gather(*eval_tasks),
+timeout=_gather_timeout(),
+)
+except (asyncio.TimeoutError, TimeoutError):
+for t in eval_tasks:
+if not t.done():
+t.cancel()
+await asyncio.gather(*eval_tasks, return_exceptions=True)
+raise


 async def _evaluate_test_case_pairs(
@@ -2196,7 +3149,10 @@ async def _evaluate_test_case_pairs(

 async def execute_with_semaphore(func: Callable, *args, **kwargs):
 async with semaphore:
-
+timeout = _per_task_timeout()
+return await _await_with_outer_deadline(
+func, *args, timeout=timeout, **kwargs
+)

 tasks = []
 for count, test_case_pair in enumerate(test_case_pairs):
@@ -2229,7 +3185,19 @@ async def _evaluate_test_case_pairs(
 )
 tasks.append(asyncio.create_task(task))
 await asyncio.sleep(throttle_value)
-
+
+try:
+await asyncio.wait_for(
+asyncio.gather(*tasks),
+timeout=_gather_timeout(),
+)
+except (asyncio.TimeoutError, TimeoutError):
+# Cancel any still-pending tasks and drain them
+for t in tasks:
+if not t.done():
+t.cancel()
+await asyncio.gather(*tasks, return_exceptions=True)
+raise


 def _execute_metric(
@@ -2248,10 +3216,13 @@ def _execute_metric(
 )
 except MissingTestCaseParamsError as e:
 if error_config.skip_on_missing_params:
+metric.skipped = True
+metric.error = None
+metric.success = None
 return "skip"
 else:
 if error_config.ignore_errors:
-metric.error =
+metric.error = format_error_text(e)
 metric.success = False
 else:
 raise
@@ -2260,22 +3231,25 @@ def _execute_metric(
 metric.measure(test_case)
 except MissingTestCaseParamsError as e:
 if error_config.skip_on_missing_params:
+metric.skipped = True
+metric.error = None
+metric.success = None
 return "skip"
 else:
 if error_config.ignore_errors:
-metric.error =
+metric.error = format_error_text(e)
 metric.success = False
 else:
 raise
 except Exception as e:
 if error_config.ignore_errors:
-metric.error =
+metric.error = format_error_text(e)
 metric.success = False
 else:
 raise
 except Exception as e:
 if error_config.ignore_errors:
-metric.error =
+metric.error = format_error_text(e)
 metric.success = False
 else:
 raise
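Editorial note, not part of the package diff: the `_execute_metric` changes above settle on a three-way contract: on `MissingTestCaseParamsError` with `skip_on_missing_params`, reset the metric to a clean skipped state; on other errors with `ignore_errors`, record a formatted error string and mark the metric failed; otherwise re-raise. A standalone sketch of that contract with stand-in types (`MissingParams` and `Metric` are not deepeval classes):

```python
class MissingParams(Exception):
    """Stand-in for deepeval's MissingTestCaseParamsError."""


class Metric:
    def __init__(self, fn):
        self.fn = fn
        self.skipped = False
        self.error = None
        self.success = None


def format_error_text(e: Exception) -> str:
    return f"{type(e).__name__}: {e}"


def execute_metric(metric: Metric, skip_on_missing_params: bool, ignore_errors: bool):
    try:
        metric.success = bool(metric.fn())
    except MissingParams as e:
        if skip_on_missing_params:
            # leave the metric in a clean "skipped" state instead of a failure
            metric.skipped = True
            metric.error = None
            metric.success = None
            return "skip"
        if ignore_errors:
            metric.error = format_error_text(e)
            metric.success = False
        else:
            raise
    except Exception as e:
        if ignore_errors:
            metric.error = format_error_text(e)
            metric.success = False
        else:
            raise
    return "done"


def needs_params():
    raise MissingParams("actual_output is required")


m = Metric(needs_params)
print(execute_metric(m, skip_on_missing_params=True, ignore_errors=False), m.skipped)
```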