deepeval-3.6.6-py3-none-any.whl → deepeval-3.6.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/settings.py +22 -4
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/errors.py +20 -2
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +811 -248
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +33 -119
- deepeval/integrations/crewai/__init__.py +7 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +11 -0
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +20 -9
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +20 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +4 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +85 -50
- deepeval/openai/patch.py +258 -167
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +205 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +245 -62
- deepeval/prompt/utils.py +186 -15
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +19 -15
- deepeval/test_case/api.py +131 -0
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +292 -206
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +24 -5
- deepeval/tracing/trace_context.py +89 -5
- deepeval/tracing/tracing.py +74 -3
- deepeval/tracing/types.py +20 -2
- deepeval/tracing/utils.py +8 -0
- deepeval/utils.py +21 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
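The bulk of this release is the rewritten evaluation executor (`deepeval/evaluate/execute.py`, shown below). One pattern recurs throughout it: work is bounded twice, with a per-task timeout on each semaphore-guarded coroutine and an overall timeout on the surrounding gather, whose handler cancels and drains any stragglers before re-raising. A minimal, self-contained sketch of that pattern follows; the names (`run_all`, the timeout constants, the concurrency limit) are illustrative stand-ins, not deepeval's internal helpers.

```python
import asyncio
from typing import Awaitable, Callable, List

# Illustrative values; deepeval derives these from settings.
PER_TASK_TIMEOUT = 60.0
GATHER_TIMEOUT = 600.0


async def run_all(funcs: List[Callable[[], Awaitable[None]]]) -> None:
    semaphore = asyncio.Semaphore(10)

    async def execute_with_semaphore(func: Callable[[], Awaitable[None]]):
        async with semaphore:
            # Bound each unit of work individually...
            return await asyncio.wait_for(func(), timeout=PER_TASK_TIMEOUT)

    tasks = [asyncio.create_task(execute_with_semaphore(f)) for f in funcs]
    try:
        # ...and bound the whole batch.
        await asyncio.wait_for(asyncio.gather(*tasks), timeout=GATHER_TIMEOUT)
    except asyncio.TimeoutError:
        # Cancel any still-pending tasks and drain them, then re-raise.
        for t in tasks:
            if not t.done():
                t.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)
        raise
```

Draining with `return_exceptions=True` after cancelling matters: it awaits the cancelled tasks so their `CancelledError`s are consumed rather than leaking "Task exception was never retrieved" warnings into the next event-loop iteration, which is the same reason the diff below drains stray tasks before continuing.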
deepeval/evaluate/execute.py
CHANGED
@@ -43,15 +43,19 @@ from deepeval.tracing.api import (
 )
 from deepeval.dataset import Golden
 from deepeval.contextvars import set_current_golden, reset_current_golden
-from deepeval.errors import MissingTestCaseParamsError
+from deepeval.errors import MissingTestCaseParamsError, DeepEvalError
 from deepeval.metrics.utils import copy_metrics
-from deepeval.utils import
+from deepeval.utils import (
+    get_or_create_event_loop,
+    shorten,
+    len_medium,
+    format_error_text,
+)
 from deepeval.telemetry import capture_evaluation_run
 from deepeval.metrics import (
     BaseMetric,
     BaseConversationalMetric,
     BaseMultimodalMetric,
-    TaskCompletionMetric,
 )
 from deepeval.metrics.indicator import (
     measure_metrics_with_indicator,
@@ -61,6 +65,7 @@ from deepeval.test_case import (
     ConversationalTestCase,
     MLLMTestCase,
 )
+from deepeval.test_case.api import create_api_test_case
 from deepeval.test_run import (
     global_test_run_manager,
     LLMApiTestCase,
@@ -80,19 +85,127 @@ from deepeval.evaluate.utils import (
     create_api_trace,
     create_metric_data,
     create_test_result,
-    create_api_test_case,
     count_metrics_in_trace,
+    count_total_metrics_for_trace,
+    count_metrics_in_span_subtree,
     extract_trace_test_results,
 )
 from deepeval.utils import add_pbar, update_pbar, custom_console
-from deepeval.
-from deepeval.tracing.
+from deepeval.tracing.types import TestCaseMetricPair, TraceSpanStatus
+from deepeval.tracing.api import TraceSpanApiStatus
 from deepeval.config.settings import get_settings
-
+from deepeval.test_run import TEMP_FILE_PATH
+from deepeval.confident.api import is_confident
+from deepeval.test_run.hyperparameters import (
+    process_hyperparameters,
+    process_prompts,
+)
 
 logger = logging.getLogger(__name__)
 
 
+def _skip_metrics_for_error(
+    span: Optional[BaseSpan] = None,
+    trace: Optional[Trace] = None,
+) -> bool:
+    # trace failure: skip everything under this trace
+    if trace is not None and trace.status == TraceSpanStatus.ERRORED:
+        return True
+    # span failure: skip this span’s metrics
+    if span is not None and span.status == TraceSpanStatus.ERRORED:
+        return True
+    return False
+
+
+def _trace_error(current_trace: Trace) -> Optional[str]:
+    def _first_err(s: BaseSpan) -> Optional[str]:
+        if s.status == TraceSpanStatus.ERRORED and s.error:
+            return s.error
+        for c in s.children or []:
+            e = _first_err(c)
+            if e:
+                return e
+        return None
+
+    for root in current_trace.root_spans or []:
+        e = _first_err(root)
+        if e:
+            return e
+    return None
+
+
+def _get_trace_by_uuid_anywhere(trace_uuid: str):
+    """
+    Resolver for a trace UUID across the manager's state.
+
+    First tries the manager's indexed lookup, which covers active/in-flight
+    traces, then does a linear scan of the full `trace_manager.traces` list,
+    which covers traces that were recorded/closed earlier or not yet indexed.
+    Returns the concrete Trace object or None if not found.
+    """
+    tr = trace_manager.get_trace_by_uuid(trace_uuid)
+    if tr:
+        return tr
+    for tr in trace_manager.traces:
+        if tr.uuid == trace_uuid:
+            return tr
+    return None
+
+
+def _pick_root_for_marking(trace):
+    """
+    Choose the most appropriate root span to annotate on error/cancel.
+
+    Heuristic:
+    - Prefer the most recent open root, which will have no `end_time` since
+      this is the span currently in flight.
+    - If none are open, use the last root span if it exists.
+    - If the trace has no roots, return None.
+
+    This favors marking the active root in multi-root traces while remaining
+    stable for already closed traces.
+    """
+    open_roots = [rs for rs in trace.root_spans if rs.end_time is None]
+    return (
+        open_roots[-1]
+        if open_roots
+        else (trace.root_spans[-1] if trace.root_spans else None)
+    )
+
+
+def _resolve_trace_and_root_for_task(t: asyncio.Task):
+    """
+    Resolve trace and root for a completed task using the weak binding map.
+
+    Steps:
+    1. Look up the task in `trace_manager.task_bindings` to get the
+       bound `trace_uuid` and, if available, `root_span_uuid`.
+    2. Resolve the Trace with `_get_trace_by_uuid_anywhere`.
+    3. If a bound root UUID exists, try to find that exact root on the trace.
+    4. Otherwise, fall back to `_pick_root_for_marking(trace)`.
+
+    Returns a trace / root tuple. Either may be `None` when no binding is
+    present. This function is used by `on_task_done` to robustly mark
+    error/cancel states without assuming a single root trace or a root that
+    is still open.
+    """
+    binding = trace_manager.task_bindings.get(t) or {}
+    trace_uuid = binding.get("trace_uuid")
+    root_span_uuid = binding.get("root_span_uuid")
+
+    trace = _get_trace_by_uuid_anywhere(trace_uuid) if trace_uuid else None
+    root = None
+
+    if trace and root_span_uuid:
+        root = next(
+            (rs for rs in trace.root_spans if rs.uuid == root_span_uuid), None
+        )
+
+    if trace and root is None:
+        root = _pick_root_for_marking(trace)
+
+    return trace, root
+
+
 async def _snapshot_tasks():
     cur = asyncio.current_task()
     # `all_tasks` returns tasks for the current running loop only
@@ -111,6 +224,20 @@ def _gather_timeout() -> float:
     )
 
 
+def filter_duplicate_results(
+    main_result: TestResult, results: List[TestResult]
+) -> List[TestResult]:
+    return [
+        result
+        for result in results
+        if not (
+            (result.input == main_result.input)
+            and (result.actual_output == main_result.actual_output)
+            and (result.metrics_data == main_result.metrics_data)
+        )
+    ]
+
+
 ###########################################
 ### E2E Evals #############################
 ###########################################
@@ -376,7 +503,10 @@ async def a_execute_test_cases(
 
     async def execute_with_semaphore(func: Callable, *args, **kwargs):
         async with semaphore:
-            return await
+            return await asyncio.wait_for(
+                func(*args, **kwargs),
+                timeout=_per_task_timeout(),
+            )
 
     global_test_run_cache_manager.disable_write_cache = (
         cache_config.write_cache is False
@@ -495,7 +625,20 @@ async def a_execute_test_cases(
                 tasks.append(asyncio.create_task(task))
 
                 await asyncio.sleep(async_config.throttle_value)
-
+
+        try:
+            await asyncio.wait_for(
+                asyncio.gather(*tasks),
+                timeout=_gather_timeout(),
+            )
+        except asyncio.TimeoutError:
+            # Cancel any still-pending tasks and drain them
+            for t in tasks:
+                if not t.done():
+                    t.cancel()
+            await asyncio.gather(*tasks, return_exceptions=True)
+            raise
+
     else:
         for test_case in test_cases:
             with capture_evaluation_run("test case"):
@@ -568,7 +711,19 @@ async def a_execute_test_cases(
                 tasks.append(asyncio.create_task(task))

                 await asyncio.sleep(async_config.throttle_value)
-
+
+        try:
+            await asyncio.wait_for(
+                asyncio.gather(*tasks),
+                timeout=_gather_timeout(),
+            )
+        except asyncio.TimeoutError:
+            # Cancel any still-pending tasks and drain them
+            for t in tasks:
+                if not t.done():
+                    t.cancel()
+            await asyncio.gather(*tasks, return_exceptions=True)
+            raise
 
     return test_results
 
@@ -843,6 +998,7 @@ def execute_agentic_test_cases(
                 _progress=progress,
                 _pbar_callback_id=pbar_tags_id,
             ):
+
                 if asyncio.iscoroutinefunction(observed_callback):
                     loop = get_or_create_event_loop()
                     coro = observed_callback(golden.input)
@@ -894,14 +1050,16 @@ def execute_agentic_test_cases(
             pbar_eval_id: Optional[int] = None,
         ):
             # Create API Span
-            metrics: List[BaseMetric] = span.metrics
+            metrics: List[BaseMetric] = list(span.metrics or [])
             api_span: BaseApiSpan = (
                 trace_manager._convert_span_to_api_span(span)
             )
+
             if isinstance(span, AgentSpan):
                 trace_api.agent_spans.append(api_span)
             elif isinstance(span, LlmSpan):
                 trace_api.llm_spans.append(api_span)
+                log_prompt(span, test_run_manager)
             elif isinstance(span, RetrieverSpan):
                 trace_api.retriever_spans.append(api_span)
             elif isinstance(span, ToolSpan):
@@ -909,14 +1067,27 @@ def execute_agentic_test_cases(
             else:
                 trace_api.base_spans.append(api_span)
 
+            # Skip errored trace/span
+            if _skip_metrics_for_error(span=span, trace=current_trace):
+                api_span.status = TraceSpanApiStatus.ERRORED
+                api_span.error = span.error or _trace_error(
+                    current_trace
+                )
+                if progress and pbar_eval_id is not None:
+                    update_pbar(
+                        progress,
+                        pbar_eval_id,
+                        advance=count_metrics_in_span_subtree(span),
+                    )
+                return
+
             for child in span.children:
                 dfs(child, progress, pbar_eval_id)
 
-            if span.metrics
+            if not span.metrics:
                 return
-
-
-            for metric in span.metrics
+            requires_trace = any(
+                metric.requires_trace for metric in span.metrics
             )
 
             llm_test_case = None
@@ -934,18 +1105,30 @@ def execute_agentic_test_cases(
                     tools_called=span.tools_called,
                     expected_tools=span.expected_tools,
                 )
-            if llm_test_case is None and not has_task_completion:
-                raise ValueError(
-                    "Unable to run metrics on span without LLMTestCase. Are you sure you called `update_current_span()`?"
-                )
 
             # add trace if task completion
-            if
+            if requires_trace:
                 if llm_test_case is None:
                     llm_test_case = LLMTestCase(input="None")
                 llm_test_case._trace_dict = (
                     trace_manager.create_nested_spans_dict(span)
                 )
+            else:
+                if llm_test_case is None:
+                    api_span.status = TraceSpanApiStatus.ERRORED
+                    api_span.error = format_error_text(
+                        DeepEvalError(
+                            "Span has metrics but no LLMTestCase. "
+                            "Are you sure you called `update_current_span()`?"
+                        )
+                    )
+                    if progress and pbar_eval_id is not None:
+                        update_pbar(
+                            progress,
+                            pbar_eval_id,
+                            advance=count_metrics_in_span_subtree(span),
+                        )
+                    return
 
             # Preparing metric calculation
             api_span.metrics_data = []
@@ -984,72 +1167,111 @@ def execute_agentic_test_cases(
 
                 start_time = time.perf_counter()
 
+                skip_metrics_for_this_golden = False
                 # Handle trace-level metrics
-                if current_trace
-
-
-
-
-
-
-
-                    llm_test_case = LLMTestCase(
-                        input=str(current_trace.input),
-                        actual_output=(
-                            str(current_trace.output)
-                            if current_trace.output is not None
-                            else None
+                if _skip_metrics_for_error(trace=current_trace):
+                    trace_api.status = TraceSpanApiStatus.ERRORED
+                    if progress and pbar_eval_id is not None:
+                        update_pbar(
+                            progress,
+                            pbar_eval_id,
+                            advance=count_total_metrics_for_trace(
+                                current_trace
                             ),
-                        expected_output=current_trace.expected_output,
-                        context=current_trace.context,
-                        retrieval_context=current_trace.retrieval_context,
-                        tools_called=current_trace.tools_called,
-                        expected_tools=current_trace.expected_tools,
                         )
-
-
-
+                else:
+                    if current_trace.metrics:
+                        requires_trace = any(
+                            metric.requires_trace
+                            for metric in current_trace.metrics
                         )
 
-
-                    if
-                    llm_test_case = LLMTestCase(
-
-
-
+                        llm_test_case = None
+                        if current_trace.input:
+                            llm_test_case = LLMTestCase(
+                                input=str(current_trace.input),
+                                actual_output=(
+                                    str(current_trace.output)
+                                    if current_trace.output is not None
+                                    else None
+                                ),
+                                expected_output=current_trace.expected_output,
+                                context=current_trace.context,
+                                retrieval_context=current_trace.retrieval_context,
+                                tools_called=current_trace.tools_called,
+                                expected_tools=current_trace.expected_tools,
                             )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                        if requires_trace:
+                            if llm_test_case is None:
+                                llm_test_case = LLMTestCase(input="None")
+                            llm_test_case._trace_dict = (
+                                trace_manager.create_nested_spans_dict(
+                                    current_trace.root_spans[0]
+                                )
+                            )
+                        else:
+                            if llm_test_case is None:
+                                current_trace.status = TraceSpanStatus.ERRORED
+                                trace_api.status = TraceSpanApiStatus.ERRORED
+                                if current_trace.root_spans:
+                                    current_trace.root_spans[0].status = (
+                                        TraceSpanStatus.ERRORED
+                                    )
+                                    current_trace.root_spans[0].error = (
+                                        format_error_text(
+                                            DeepEvalError(
+                                                "Trace has metrics but no LLMTestCase (missing input/output). "
+                                                "Are you sure you called `update_current_trace()`?"
+                                            )
+                                        )
+                                    )
+                                if progress and pbar_eval_id is not None:
+                                    update_pbar(
+                                        progress,
+                                        pbar_eval_id,
+                                        advance=count_total_metrics_for_trace(
+                                            current_trace
+                                        ),
+                                    )
+                                skip_metrics_for_this_golden = True
+
+                        if not skip_metrics_for_this_golden:
+                            for metric in current_trace.metrics:
+                                metric.skipped = False
+                                metric.error = None
+                                if display_config.verbose_mode is not None:
+                                    metric.verbose_mode = (
+                                        display_config.verbose_mode
+                                    )
+
+                            trace_api.metrics_data = []
+                            for metric in current_trace.metrics:
+                                res = _execute_metric(
+                                    metric=metric,
+                                    test_case=llm_test_case,
+                                    show_metric_indicator=show_metric_indicator,
+                                    in_component=True,
+                                    error_config=error_config,
+                                )
+                                if res == "skip":
+                                    continue
+
+                                if not metric.skipped:
+                                    metric_data = create_metric_data(metric)
+                                    trace_api.metrics_data.append(metric_data)
+                                    api_test_case.update_metric_data(
+                                        metric_data
+                                    )
+                                    api_test_case.update_status(
+                                        metric_data.success
+                                    )
+                                    update_pbar(progress, pbar_eval_id)
+
+                    # Then handle span-level metrics
+                    dfs(current_trace.root_spans[0], progress, pbar_eval_id)
 
-                # Then handle span-level metrics
-                dfs(current_trace.root_spans[0], progress, pbar_eval_id)
                 end_time = time.perf_counter()
                 run_duration = end_time - start_time
-
                 # Update test run
                 api_test_case.update_run_duration(run_duration)
                 test_run_manager.update_test_run(api_test_case, test_case)
@@ -1097,7 +1319,10 @@ async def a_execute_agentic_test_cases(
 
     async def execute_with_semaphore(func: Callable, *args, **kwargs):
         async with semaphore:
-            return await
+            return await asyncio.wait_for(
+                func(*args, **kwargs),
+                timeout=_per_task_timeout(),
+            )
 
     test_run_manager = global_test_run_manager
     test_run_manager.save_to_disk = cache_config.write_cache
@@ -1144,7 +1369,19 @@ async def a_execute_agentic_test_cases(
                 tasks.append(asyncio.create_task(task))
                 await asyncio.sleep(async_config.throttle_value)
 
-
+
+        try:
+            await asyncio.wait_for(
+                asyncio.gather(*tasks),
+                timeout=_gather_timeout(),
+            )
+        except asyncio.TimeoutError:
+            # Cancel any still-pending tasks and drain them
+            for t in tasks:
+                if not t.done():
+                    t.cancel()
+            await asyncio.gather(*tasks, return_exceptions=True)
+            raise
     else:
         for golden in goldens:
            with capture_evaluation_run("golden"):
@@ -1261,7 +1498,7 @@ async def _a_execute_agentic_test_case(
     )
 
     await _a_execute_trace_test_case(
-        trace=
+        trace=current_trace,
         trace_api=trace_api,
         api_test_case=api_test_case,
         ignore_errors=ignore_errors,
@@ -1273,9 +1510,10 @@ async def _a_execute_agentic_test_case(
         _use_bar_indicator=_use_bar_indicator,
     )
 
-    async def dfs(span: BaseSpan):
+    async def dfs(trace: Trace, span: BaseSpan):
        await _a_execute_span_test_case(
            span=span,
+            current_trace=trace,
            trace_api=trace_api,
            api_test_case=api_test_case,
            ignore_errors=ignore_errors,
@@ -1284,27 +1522,61 @@ async def _a_execute_agentic_test_case(
            verbose_mode=verbose_mode,
            progress=progress,
            pbar_eval_id=pbar_eval_id,
+            test_run_manager=test_run_manager,
            _use_bar_indicator=_use_bar_indicator,
        )
-
+
+        if _skip_metrics_for_error(span=span, trace=trace):
+            return
+
+        child_tasks = [
+            asyncio.create_task(dfs(trace, child)) for child in span.children
+        ]
        if child_tasks:
-
+            try:
+                await asyncio.wait_for(
+                    asyncio.gather(*child_tasks),
+                    timeout=_gather_timeout(),
+                )
+            except asyncio.TimeoutError:
+                for t in child_tasks:
+                    if not t.done():
+                        t.cancel()
+                await asyncio.gather(*child_tasks, return_exceptions=True)
+                raise
 
     test_start_time = time.perf_counter()
-
+
+    if not _skip_metrics_for_error(trace=current_trace):
+        if current_trace and current_trace.root_spans:
+            await dfs(current_trace, current_trace.root_spans[0])
+        else:
+            if (
+                logger.isEnabledFor(logging.DEBUG)
+                and get_settings().DEEPEVAL_VERBOSE_MODE
+            ):
+                logger.debug(
+                    "Skipping DFS: empty trace or no root spans (trace=%s)",
+                    current_trace.uuid if current_trace else None,
+                )
+
    test_end_time = time.perf_counter()
    run_duration = test_end_time - test_start_time
 
    api_test_case.update_run_duration(run_duration)
    test_run_manager.update_test_run(api_test_case, test_case)
-
-
+    main_result = create_test_result(api_test_case)
+    trace_results = extract_trace_test_results(trace_api)
+    unique_trace_results = filter_duplicate_results(main_result, trace_results)
+    test_results.append(main_result)
+    test_results.extend(unique_trace_results)
 
    update_pbar(progress, pbar_id)
 
 
 async def _a_execute_span_test_case(
     span: BaseSpan,
+    current_trace: Trace,
     trace_api: TraceApi,
     api_test_case: LLMApiTestCase,
     ignore_errors: bool,
@@ -1313,6 +1585,7 @@ async def _a_execute_span_test_case(
     verbose_mode: Optional[bool],
     progress: Optional[Progress],
     pbar_eval_id: Optional[int],
+    test_run_manager: Optional[TestRunManager],
     _use_bar_indicator: bool,
 ):
     api_span: BaseApiSpan = trace_manager._convert_span_to_api_span(span)
@@ -1320,6 +1593,7 @@ async def _a_execute_span_test_case(
         trace_api.agent_spans.append(api_span)
     elif isinstance(span, LlmSpan):
         trace_api.llm_spans.append(api_span)
+        log_prompt(span, test_run_manager)
     elif isinstance(span, RetrieverSpan):
         trace_api.retriever_spans.append(api_span)
     elif isinstance(span, ToolSpan):
@@ -1327,12 +1601,22 @@ async def _a_execute_span_test_case(
     else:
         trace_api.base_spans.append(api_span)
 
-    if span
+    if _skip_metrics_for_error(span=span, trace=current_trace):
+        api_span.status = TraceSpanApiStatus.ERRORED
+        api_span.error = span.error or _trace_error(current_trace)
+        if progress and pbar_eval_id is not None:
+            update_pbar(
+                progress,
+                pbar_eval_id,
+                advance=count_metrics_in_span_subtree(span),
+            )
         return
 
-
-
-
+    metrics: List[BaseMetric] = list(span.metrics or [])
+    if not metrics:
+        return
+
+    requires_trace = any(metric.requires_trace for metric in metrics)
 
     llm_test_case = None
     if span.input:
@@ -1345,17 +1629,29 @@ async def _a_execute_span_test_case(
             tools_called=span.tools_called,
             expected_tools=span.expected_tools,
         )
-
-
-
-
+
+    if not requires_trace:
+        if llm_test_case is None:
+            api_span.status = TraceSpanApiStatus.ERRORED
+            api_span.error = format_error_text(
+                DeepEvalError(
+                    "Span has metrics but no LLMTestCase. "
+                    "Are you sure you called `update_current_span()`?"
+                )
+            )
+            if progress and pbar_eval_id is not None:
+                update_pbar(
+                    progress,
+                    pbar_eval_id,
+                    advance=count_metrics_in_span_subtree(span),
+                )
+            return
 
     show_metrics_indicator = show_indicator and not _use_bar_indicator
-    metrics: List[BaseMetric] = span.metrics
     test_case: Optional[LLMTestCase] = llm_test_case
 
     # add trace if task completion
-    if
+    if requires_trace:
         if test_case is None:
             test_case = LLMTestCase(input="None")
         test_case._trace_dict = trace_manager.create_nested_spans_dict(span)
@@ -1399,12 +1695,22 @@ async def _a_execute_trace_test_case(
     pbar_eval_id: Optional[int],
     _use_bar_indicator: bool,
 ):
-
+
+    if _skip_metrics_for_error(trace=trace):
+        trace_api.status = TraceSpanApiStatus.ERRORED
+        if progress and pbar_eval_id is not None:
+            update_pbar(
+                progress,
+                pbar_eval_id,
+                advance=count_total_metrics_for_trace(trace),
+            )
         return
 
-
-
-
+    metrics: List[BaseMetric] = list(trace.metrics or [])
+    if not metrics:
+        return
+
+    requires_trace = any(metric.requires_trace for metric in metrics)
 
     llm_test_case = None
     if trace.input:
@@ -1419,17 +1725,32 @@ async def _a_execute_trace_test_case(
             tools_called=trace.tools_called,
             expected_tools=trace.expected_tools,
         )
-
-
-
-
+
+    if not requires_trace:
+        if llm_test_case is None:
+            trace.status = TraceSpanStatus.ERRORED
+            trace_api.status = TraceSpanApiStatus.ERRORED
+            if trace.root_spans:
+                trace.root_spans[0].status = TraceSpanStatus.ERRORED
+                trace.root_spans[0].error = format_error_text(
+                    DeepEvalError(
+                        "Trace has metrics but no LLMTestCase (missing input/output). "
+                        "Are you sure you called `update_current_trace()`?"
+                    )
+                )
+            if progress and pbar_eval_id is not None:
+                update_pbar(
+                    progress,
+                    pbar_eval_id,
+                    advance=count_total_metrics_for_trace(trace),
+                )
+            return
 
     show_metrics_indicator = show_indicator and not _use_bar_indicator
-    metrics: List[BaseMetric] = trace.metrics
     test_case: Optional[LLMTestCase] = llm_test_case
 
     # add trace if task completion
-    if
+    if requires_trace:
         if test_case is None:
             test_case = LLMTestCase(input="None")
         test_case._trace_dict = trace_manager.create_nested_spans_dict(
@@ -1559,15 +1880,17 @@ def execute_agentic_test_cases_from_loop(
             pbar_eval_id: Optional[int] = None,
         ):
             # Create API Span
-            metrics: List[BaseMetric] = span.metrics
+            metrics: List[BaseMetric] = list(span.metrics or [])
 
             api_span: BaseApiSpan = (
                 trace_manager._convert_span_to_api_span(span)
             )
+
             if isinstance(span, AgentSpan):
                 trace_api.agent_spans.append(api_span)
             elif isinstance(span, LlmSpan):
                 trace_api.llm_spans.append(api_span)
+                log_prompt(span, test_run_manager)
             elif isinstance(span, RetrieverSpan):
                 trace_api.retriever_spans.append(api_span)
             elif isinstance(span, ToolSpan):
@@ -1575,9 +1898,30 @@ def execute_agentic_test_cases_from_loop(
             else:
                 trace_api.base_spans.append(api_span)
 
+            # Skip errored trace/span
+            if _skip_metrics_for_error(span=span, trace=current_trace):
+                api_span.status = TraceSpanApiStatus.ERRORED
+                api_span.error = span.error or _trace_error(
+                    current_trace
+                )
+                if progress and pbar_eval_id is not None:
+                    update_pbar(
+                        progress,
+                        pbar_eval_id,
+                        advance=count_metrics_in_span_subtree(span),
+                    )
+                return
+
             for child in span.children:
                 dfs(child, progress, pbar_eval_id)
 
+            if not span.metrics:
+                return
+
+            requires_trace = any(
+                metric.requires_trace for metric in metrics
+            )
+
             llm_test_case = None
             if span.input is not None:
                 llm_test_case = LLMTestCase(
@@ -1593,20 +1937,29 @@ def execute_agentic_test_cases_from_loop(
                     tools_called=span.tools_called,
                     expected_tools=span.expected_tools,
                 )
-            if span.metrics is None or llm_test_case is None:
-                return
 
-
-                isinstance(metric, TaskCompletionMetric)
-                for metric in metrics
-            )
-
-            if has_task_completion:
+            if requires_trace:
                 if llm_test_case is None:
                     llm_test_case = LLMTestCase(input="None")
                 llm_test_case._trace_dict = (
                     trace_manager.create_nested_spans_dict(span)
                 )
+            else:
+                if llm_test_case is None:
+                    api_span.status = TraceSpanApiStatus.ERRORED
+                    api_span.error = format_error_text(
+                        DeepEvalError(
+                            "Span has metrics but no LLMTestCase. "
+                            "Are you sure you called `update_current_span()`?"
+                        )
+                    )
+                    if progress and pbar_eval_id is not None:
+                        update_pbar(
+                            progress,
+                            pbar_eval_id,
+                            advance=count_metrics_in_span_subtree(span),
+                        )
+                    return
 
             # Preparing metric calculation
             api_span.metrics_data = []
@@ -1650,77 +2003,123 @@ def execute_agentic_test_cases_from_loop(
                 start_time = time.perf_counter()
 
                 # Handle trace-level metrics
-
-
-
-
-
-
-
-
-
-                        input=str(current_trace.input),
-                        actual_output=(
-                            str(current_trace.output)
-                            if current_trace.output is not None
-                            else None
+                skip_metrics_for_this_golden = False
+                if _skip_metrics_for_error(trace=current_trace):
+                    trace_api.status = TraceSpanApiStatus.ERRORED
+                    if progress and pbar_eval_id is not None:
+                        update_pbar(
+                            progress,
+                            pbar_eval_id,
+                            advance=count_total_metrics_for_trace(
+                                current_trace
                             ),
-                        expected_output=current_trace.expected_output,
-                        context=current_trace.context,
-                        retrieval_context=current_trace.retrieval_context,
-                        tools_called=current_trace.tools_called,
-                        expected_tools=current_trace.expected_tools,
                         )
-
-
-
+                else:
+                    if current_trace.metrics:
+                        requires_trace = any(
+                            metric.requires_trace
+                            for metric in current_trace.metrics
                         )
 
-
-                    if
-                    llm_test_case = LLMTestCase(
-
-
-
+                        llm_test_case = None
+                        if current_trace.input:
+                            llm_test_case = LLMTestCase(
+                                input=str(current_trace.input),
+                                actual_output=(
+                                    str(current_trace.output)
+                                    if current_trace.output is not None
+                                    else None
+                                ),
+                                expected_output=current_trace.expected_output,
+                                context=current_trace.context,
+                                retrieval_context=current_trace.retrieval_context,
+                                tools_called=current_trace.tools_called,
+                                expected_tools=current_trace.expected_tools,
                             )
-                        )
-
-                    for metric in current_trace.metrics:
-                        metric.skipped = False
-                        metric.error = None
-                        if display_config.verbose_mode is not None:
-                            metric.verbose_mode = display_config.verbose_mode
-
-                    trace_api.metrics_data = []
-                    for metric in current_trace.metrics:
-                        res = _execute_metric(
-                            metric=metric,
-                            test_case=llm_test_case,
-                            show_metric_indicator=show_metric_indicator,
-                            in_component=True,
-                            error_config=error_config,
-                        )
-                        if res == "skip":
-                            continue
-
-                        if not metric.skipped:
-                            metric_data = create_metric_data(metric)
-                            trace_api.metrics_data.append(metric_data)
-                            api_test_case.update_metric_data(metric_data)
-                            api_test_case.update_status(metric_data.success)
-                            update_pbar(progress, pbar_eval_id)
 
-
-
-
-
-
-
-
-
-
+                        if requires_trace:
+                            if llm_test_case is None:
+                                llm_test_case = LLMTestCase(input="None")
+                            llm_test_case._trace_dict = (
+                                trace_manager.create_nested_spans_dict(
+                                    current_trace.root_spans[0]
+                                )
+                            )
+                        else:
+                            if llm_test_case is None:
+                                current_trace.status = TraceSpanStatus.ERRORED
+                                trace_api.status = TraceSpanApiStatus.ERRORED
+                                if current_trace.root_spans:
+                                    current_trace.root_spans[0].status = (
+                                        TraceSpanStatus.ERRORED
+                                    )
+                                    current_trace.root_spans[0].error = (
+                                        format_error_text(
+                                            DeepEvalError(
+                                                "Trace has metrics but no LLMTestCase (missing input/output). "
+                                                "Are you sure you called `update_current_trace()`?"
+                                            )
+                                        )
+                                    )
+                                if progress and pbar_eval_id is not None:
+                                    update_pbar(
+                                        progress,
+                                        pbar_eval_id,
+                                        advance=count_total_metrics_for_trace(
+                                            current_trace
+                                        ),
+                                    )
+                                skip_metrics_for_this_golden = True
+
+                        if not skip_metrics_for_this_golden:
+                            for metric in current_trace.metrics:
+                                metric.skipped = False
+                                metric.error = None
+                                if display_config.verbose_mode is not None:
+                                    metric.verbose_mode = (
+                                        display_config.verbose_mode
+                                    )
+
+                            trace_api.metrics_data = []
+                            for metric in current_trace.metrics:
+                                res = _execute_metric(
+                                    metric=metric,
+                                    test_case=llm_test_case,
+                                    show_metric_indicator=show_metric_indicator,
+                                    in_component=True,
+                                    error_config=error_config,
+                                )
+                                if res == "skip":
+                                    continue
+
+                                if not metric.skipped:
+                                    metric_data = create_metric_data(metric)
+                                    trace_api.metrics_data.append(metric_data)
+                                    api_test_case.update_metric_data(
+                                        metric_data
+                                    )
+                                    api_test_case.update_status(
+                                        metric_data.success
+                                    )
+                                    update_pbar(progress, pbar_eval_id)
+
+                    # Then handle span-level metrics
+                    dfs(current_trace.root_spans[0], progress, pbar_eval_id)
+
+                end_time = time.perf_counter()
+                run_duration = end_time - start_time
+                # Update test run
+                api_test_case.update_run_duration(run_duration)
+                test_run_manager.update_test_run(api_test_case, test_case)
+                main_result = create_test_result(api_test_case)
+                trace_results = extract_trace_test_results(trace_api)
+                unique_trace_results = filter_duplicate_results(
+                    main_result, trace_results
+                )
+                test_results.append(main_result)
+                test_results.extend(unique_trace_results)
 
-
+                update_pbar(progress, pbar_id)
 
     try:
         if display_config.show_indicator and _use_bar_indicator:
@@ -1748,6 +2147,7 @@ def execute_agentic_test_cases_from_loop(
     local_trace_manager.evaluating = False
     local_trace_manager.traces_to_evaluate_order.clear()
     local_trace_manager.traces_to_evaluate.clear()
+    local_trace_manager.trace_uuid_to_golden.clear()
 
 
 def a_execute_agentic_test_cases_from_loop(
@@ -1820,39 +2220,137 @@ def a_execute_agentic_test_cases_from_loop(
         }
 
         def on_task_done(t: asyncio.Task):
+            cancelled = False
+            exc = None
+            trace = None
+            root = None
+            resolved_trace_from_task = False
+            resolved_root_from_task = False
+
+            # Task.exception() raises CancelledError if task was cancelled
+            try:
+                exc = t.exception()
+            except asyncio.CancelledError:
+                cancelled = True
+                exc = None
+
+            meta = task_meta.get(t, {})
+            golden_index = meta.get("golden_index")
+
+            if golden_index is not None and 0 <= golden_index < len(
+                goldens
+            ):
+                golden = goldens[golden_index]
+
+            def _mark_trace_error(trace, root, msg: str):
+                now = time.perf_counter()
+                trace.status = TraceSpanStatus.ERRORED
+                # Close the trace so the API layer has a proper endTime
+                if trace.end_time is None:
+                    trace.end_time = now
+                if root:
+                    root.status = TraceSpanStatus.ERRORED
+                    root.error = msg
+                    if root.end_time is None:
+                        root.end_time = now
+
+            if exc is not None:
+                msg = format_error_text(exc)
+                trace, root = _resolve_trace_and_root_for_task(t)
+                resolved_trace_from_task = bool(trace)
+                resolved_root_from_task = bool(root)
+                if trace:
+                    _mark_trace_error(trace, root, msg)
+                else:
+                    for (
+                        trace
+                    ) in trace_manager.integration_traces_to_evaluate:
+                        if (
+                            trace_manager.trace_uuid_to_golden.get(
+                                trace.uuid
+                            )
+                            is golden
+                        ):
+                            root = _pick_root_for_marking(trace)
+                            _mark_trace_error(trace, root, msg)
+                            break
+
+            elif cancelled or t.cancelled():
+                cancel_exc = DeepEvalError(
+                    "Task was cancelled (likely due to timeout)."
+                )
+                msg = format_error_text(cancel_exc)
+                trace, root = _resolve_trace_and_root_for_task(t)
+                resolved_trace_from_task = bool(trace)
+                resolved_root_from_task = bool(root)
+                if trace:
+                    _mark_trace_error(trace, root, msg)
+                else:
+                    for (
+                        trace
+                    ) in trace_manager.integration_traces_to_evaluate:
+                        if (
+                            trace_manager.trace_uuid_to_golden.get(
+                                trace.uuid
+                            )
+                            is golden
+                        ):
+                            root = _pick_root_for_marking(trace)
+                            _mark_trace_error(trace, root, msg)
+                            break
+
             if get_settings().DEEPEVAL_DEBUG_ASYNC:
                 # Using info level here to make it easy to spot these logs.
-
-                meta = task_meta.get(t, {})
+                golden_name = meta.get("golden_name")
                 duration = time.perf_counter() - meta.get(
                     "started", started
                 )
 
-                if
+                if cancelled or exc is not None:
+                    if not resolved_trace_from_task:
+                        logger.warning(
+                            "[deepeval] on_task_done: no binding for task; falling back to golden->trace. task=%s golden=%r",
+                            t.get_name(),
+                            golden_name,
+                        )
+                    elif not resolved_root_from_task:
+                        logger.warning(
+                            "[deepeval] on_task_done: bound trace found but no bound root; using heuristic. task=%s trace=%s",
+                            t.get_name(),
+                            trace.uuid,
+                        )
+
+                if cancelled:
                     logger.info(
                         "[deepeval] task CANCELLED %s after %.2fs meta=%r",
                         t.get_name(),
                         duration,
                         meta,
                     )
+                elif exc is not None:
+                    logger.error(
+                        "[deepeval] task ERROR %s after %.2fs meta=%r",
+                        t.get_name(),
+                        duration,
+                        meta,
+                        exc_info=(
+                            type(exc),
+                            exc,
+                            getattr(exc, "__traceback__", None),
+                        ),
+                    )
                 else:
-
-
-
-
-
-
-                    meta,
-                    exc_info=(type(exc), exc, exc.__traceback__),
-                )
-            else:
-                logger.info(
-                    "[deepeval] task OK %s after %.2fs meta={'golden_index': %r}",
-                    t.get_name(),
-                    duration,
-                    meta.get("golden_index"),
-                )
+                    logger.info(
+                        "[deepeval] task OK %s after %.2fs meta={'golden_index': %r}",
+                        t.get_name(),
+                        duration,
+                        meta.get("golden_index"),
+                    )
 
+            try:
+                trace_manager.task_bindings.pop(t, None)
+            except Exception:
+                pass
             update_pbar(progress, pbar_callback_id)
             update_pbar(progress, pbar_id)
 
@@ -1897,6 +2395,7 @@ def a_execute_agentic_test_cases_from_loop(
                     timeout=_gather_timeout(),
                 )
             )
+
         except asyncio.TimeoutError:
             import traceback
 
@@ -1950,12 +2449,12 @@ def a_execute_agentic_test_cases_from_loop(
             return
 
         try:
+            current_tasks = set()
            # Find tasks that were created during this run but we didn’t track
            current_tasks = loop.run_until_complete(_snapshot_tasks())
        except RuntimeError:
            # this might happen if the loop is already closing
-
-            return
+            pass
 
        leftovers = [
            t
@@ -1965,33 +2464,32 @@ def a_execute_agentic_test_cases_from_loop(
            and not t.done()
        ]
 
-        if not leftovers:
-            return
-
        if get_settings().DEEPEVAL_DEBUG_ASYNC:
-
-
-
-
+            if len(leftovers) > 0:
+                logger.warning(
+                    "[deepeval] %d stray task(s) not tracked; cancelling...",
+                    len(leftovers),
+                )
            for t in leftovers:
                meta = task_meta.get(t, {})
                name = t.get_name()
                logger.warning(" - STRAY %s meta=%s", name, meta)
 
-
-        t
+        if leftovers:
+            for t in leftovers:
+                t.cancel()
 
-
-
-
-
-            )
-        except RuntimeError:
-            # If the loop is closing here, just continue
-            if get_settings().DEEPEVAL_DEBUG_ASYNC:
-                logger.warning(
-                    "[deepeval] failed to drain stray tasks because loop is closing"
+            # Drain strays so they don’t leak into the next iteration
+            try:
+                loop.run_until_complete(
+                    asyncio.gather(*leftovers, return_exceptions=True)
                 )
+            except RuntimeError:
+                # If the loop is closing here, just continue
+                if get_settings().DEEPEVAL_DEBUG_ASYNC:
+                    logger.warning(
+                        "[deepeval] failed to drain stray tasks because loop is closing"
+                    )
 
        # Evaluate traces
        if trace_manager.traces_to_evaluate:
@@ -2014,25 +2512,6 @@ def a_execute_agentic_test_cases_from_loop(
                    pbar_id=pbar_id,
                )
            )
-        elif openai_test_case_pairs:
-            loop.run_until_complete(
-                _evaluate_test_case_pairs(
-                    test_case_pairs=openai_test_case_pairs,
-                    test_run=test_run,
-                    test_run_manager=test_run_manager,
-                    test_results=test_results,
-                    ignore_errors=error_config.ignore_errors,
-                    skip_on_missing_params=error_config.skip_on_missing_params,
-                    show_indicator=display_config.show_indicator,
-                    verbose_mode=display_config.verbose_mode,
-                    throttle_value=async_config.throttle_value,
-                    max_concurrent=async_config.max_concurrent,
-                    _use_bar_indicator=_use_bar_indicator,
-                    _is_assert_test=_is_assert_test,
-                    progress=progress,
-                    pbar_id=pbar_id,
-                )
-            )
        elif trace_manager.integration_traces_to_evaluate:
            loop.run_until_complete(
                _a_evaluate_traces(
@@ -2106,6 +2585,7 @@ def a_execute_agentic_test_cases_from_loop(
     local_trace_manager.evaluating = False
     local_trace_manager.traces_to_evaluate_order.clear()
     local_trace_manager.traces_to_evaluate.clear()
+    local_trace_manager.trace_uuid_to_golden.clear()
 
 
 async def _a_evaluate_traces(
@@ -2129,11 +2609,32 @@ async def _a_evaluate_traces(
 
     async def execute_evals_with_semaphore(func: Callable, *args, **kwargs):
         async with semaphore:
-            return await
+            return await asyncio.wait_for(
+                func(*args, **kwargs),
+                timeout=_per_task_timeout(),
+            )
 
     eval_tasks = []
-
-
+    # Here, we will work off a fixed-set copy to avoid surprises from potential
+    # mid-iteration mutation
+    traces_snapshot = list(traces_to_evaluate or [])
+
+    for count, trace in enumerate(traces_snapshot):
+        # Prefer the explicit mapping from trace -> golden captured at trace creation.
+        golden = trace_manager.trace_uuid_to_golden.get(trace.uuid)
+        if not golden:
+            # trace started during evaluation_loop but the CURRENT_GOLDEN was
+            # not set for some reason. We can’t map it to a golden, so the best
+            # we can do is skip evaluation for this trace.
+            if (
+                logger.isEnabledFor(logging.DEBUG)
+                and get_settings().DEEPEVAL_VERBOSE_MODE
+            ):
+                logger.debug(
+                    "Skipping trace %s: no golden association found during evaluation_loop ",
+                    trace.uuid,
+                )
+            continue
        with capture_evaluation_run("golden"):
            task = execute_evals_with_semaphore(
                func=_a_execute_agentic_test_case,
@@ -2154,7 +2655,18 @@ async def _a_evaluate_traces(
            )
            eval_tasks.append(asyncio.create_task(task))
            await asyncio.sleep(throttle_value)
-
+
+    try:
+        await asyncio.wait_for(
+            asyncio.gather(*eval_tasks),
+            timeout=_gather_timeout(),
+        )
+    except asyncio.TimeoutError:
+        for t in eval_tasks:
+            if not t.done():
+                t.cancel()
+        await asyncio.gather(*eval_tasks, return_exceptions=True)
+        raise
 
 
 async def _evaluate_test_case_pairs(
@@ -2177,7 +2689,10 @@ async def _evaluate_test_case_pairs(
 
     async def execute_with_semaphore(func: Callable, *args, **kwargs):
         async with semaphore:
-            return await
+            return await asyncio.wait_for(
+                func(*args, **kwargs),
+                timeout=_per_task_timeout(),
+            )
 
     tasks = []
     for count, test_case_pair in enumerate(test_case_pairs):
@@ -2210,7 +2725,19 @@ async def _evaluate_test_case_pairs(
        )
        tasks.append(asyncio.create_task(task))
        await asyncio.sleep(throttle_value)
-
+
+    try:
+        await asyncio.wait_for(
+            asyncio.gather(*tasks),
+            timeout=_gather_timeout(),
+        )
+    except asyncio.TimeoutError:
+        # Cancel any still-pending tasks and drain them
+        for t in tasks:
+            if not t.done():
+                t.cancel()
+        await asyncio.gather(*tasks, return_exceptions=True)
+        raise
 
 
 def _execute_metric(
@@ -2225,13 +2752,14 @@ def _execute_metric(
            test_case,
            _show_indicator=show_metric_indicator,
            _in_component=in_component,
+            _log_metric_to_confident=False,
        )
    except MissingTestCaseParamsError as e:
        if error_config.skip_on_missing_params:
            return "skip"
        else:
            if error_config.ignore_errors:
-                metric.error =
+                metric.error = format_error_text(e)
                metric.success = False
            else:
                raise
@@ -2243,19 +2771,54 @@ def _execute_metric(
                return "skip"
            else:
                if error_config.ignore_errors:
-                    metric.error =
+                    metric.error = format_error_text(e)
                    metric.success = False
                else:
                    raise
        except Exception as e:
            if error_config.ignore_errors:
-                metric.error =
+                metric.error = format_error_text(e)
                metric.success = False
            else:
                raise
    except Exception as e:
        if error_config.ignore_errors:
-            metric.error =
+            metric.error = format_error_text(e)
            metric.success = False
        else:
            raise
+
+
+def log_prompt(
+    llm_span: LlmSpan,
+    test_run_manager: TestRunManager,
+):
+    prompt = llm_span.prompt
+    if prompt is None:
+        return
+
+    span_hyperparameters = {}
+    prompt_version = prompt.version if is_confident() else None
+    key = f"{prompt.alias}_{prompt_version}"
+    span_hyperparameters[key] = prompt
+
+    test_run = test_run_manager.get_test_run()
+    if test_run.prompts is None:
+        test_run.prompts = []
+    if test_run.hyperparameters is None:
+        test_run.hyperparameters = {}
+
+    if key not in test_run.hyperparameters:
+        test_run.hyperparameters.update(
+            process_hyperparameters(span_hyperparameters, False)
+        )
+    existing_prompt_keys = {
+        f"{p.alias}_{p.version}" for p in test_run.prompts
+    }
+    new_prompts = process_prompts(span_hyperparameters)
+    for new_prompt in new_prompts:
+        new_prompt_key = f"{new_prompt.alias}_{new_prompt.version}"
+        if new_prompt_key not in existing_prompt_keys:
+            test_run.prompts.append(new_prompt)
+
+    global_test_run_manager.save_test_run(TEMP_FILE_PATH)