deepeval 3.6.8__py3-none-any.whl → 3.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/anthropic/__init__.py +19 -0
- deepeval/anthropic/extractors.py +94 -0
- deepeval/anthropic/patch.py +169 -0
- deepeval/anthropic/utils.py +225 -0
- deepeval/benchmarks/drop/drop.py +40 -14
- deepeval/benchmarks/ifeval/ifeval.py +2 -2
- deepeval/confident/types.py +4 -2
- deepeval/config/settings.py +258 -47
- deepeval/config/settings_manager.py +4 -0
- deepeval/config/utils.py +5 -0
- deepeval/dataset/dataset.py +162 -30
- deepeval/dataset/utils.py +41 -13
- deepeval/evaluate/execute.py +1099 -633
- deepeval/integrations/crewai/handler.py +36 -0
- deepeval/integrations/langchain/callback.py +27 -2
- deepeval/integrations/llama_index/handler.py +58 -4
- deepeval/integrations/llama_index/utils.py +24 -0
- deepeval/metrics/__init__.py +5 -0
- deepeval/metrics/exact_match/__init__.py +0 -0
- deepeval/metrics/exact_match/exact_match.py +94 -0
- deepeval/metrics/indicator.py +21 -1
- deepeval/metrics/pattern_match/__init__.py +0 -0
- deepeval/metrics/pattern_match/pattern_match.py +103 -0
- deepeval/metrics/task_completion/task_completion.py +9 -2
- deepeval/model_integrations/__init__.py +0 -0
- deepeval/model_integrations/utils.py +116 -0
- deepeval/models/base_model.py +3 -1
- deepeval/models/llms/amazon_bedrock_model.py +20 -17
- deepeval/models/llms/openai_model.py +10 -1
- deepeval/models/retry_policy.py +103 -20
- deepeval/openai/__init__.py +3 -1
- deepeval/openai/extractors.py +2 -2
- deepeval/openai/utils.py +7 -31
- deepeval/prompt/api.py +11 -10
- deepeval/prompt/prompt.py +5 -4
- deepeval/simulator/conversation_simulator.py +25 -18
- deepeval/synthesizer/chunking/context_generator.py +9 -1
- deepeval/telemetry.py +3 -3
- deepeval/test_case/llm_test_case.py +3 -2
- deepeval/test_run/api.py +3 -2
- deepeval/test_run/cache.py +4 -3
- deepeval/test_run/test_run.py +24 -5
- deepeval/tracing/api.py +11 -10
- deepeval/tracing/otel/exporter.py +11 -0
- deepeval/tracing/patchers.py +102 -1
- deepeval/tracing/trace_context.py +13 -4
- deepeval/tracing/tracing.py +10 -1
- deepeval/tracing/types.py +8 -8
- deepeval/tracing/utils.py +9 -0
- deepeval/utils.py +44 -2
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/METADATA +2 -2
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/RECORD +57 -47
- /deepeval/{openai → model_integrations}/types.py +0 -0
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/WHEEL +0 -0
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/entry_points.txt +0 -0
deepeval/evaluate/execute.py
CHANGED
@@ -1,3 +1,4 @@
+import inspect
 import logging
 
 from rich.progress import (
@@ -56,10 +57,16 @@ from deepeval.metrics import (
     BaseMetric,
     BaseConversationalMetric,
     BaseMultimodalMetric,
+    TaskCompletionMetric,
 )
 from deepeval.metrics.indicator import (
     measure_metrics_with_indicator,
 )
+from deepeval.models.retry_policy import (
+    set_outer_deadline,
+    reset_outer_deadline,
+    run_sync_with_timeout,
+)
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
@@ -238,6 +245,18 @@ def filter_duplicate_results(
     ]


+async def _await_with_outer_deadline(obj, *args, timeout: float, **kwargs):
+    token = set_outer_deadline(timeout)
+    try:
+        if inspect.isawaitable(obj):
+            coro = obj
+        else:
+            coro = obj(*args, **kwargs)
+        return await asyncio.wait_for(coro, timeout=timeout)
+    finally:
+        reset_outer_deadline(token)
+
+
 ###########################################
 ### E2E Evals #############################
 ###########################################
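The helper above pairs `asyncio.wait_for` with the `set_outer_deadline` / `reset_outer_deadline` helpers imported from `deepeval.models.retry_policy`. A minimal sketch of how such a deadline token could be propagated with `contextvars` is shown below — an illustration only; apart from the two helper names, everything here is an assumption, not deepeval's actual implementation:

```python
# Illustrative sketch only — deepeval's real retry_policy module may differ.
import contextvars
from typing import Optional

_OUTER_DEADLINE: contextvars.ContextVar[Optional[float]] = contextvars.ContextVar(
    "outer_deadline", default=None
)


def set_outer_deadline(timeout: float) -> contextvars.Token:
    # Record the deadline so nested retry logic can stop early instead of
    # retrying past the caller's timeout.
    return _OUTER_DEADLINE.set(timeout)


def reset_outer_deadline(token: contextvars.Token) -> None:
    # Restore whatever deadline (if any) was in effect before.
    _OUTER_DEADLINE.reset(token)
```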
@@ -269,6 +288,13 @@ def execute_test_cases(

     test_run_manager.save_to_disk = cache_config.write_cache
     test_run = test_run_manager.get_test_run(identifier=identifier)
+    if test_run is None:
+        # ensure we have a test_run (in case it couldn't be loaded from disk)
+        test_run_manager.create_test_run(identifier=identifier)
+        test_run = test_run_manager.get_test_run(identifier=identifier)
+
+    # capture once for inner closures
+    hyperparameters = test_run.hyperparameters if test_run is not None else None

     if display_config.verbose_mode is not None:
         for metric in metrics:
@@ -289,176 +315,228 @@ def execute_test_cases(
     test_results: List[TestResult] = []

     def evaluate_test_cases(
-        progress: Optional[Progress] = None, pbar_id: Optional[
+        progress: Optional[Progress] = None, pbar_id: Optional[int] = None
     ):
         llm_test_case_count = -1
+        mllm_test_case_count = -1
         conversational_test_case_count = -1
         show_metric_indicator = (
             display_config.show_indicator and not _use_bar_indicator
         )
         for i, test_case in enumerate(test_cases):
+            # skip what we know we won't run
+            if isinstance(test_case, LLMTestCase):
+                if not llm_metrics:
+                    update_pbar(progress, pbar_id)
+                    continue
+                per_case_total = len(llm_metrics)
+            elif isinstance(test_case, MLLMTestCase):
+                if not mllm_metrics:
+                    update_pbar(progress, pbar_id)
+                    continue
+                per_case_total = len(mllm_metrics)
+            elif isinstance(test_case, ConversationalTestCase):
+                if not conversational_metrics:
+                    update_pbar(progress, pbar_id)
+                    continue
+                per_case_total = len(conversational_metrics)
+
             pbar_test_case_id = add_pbar(
                 progress,
                 f" 🎯 Evaluating test case #{i}",
-                total=
+                total=per_case_total,
             )
-            with capture_evaluation_run("test case"):
-                for metric in metrics:
-                    metric.error = None  # Reset metric error

+            metrics_for_case = (
+                llm_metrics
+                if isinstance(test_case, LLMTestCase)
+                else (
+                    mllm_metrics
+                    if isinstance(test_case, MLLMTestCase)
+                    else conversational_metrics
+                )
+            )
+            api_test_case = create_api_test_case(
+                test_case=test_case,
+                index=(
+                    llm_test_case_count + 1
+                    if isinstance(test_case, LLMTestCase)
+                    else (
+                        mllm_test_case_count + 1
+                        if isinstance(test_case, MLLMTestCase)
+                        else conversational_test_case_count + 1
                     )
-                if cached_metric_data:
-                    metric_data = cached_metric_data.metric_data
-                if metric_data is None:
-                    read_all_metrics_from_cache = False
-                    res = _execute_metric(
-                        metric=metric,
-                        test_case=test_case,
-                        show_metric_indicator=show_metric_indicator,
-                        in_component=False,
-                        error_config=error_config,
-                    )
-                    if res == "skip":
-                        continue
-                    metric_data = create_metric_data(metric)
-                # here, we will check for an additional property on the flattened test cases to see if updating is necessary
-                api_test_case.update_metric_data(metric_data)
-                if metric.error is None:
-                    cache_metric_data = deepcopy(metric_data)
-                    cache_metric_data.evaluation_cost = 0  # Cached metrics will have evaluation cost as 0, not None.
-                    updated_cached_metric_data = CachedMetricData(
-                        metric_data=cache_metric_data,
-                        metric_configuration=Cache.create_metric_configuration(
-                            metric
-                        ),
-                    )
-                    new_cached_test_case.cached_metrics_data.append(
-                        updated_cached_metric_data
-                    )
-                update_pbar(progress, pbar_test_case_id)
+                ),
+            )
+            emitted = [False] * len(metrics_for_case)
+            index_of = {id(m): i for i, m in enumerate(metrics_for_case)}
+            current_index = -1
+            start_time = time.perf_counter()
+            deadline_timeout = _per_task_timeout()
+            deadline_token = set_outer_deadline(deadline_timeout)
+            new_cached_test_case: CachedTestCase = None
+            try:

+                def _run_case():
+                    nonlocal new_cached_test_case, current_index, llm_test_case_count, mllm_test_case_count, conversational_test_case_count
+                    with capture_evaluation_run("test case"):
+                        for metric in metrics:
+                            metric.error = None  # Reset metric error
+
+                        if isinstance(test_case, LLMTestCase):
+                            llm_test_case_count += 1
+                            cached_test_case = None
+                            if cache_config.use_cache:
+                                cached_test_case = global_test_run_cache_manager.get_cached_test_case(
+                                    test_case, hyperparameters
+                                )

+                            ##### Metric Calculation #####
+                            new_cached_test_case = CachedTestCase()

+                            for metric in llm_metrics:
+                                current_index = index_of[id(metric)]
+                                metric_data = None
+                                if cached_test_case is not None:
+                                    cached_metric_data = Cache.get_metric_data(
+                                        metric, cached_test_case
+                                    )
+                                    if cached_metric_data:
+                                        metric_data = (
+                                            cached_metric_data.metric_data
+                                        )

+                                if metric_data is None:
+                                    res = _execute_metric(
+                                        metric=metric,
+                                        test_case=test_case,
+                                        show_metric_indicator=show_metric_indicator,
+                                        in_component=False,
+                                        error_config=error_config,
+                                    )
+                                    if res == "skip":
+                                        continue
+                                    metric_data = create_metric_data(metric)

+                                # here, we will check for an additional property on the flattened test cases to see if updating is necessary
+                                api_test_case.update_metric_data(metric_data)
+                                emitted[current_index] = True
+                                if metric.error is None:
+                                    cache_metric_data = deepcopy(metric_data)
+                                    cache_metric_data.evaluation_cost = 0  # Cached metrics will have evaluation cost as 0, not None.
+                                    updated_cached_metric_data = CachedMetricData(
+                                        metric_data=cache_metric_data,
+                                        metric_configuration=Cache.create_metric_configuration(
+                                            metric
+                                        ),
+                                    )
+                                    new_cached_test_case.cached_metrics_data.append(
+                                        updated_cached_metric_data
+                                    )
+                                update_pbar(progress, pbar_test_case_id)

+                        # No caching and not sending test cases to Confident AI for multimodal metrics yet
+                        elif isinstance(test_case, MLLMTestCase):
+                            mllm_test_case_count += 1
+                            for metric in mllm_metrics:
+                                current_index = index_of[id(metric)]
+                                res = _execute_metric(
+                                    metric=metric,
+                                    test_case=test_case,
+                                    show_metric_indicator=show_metric_indicator,
+                                    in_component=False,
+                                    error_config=error_config,
+                                )
+                                if res == "skip":
+                                    continue

+                                metric_data = create_metric_data(metric)
+                                api_test_case.update_metric_data(metric_data)
+                                emitted[current_index] = True
+                                update_pbar(progress, pbar_test_case_id)

+                        # No caching for conversational metrics yet
+                        elif isinstance(test_case, ConversationalTestCase):
+                            conversational_test_case_count += 1
+                            for metric in conversational_metrics:
+                                current_index = index_of[id(metric)]
+                                res = _execute_metric(
+                                    metric=metric,
+                                    test_case=test_case,
+                                    show_metric_indicator=show_metric_indicator,
+                                    in_component=False,
+                                    error_config=error_config,
+                                )
+                                if res == "skip":
+                                    continue

+                                metric_data = create_metric_data(metric)
+                                api_test_case.update_metric_data(metric_data)
+                                emitted[current_index] = True
+                                update_pbar(progress, pbar_test_case_id)
+
+                run_sync_with_timeout(_run_case, deadline_timeout)
+            except (asyncio.TimeoutError, TimeoutError):
+                msg = (
+                    f"Timed out after {deadline_timeout:.2f}s while evaluating metric. "
+                    "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+                    "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+                )
+                for i, m in enumerate(metrics_for_case):
+                    if getattr(m, "skipped", False):
+                        continue
+                    # already finished or errored? leave it
+                    if getattr(m, "success", None) is not None or getattr(
+                        m, "error", None
+                    ):
                         continue
+                    if i == current_index:
+                        m.success = False
+                        m.error = msg
+                    elif i > current_index:
+                        m.success = False
+                        m.error = "Skipped due to case timeout."
+
+                if not error_config.ignore_errors:
+                    raise

+            finally:
+                try:
+                    if (
+                        isinstance(test_case, LLMTestCase)
+                        and new_cached_test_case is not None
+                    ):
+                        ### Cache Test Run ###
+                        global_test_run_cache_manager.cache_test_case(
+                            test_case,
+                            new_cached_test_case,
+                            hyperparameters,
                         )
-                        metric=metric,
-                        test_case=test_case,
-                        show_metric_indicator=show_metric_indicator,
-                        in_component=False,
-                        error_config=error_config,
+                        global_test_run_cache_manager.cache_test_case(
+                            test_case,
+                            new_cached_test_case,
+                            hyperparameters,
+                            to_temp=True,
                         )
-                    if res == "skip":
-                        continue
-                    metric_data = create_metric_data(metric)
-                    api_test_case.update_metric_data(metric_data)
-                    update_pbar(progress, pbar_test_case_id)

+                    # Attach MetricData for *all* metrics (finished or synthesized)
+                    for i, m in enumerate(metrics_for_case):
+                        if getattr(m, "skipped", False):
+                            continue
+                        if not emitted[i]:
+                            api_test_case.update_metric_data(
+                                create_metric_data(m)
+                            )

+                    elapsed = time.perf_counter() - start_time
+                    api_test_case.update_run_duration(
+                        elapsed if elapsed >= 0 else deadline_timeout
+                    )
                     test_run_manager.update_test_run(api_test_case, test_case)
+                    test_results.append(create_test_result(api_test_case))
+                    update_pbar(progress, pbar_id)
+                finally:
+                    reset_outer_deadline(deadline_token)

         if display_config.show_indicator and _use_bar_indicator:
             progress = Progress(
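`run_sync_with_timeout` is only imported and called in this file; its body lives in `deepeval/models/retry_policy.py` and is not shown in this diff. A plausible reading, sketched under the assumption that it runs the callable in a worker thread and raises `TimeoutError` when the deadline passes (assumed behavior, not deepeval's actual code):

```python
# Assumed behavior of run_sync_with_timeout — illustration only.
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import TimeoutError as FutureTimeout


def run_sync_with_timeout(fn, timeout: float):
    pool = ThreadPoolExecutor(max_workers=1)
    future = pool.submit(fn)
    try:
        return future.result(timeout=timeout)
    except FutureTimeout as exc:
        raise TimeoutError(f"timed out after {timeout:.2f}s") from exc
    finally:
        # don't block on the worker thread; let it finish in the background
        pool.shutdown(wait=False)
```

This matches how the caller above treats it: the `except (asyncio.TimeoutError, TimeoutError)` block marks the in-flight metric as failed and later metrics as skipped, rather than killing the worker.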
@@ -503,9 +581,9 @@ async def a_execute_test_cases(

     async def execute_with_semaphore(func: Callable, *args, **kwargs):
         async with semaphore:
-            timeout=
+            timeout = _per_task_timeout()
+            return await _await_with_outer_deadline(
+                func, *args, timeout=timeout, **kwargs
             )

     global_test_run_cache_manager.disable_write_cache = (
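The pattern above bounds concurrency with an `asyncio.Semaphore` while still giving each task its own deadline. A standalone illustration of the same idea (not deepeval API; names and values here are made up for the example):

```python
# Semaphore-plus-deadline pattern, reduced to a runnable toy example.
import asyncio


async def main():
    semaphore = asyncio.Semaphore(2)  # at most 2 evaluations in flight

    async def bounded(coro_fn, *args, timeout=30.0):
        async with semaphore:
            # each task gets its own per-task timeout
            return await asyncio.wait_for(coro_fn(*args), timeout=timeout)

    async def fake_metric(i):
        await asyncio.sleep(0.1)
        return i

    results = await asyncio.gather(*(bounded(fake_metric, i) for i in range(5)))
    print(results)


asyncio.run(main())
```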
@@ -609,7 +687,7 @@ async def a_execute_test_cases(

                 task = execute_with_semaphore(
                     func=_a_execute_conversational_test_cases,
-                    metrics=copy_metrics(
+                    metrics=copy_metrics(conversational_metrics),
                     test_case=test_case,
                     test_run_manager=test_run_manager,
                     test_results=test_results,
@@ -631,13 +709,15 @@ async def a_execute_test_cases(
                 asyncio.gather(*tasks),
                 timeout=_gather_timeout(),
             )
-        except asyncio.TimeoutError:
-            # Cancel any still-pending tasks and drain them
+        except (asyncio.TimeoutError, TimeoutError):
             for t in tasks:
                 if not t.done():
                     t.cancel()
             await asyncio.gather(*tasks, return_exceptions=True)
+            logging.getLogger("deepeval").error(
+                "Gather timed out after %.1fs. Some metrics may be marked as timed out.",
+                _gather_timeout(),
+            )

     else:
         for test_case in test_cases:
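The hunk above uses the usual cancel-and-drain pattern: after the overall gather deadline fires, still-pending tasks are cancelled and then re-gathered with `return_exceptions=True` so the `CancelledError`s are consumed instead of surfacing later. A minimal reproduction of that pattern (illustration only, not deepeval code):

```python
# Cancel-and-drain after an overall gather timeout.
import asyncio


async def main():
    tasks = [asyncio.create_task(asyncio.sleep(t)) for t in (0.1, 5, 5)]
    try:
        await asyncio.wait_for(asyncio.gather(*tasks), timeout=0.5)
    except (asyncio.TimeoutError, TimeoutError):
        for t in tasks:
            if not t.done():
                t.cancel()
        # drain so CancelledError is consumed here instead of bubbling up later
        await asyncio.gather(*tasks, return_exceptions=True)


asyncio.run(main())
```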
@@ -717,7 +797,7 @@ async def a_execute_test_cases(
                 asyncio.gather(*tasks),
                 timeout=_gather_timeout(),
             )
-        except asyncio.TimeoutError:
+        except (asyncio.TimeoutError, TimeoutError):
             # Cancel any still-pending tasks and drain them
             for t in tasks:
                 if not t.done():
@@ -744,6 +824,7 @@ async def _a_execute_llm_test_cases(
     progress: Optional[Progress] = None,
     pbar_id: Optional[int] = None,
 ):
+    logger.info("in _a_execute_llm_test_cases")
     pbar_test_case_id = add_pbar(
         progress,
         f" 🎯 Evaluating test case #{count}",
@@ -767,64 +848,85 @@ async def _a_execute_llm_test_cases(
     api_test_case = create_api_test_case(
         test_case=test_case, index=count if not _is_assert_test else None
     )
-    await measure_metrics_with_indicator(
-        metrics=metrics,
-        test_case=test_case,
-        cached_test_case=cached_test_case,
-        skip_on_missing_params=skip_on_missing_params,
-        ignore_errors=ignore_errors,
-        show_indicator=show_metrics_indicator,
-        pbar_eval_id=pbar_test_case_id,
-        progress=progress,
-    )
+    try:
+        new_cached_test_case: CachedTestCase = CachedTestCase()
+        test_start_time = time.perf_counter()

+        await measure_metrics_with_indicator(
+            metrics=metrics,
+            test_case=test_case,
+            cached_test_case=cached_test_case,
+            skip_on_missing_params=skip_on_missing_params,
+            ignore_errors=ignore_errors,
+            show_indicator=show_metrics_indicator,
+            pbar_eval_id=pbar_test_case_id,
+            progress=progress,
+        )
+    except asyncio.CancelledError:
+        msg = (
+            "Timed out/cancelled while evaluating metric. "
+            "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+            "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+        )
+        for m in metrics:
+            if getattr(m, "skipped", False):
+                continue
+            # If the task never finished and didn't set a terminal state, mark it now
+            if getattr(m, "success", None) is None and not getattr(
+                m, "error", None
+            ):
+                m.success = False
+                m.error = msg
+        if not ignore_errors:
+            raise
+    finally:
+        for metric in metrics:
+            if metric.skipped:
+                continue

+            metric_data = create_metric_data(metric)
+            api_test_case.update_metric_data(metric_data)

+            if metric.error is None:
+                cache_metric_data = deepcopy(metric_data)
+                cache_metric_data.evaluation_cost = (
+                    0  # Create new copy and save 0 for cost
+                )
+                updated_cached_metric_data = CachedMetricData(
+                    metric_data=cache_metric_data,
+                    metric_configuration=Cache.create_metric_configuration(
+                        metric
+                    ),
+                )
+                new_cached_test_case.cached_metrics_data.append(
+                    updated_cached_metric_data
+                )

-        ### Cache Test Run ###
-        global_test_run_cache_manager.cache_test_case(
-            test_case,
-            new_cached_test_case,
-            test_run.hyperparameters,
-        )
-        global_test_run_cache_manager.cache_test_case(
-            test_case,
-            new_cached_test_case,
-            test_run.hyperparameters,
-            to_temp=True,
-        )
+        test_end_time = time.perf_counter()
+        run_duration = test_end_time - test_start_time
+        # Quick hack to check if all metrics were from cache
+        if run_duration < 1:
+            run_duration = 0
+        api_test_case.update_run_duration(run_duration)
+
+        ### Update Test Run ###
+        test_run_manager.update_test_run(api_test_case, test_case)

+        ### Cache Test Run ###
+        global_test_run_cache_manager.cache_test_case(
+            test_case,
+            new_cached_test_case,
+            test_run.hyperparameters,
+        )
+        global_test_run_cache_manager.cache_test_case(
+            test_case,
+            new_cached_test_case,
+            test_run.hyperparameters,
+            to_temp=True,
+        )
+
+        test_results.append(create_test_result(api_test_case))
+        update_pbar(progress, pbar_id)


 async def _a_execute_mllm_test_cases(
@@ -856,31 +958,50 @@ async def _a_execute_mllm_test_cases(
         test_case=test_case, index=count if not _is_assert_test else None
     )
     test_start_time = time.perf_counter()
+    try:
+        await measure_metrics_with_indicator(
+            metrics=metrics,
+            test_case=test_case,
+            cached_test_case=None,
+            skip_on_missing_params=skip_on_missing_params,
+            ignore_errors=ignore_errors,
+            show_indicator=show_metrics_indicator,
+            pbar_eval_id=pbar_test_case_id,
+            progress=progress,
+        )
+    except asyncio.CancelledError:
+        msg = (
+            "Timed out/cancelled while evaluating metric. "
+            "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+            "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+        )
+        for m in metrics:
+            if getattr(m, "skipped", False):
+                continue
+            # If the task never finished and didn't set a terminal state, mark it now
+            if getattr(m, "success", None) is None and not getattr(
+                m, "error", None
+            ):
+                m.success = False
+                m.error = msg
+        if not ignore_errors:
+            raise
+    finally:
+        for metric in metrics:
+            if metric.skipped:
+                continue

+            metric_data = create_metric_data(metric)
+            api_test_case.update_metric_data(metric_data)

+        test_end_time = time.perf_counter()
+        run_duration = test_end_time - test_start_time
+        api_test_case.update_run_duration(run_duration)

+        ### Update Test Run ###
+        test_run_manager.update_test_run(api_test_case, test_case)
+        test_results.append(create_test_result(api_test_case))
+        update_pbar(progress, pbar_id)


 async def _a_execute_conversational_test_cases(
@@ -915,33 +1036,55 @@ async def _a_execute_conversational_test_cases(
     )

     test_start_time = time.perf_counter()
-    await measure_metrics_with_indicator(
-        metrics=metrics,
-        test_case=test_case,
-        cached_test_case=None,
-        skip_on_missing_params=skip_on_missing_params,
-        ignore_errors=ignore_errors,
-        show_indicator=show_metrics_indicator,
-        pbar_eval_id=pbar_test_case_id,
-        progress=progress,
-    )
-    for metric in metrics:
-        if metric.skipped:
-            continue

+    try:
+        await measure_metrics_with_indicator(
+            metrics=metrics,
+            test_case=test_case,
+            cached_test_case=None,
+            skip_on_missing_params=skip_on_missing_params,
+            ignore_errors=ignore_errors,
+            show_indicator=show_metrics_indicator,
+            pbar_eval_id=pbar_test_case_id,
+            progress=progress,
+        )

+    except asyncio.CancelledError:
+        msg = (
+            "Timed out/cancelled while evaluating metric. "
+            "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+            "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+        )
+        for m in metrics:
+            if getattr(m, "skipped", False):
+                continue
+            # If the task never finished and didn't set a terminal state, mark it now
+            if getattr(m, "success", None) is None and not getattr(
+                m, "error", None
+            ):
+                m.success = False
+                m.error = msg
+        if not ignore_errors:
+            raise

+    finally:
+        for metric in metrics:
+            if metric.skipped:
+                continue
+
+            metric_data = create_metric_data(metric)
+            api_test_case.update_metric_data(metric_data)
+
+        test_end_time = time.perf_counter()
+        if len(metrics) > 0:
+            run_duration = test_end_time - test_start_time
+            api_test_case.update_run_duration(run_duration)

+        ### Update Test Run ###
+        test_run_manager.update_test_run(api_test_case, test_case)
+
+        test_results.append(create_test_result(api_test_case))
+        update_pbar(progress, pbar_id)


 ###########################################
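The timeout messages above point users at `DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE`, and the code calls a `_per_task_timeout()` helper that already exists in this module. A hedged sketch of how such a helper might read the override is shown below; the default value and parsing are assumptions for illustration, not taken from deepeval:

```python
# Hypothetical sketch of a per-task timeout helper with an env override.
import os

_DEFAULT_PER_TASK_TIMEOUT = 600.0  # assumed default, not deepeval's actual value


def _per_task_timeout() -> float:
    raw = os.getenv("DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE")
    if raw:
        try:
            return float(raw)
        except ValueError:
            pass  # fall back to the default on a malformed override
    return _DEFAULT_PER_TASK_TIMEOUT
```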
@@ -965,7 +1108,11 @@ def execute_agentic_test_cases(
     test_run_manager = global_test_run_manager

     test_run_manager.save_to_disk = cache_config.write_cache
-    test_run_manager.get_test_run(identifier=identifier)
+    test_run = test_run_manager.get_test_run(identifier=identifier)
+    if test_run is None:
+        # Create if not found
+        test_run_manager.create_test_run(identifier=identifier)
+        test_run = test_run_manager.get_test_run(identifier=identifier)

     local_trace_manager = trace_manager
     local_trace_manager.evaluating = True
@@ -975,152 +1122,137 @@ def execute_agentic_test_cases(
         progress: Optional[Progress] = None,
         pbar_id: Optional[int] = None,
     ):
-        count =
+        count = -1
         show_metric_indicator = (
             display_config.show_indicator and not _use_bar_indicator
         )

         for golden in goldens:
-            count += 1
-            total_tags = count_observe_decorators_in_module(
-                observed_callback
-            )
-            pbar_tags_id = add_pbar(
-                progress,
-                f" ⚡ Invoking observed callback (#{count})",
-                total=total_tags,
-            )
-            with Observer(
-                "custom",
-                func_name="Test Wrapper",
-                _progress=progress,
-                _pbar_callback_id=pbar_tags_id,
-            ):
-                if asyncio.iscoroutinefunction(observed_callback):
-                    loop = get_or_create_event_loop()
-                    coro = observed_callback(golden.input)
-                    loop.run_until_complete(
-                        asyncio.wait_for(
-                            coro,
-                            timeout=_per_task_timeout(),
-                        )
-                    )
-                else:
-                    observed_callback(golden.input)
-                current_trace: Trace = current_trace_context.get()
-
-            update_pbar(progress, pbar_tags_id, advance=total_tags)
-            update_pbar(progress, pbar_id)
-
-            # Create empty trace api for llm api test case
-            trace_api = create_api_trace(current_trace, golden)
-
-            # Format golden as test case to create llm api test case
-            test_case = LLMTestCase(
-                input=golden.input,
-                actual_output=(
-                    str(current_trace.output)
-                    if current_trace.output is not None
-                    else None
-                ),
-                expected_output=current_trace.expected_output,
-                context=current_trace.context,
-                retrieval_context=current_trace.retrieval_context,
-                additional_metadata=golden.additional_metadata,
-                tools_called=current_trace.tools_called,
-                expected_tools=current_trace.expected_tools,
-                comments=golden.comments,
-                name=golden.name,
-                _dataset_alias=golden._dataset_alias,
-                _dataset_id=golden._dataset_id,
-            )
-            api_test_case = create_api_test_case(
-                test_case=test_case,
-                trace=trace_api,
-                index=count if not _is_assert_test else None,
-            )
+            count += 1

+            pbar_case_increments = (
+                0  # tracks how many times we advance `pbar_id` for this golden
+            )
+            emitted_trace = set()
+            current_trace: Optional[Trace] = None
+            trace_api = None
+            api_test_case = None
+            test_case = None
+
+            def _run_golden():
+                nonlocal current_trace, trace_api, api_test_case, test_case, pbar_case_increments
+                # keep the evaluation context inside the timed function
+                with capture_evaluation_run("golden"):
+                    total_tags = count_observe_decorators_in_module(
+                        observed_callback
+                    )
+                    pbar_tags_id = add_pbar(
+                        progress,
+                        f" ⚡ Invoking observed callback (#{count})",
+                        total=total_tags,
                     )

-                    api_span.status = TraceSpanApiStatus.ERRORED
-                    api_span.error = span.error or _trace_error(
-                        current_trace
-                    )
-                    if progress and pbar_eval_id is not None:
-                        update_pbar(
-                            progress,
-                            pbar_eval_id,
-                            advance=count_metrics_in_span_subtree(span),
+                    with Observer(
+                        "custom",
+                        func_name="Test Wrapper",
+                        _progress=progress,
+                        _pbar_callback_id=pbar_tags_id,
+                    ):
+                        if asyncio.iscoroutinefunction(observed_callback):
+                            loop = get_or_create_event_loop()
+                            coro = observed_callback(golden.input)
+                            loop.run_until_complete(
+                                _await_with_outer_deadline(
+                                    coro,
+                                    timeout=_per_task_timeout(),
+                                )
                             )
+                        else:
+                            observed_callback(golden.input)

+                        # we have a trace now
+                        current_trace = current_trace_context.get()

+                    update_pbar(progress, pbar_tags_id, advance=total_tags)
+                    update_pbar(progress, pbar_id)
+                    pbar_case_increments += 1
+
+                    # Create empty trace api for llm api test case
+                    trace_api = create_api_trace(current_trace, golden)
+
+                    # Build the test case and api test case
+                    test_case = LLMTestCase(
+                        input=golden.input,
+                        actual_output=(
+                            str(current_trace.output)
+                            if current_trace
+                            and current_trace.output is not None
+                            else None
+                        ),
+                        expected_output=(
+                            current_trace.expected_output
+                            if current_trace
+                            else None
+                        ),
+                        context=(
+                            current_trace.context if current_trace else None
+                        ),
+                        retrieval_context=(
+                            current_trace.retrieval_context
+                            if current_trace
+                            else None
+                        ),
+                        additional_metadata=golden.additional_metadata,
+                        tools_called=(
+                            current_trace.tools_called
+                            if current_trace
+                            else None
+                        ),
+                        expected_tools=(
+                            current_trace.expected_tools
+                            if current_trace
+                            else None
+                        ),
+                        comments=golden.comments,
+                        name=golden.name,
+                        _dataset_alias=golden._dataset_alias,
+                        _dataset_id=golden._dataset_id,
+                    )
+                    api_test_case = create_api_test_case(
+                        test_case=test_case,
+                        trace=trace_api,
+                        index=count if not _is_assert_test else None,
                     )

-                    expected_output=span.expected_output,
-                    context=span.context,
-                    retrieval_context=span.retrieval_context,
-                    tools_called=span.tools_called,
-                    expected_tools=span.expected_tools,
+                    # DFS and trace metric evaluation
+                    def dfs(
+                        span: BaseSpan,
+                        progress: Optional[Progress] = None,
+                        pbar_eval_id: Optional[int] = None,
+                    ):
+                        metrics: List[BaseMetric] = list(span.metrics or [])
+                        api_span: BaseApiSpan = (
+                            trace_manager._convert_span_to_api_span(span)
                         )

+                        if isinstance(span, AgentSpan):
+                            trace_api.agent_spans.append(api_span)
+                        elif isinstance(span, LlmSpan):
+                            trace_api.llm_spans.append(api_span)
+                            log_prompt(span, test_run_manager)
+                        elif isinstance(span, RetrieverSpan):
+                            trace_api.retriever_spans.append(api_span)
+                        elif isinstance(span, ToolSpan):
+                            trace_api.tool_spans.append(api_span)
+                        else:
+                            trace_api.base_spans.append(api_span)
+
+                        if _skip_metrics_for_error(
+                            span=span, trace=current_trace
+                        ):
                             api_span.status = TraceSpanApiStatus.ERRORED
-                            api_span.error = (
-                                "Span has metrics but no LLMTestCase. "
-                                "Are you sure you called `update_current_span()`?"
-                            )
+                            api_span.error = span.error or _trace_error(
+                                current_trace
                             )
                             if progress and pbar_eval_id is not None:
                                 update_pbar(
@@ -1130,155 +1262,386 @@ def execute_agentic_test_cases(
                                 )
                             return

-                    for metric in metrics:
-                        metric.skipped = False
-                        metric.error = None
-                        if display_config.verbose_mode is not None:
-                            metric.verbose_mode = display_config.verbose_mode
-
-                    # Metric calculation
-                    for metric in metrics:
-                        metric_data = None
-                        res = _execute_metric(
-                            metric=metric,
-                            test_case=llm_test_case,
-                            show_metric_indicator=show_metric_indicator,
-                            in_component=True,
-                            error_config=error_config,
-                        )
-                        if res == "skip":
-                            continue
-                        metric_data = create_metric_data(metric)
-                        api_span.metrics_data.append(metric_data)
-                        api_test_case.update_status(metric_data.success)
-                        update_pbar(progress, pbar_eval_id)
-
-                    trace_level_metrics_count = (
-                        len(current_trace.metrics) if current_trace.metrics else 0
-                    )
-                    pbar_eval_id = add_pbar(
-                        progress,
-                        f" 🎯 Evaluating component(s) (#{count})",
-                        total=count_metrics_in_trace(trace=current_trace)
-                        + trace_level_metrics_count,
-                    )
-
-                    start_time = time.perf_counter()
+                        # evaluate children first
+                        for child in span.children:
+                            dfs(child, progress, pbar_eval_id)

-                            pbar_eval_id,
-                            advance=count_total_metrics_for_trace(
-                                current_trace
-                            ),
+                        # If there are no metrics, then there is nothing to do on this span.
+                        if not metrics:
+                            return
+
+                        has_task_completion = any(
+                            isinstance(metric, TaskCompletionMetric)
+                            for metric in metrics
                         )
-                    if current_trace.metrics:
+
                         requires_trace = any(
+                            getattr(metric, "requires_trace", False)
+                            for metric in metrics
                         )

                         llm_test_case = None
+                        if span.input is not None:
                             llm_test_case = LLMTestCase(
+                                input=str(span.input),
                                 actual_output=(
+                                    str(span.output)
+                                    if span.output is not None
                                     else None
                                 ),
+                                expected_output=span.expected_output,
+                                context=span.context,
+                                retrieval_context=span.retrieval_context,
+                                tools_called=span.tools_called,
+                                expected_tools=span.expected_tools,
                             )
+
+                        # If any metric needs a trace tree or a completion verdict, attach the trace
+                        if has_task_completion or requires_trace:
                             if llm_test_case is None:
                                 llm_test_case = LLMTestCase(input="None")
                             llm_test_case._trace_dict = (
-                                trace_manager.create_nested_spans_dict(
-                                    current_trace.root_spans[0]
-                                )
+                                trace_manager.create_nested_spans_dict(span)
                             )
                         else:
+                            # Without a test case we cannot evaluate span metrics
                             if llm_test_case is None:
-                                current_trace.root_spans[0].error = (
-                                    format_error_text(
-                                        DeepEvalError(
-                                            "Trace has metrics but no LLMTestCase (missing input/output). "
-                                            "Are you sure you called `update_current_trace()`?"
-                                        )
-                                    )
+                                api_span.status = TraceSpanApiStatus.ERRORED
+                                api_span.error = format_error_text(
+                                    DeepEvalError(
+                                        "Span has metrics but no LLMTestCase. "
+                                        "Are you sure you called `update_current_span()`?"
                                     )
+                                )
                                 if progress and pbar_eval_id is not None:
                                     update_pbar(
                                         progress,
                                         pbar_eval_id,
-                                        advance=
+                                        advance=count_metrics_in_span_subtree(
+                                            span
                                         ),
                                     )
+                                return
+
+                        # Preparing metric calculation
+                        api_span.metrics_data = []
+                        for metric in metrics:
+                            metric.skipped = False
+                            metric.error = None
+                            if display_config.verbose_mode is not None:
+                                metric.verbose_mode = (
+                                    display_config.verbose_mode
+                                )

+                        # Metric calculation
+                        for metric in metrics:
+                            res = _execute_metric(
+                                metric=metric,
+                                test_case=llm_test_case,
+                                show_metric_indicator=show_metric_indicator,
+                                in_component=True,
+                                error_config=error_config,
+                            )
+                            if res == "skip":
+                                continue
+                            metric_data = create_metric_data(metric)
+                            api_span.metrics_data.append(metric_data)
+                            api_test_case.update_status(metric_data.success)
+                            update_pbar(progress, pbar_eval_id)
+
+                    trace_level_metrics_count = (
+                        len(current_trace.metrics)
+                        if current_trace and current_trace.metrics
+                        else 0
+                    )
+                    pbar_eval_id = add_pbar(
+                        progress,
+                        f" 🎯 Evaluating component(s) (#{count})",
+                        total=count_metrics_in_trace(trace=current_trace)
+                        + trace_level_metrics_count,
+                    )
+
+                    start_time = time.perf_counter()
+
+                    skip_metrics_for_this_golden = False
+                    if _skip_metrics_for_error(trace=current_trace):
+                        trace_api.status = TraceSpanApiStatus.ERRORED
+                        if progress and pbar_eval_id is not None:
+                            update_pbar(
+                                progress,
+                                pbar_eval_id,
+                                advance=count_total_metrics_for_trace(
+                                    current_trace
+                                ),
+                            )
+                    else:
+                        if current_trace and current_trace.metrics:
+                            has_task_completion = any(
+                                isinstance(metric, TaskCompletionMetric)
+                                for metric in current_trace.metrics
+                            )
+                            requires_trace = any(
+                                getattr(metric, "requires_trace", False)
+                                for metric in current_trace.metrics
+                            )
+                            llm_test_case = None
+                            if current_trace.input:
+                                llm_test_case = LLMTestCase(
+                                    input=str(current_trace.input),
+                                    actual_output=(
+                                        str(current_trace.output)
+                                        if current_trace.output is not None
+                                        else None
+                                    ),
+                                    expected_output=current_trace.expected_output,
+                                    context=current_trace.context,
+                                    retrieval_context=current_trace.retrieval_context,
+                                    tools_called=current_trace.tools_called,
+                                    expected_tools=current_trace.expected_tools,
+                                )
+                            if has_task_completion or requires_trace:
+                                if llm_test_case is None:
+                                    llm_test_case = LLMTestCase(input="None")
+                                llm_test_case._trace_dict = (
+                                    trace_manager.create_nested_spans_dict(
+                                        current_trace.root_spans[0]
+                                    )
+                                )
+                            else:
+                                if llm_test_case is None:
+                                    current_trace.status = (
+                                        TraceSpanStatus.ERRORED
+                                    )
+                                    trace_api.status = (
+                                        TraceSpanApiStatus.ERRORED
                                     )
+                                    if current_trace.root_spans:
+                                        current_trace.root_spans[0].status = (
+                                            TraceSpanStatus.ERRORED
+                                        )
+                                        current_trace.root_spans[0].error = (
+                                            format_error_text(
+                                                DeepEvalError(
+                                                    "Trace has metrics but no LLMTestCase (missing input/output). "
+                                                    "Are you sure you called `update_current_trace()`?"
+                                                )
+                                            )
+                                        )
+                                    if progress and pbar_eval_id is not None:
+                                        update_pbar(
+                                            progress,
+                                            pbar_eval_id,
+                                            advance=count_total_metrics_for_trace(
+                                                current_trace
+                                            ),
+                                        )
+                                    skip_metrics_for_this_golden = True
+
+                            if not skip_metrics_for_this_golden:
+                                for metric in current_trace.metrics:
+                                    metric.skipped = False
+                                    metric.error = None
+                                    if display_config.verbose_mode is not None:
+                                        metric.verbose_mode = (
+                                            display_config.verbose_mode
+                                        )

+                                trace_api.metrics_data = []
+                                for metric in current_trace.metrics:
+                                    res = _execute_metric(
+                                        metric=metric,
+                                        test_case=llm_test_case,
+                                        show_metric_indicator=show_metric_indicator,
+                                        in_component=True,
+                                        error_config=error_config,
+                                    )
+                                    if res == "skip":
+                                        continue
+
+                                    if not metric.skipped:
+                                        metric_data = create_metric_data(metric)
+                                        trace_api.metrics_data.append(
+                                            metric_data
+                                        )
+                                        api_test_case.update_metric_data(
+                                            metric_data
+                                        )
+                                        api_test_case.update_status(
+                                            metric_data.success
+                                        )
+                                        emitted_trace.add(id(metric))
+                                    update_pbar(progress, pbar_eval_id)
+
+                        # handle span metrics
+                        dfs(
+                            current_trace.root_spans[0],
+                            progress,
+                            pbar_eval_id,
+                        )
+
+                    # TODO: Do I need this block, or is it duplicated in finally?
+                    end_time = time.perf_counter()
+                    run_duration = end_time - start_time
+                    api_test_case.update_run_duration(run_duration)
+                    test_run_manager.update_test_run(api_test_case, test_case)
+                    test_results.append(create_test_result(api_test_case))
+                    test_results.extend(extract_trace_test_results(trace_api))
+                    update_pbar(progress, pbar_id)
+                    pbar_case_increments += 1
+
+            # run the golden with a timeout
+            start_time = time.perf_counter()
+            deadline = _per_task_timeout()
+
+            try:
+                run_sync_with_timeout(_run_golden, deadline)
+            except (asyncio.TimeoutError, TimeoutError):
+                # mark any not yet finished trace level and span level metrics as timed out.
+                msg = (
+                    f"Timed out after {deadline:.2f}s while executing agentic test case. "
+                    "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+                    "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+                )
+
+                if current_trace is not None:
+                    # Trace-level metrics
+                    if getattr(current_trace, "metrics", None):
+                        for m in current_trace.metrics:
+                            if getattr(m, "skipped", False):
+                                continue
+                            # if already has a terminal state, leave it alone
+                            if getattr(
+                                m, "success", None
+                            ) is not None or getattr(m, "error", None):
+                                continue
+                            m.success = False
+                            m.error = msg
+
+                    # span level metrics, walk the tree
+                    def _walk(span):
+                        for child in getattr(span, "children", []) or []:
+                            _walk(child)
+                        for m in list(getattr(span, "metrics", []) or []):
+                            if getattr(m, "skipped", False):
+                                continue
+                            if getattr(
+                                m, "success", None
+                            ) is not None or getattr(m, "error", None):
+                                continue
+                            m.success = False
+                            m.error = msg
+
+                    for root in getattr(current_trace, "root_spans", []) or []:
+                        _walk(root)
+
+                # raise if we are not ignoring errors
+                if not error_config.ignore_errors:
+                    raise
+
+            finally:
+                try:
+                    # Ensure we have an api_test_case to attach results to.
+                    if api_test_case is None:
+                        # build a minimal test_case
+                        if test_case is None:
+                            out = (
+                                str(current_trace.output)
+                                if (
+                                    current_trace is not None
+                                    and current_trace.output is not None
                                 )
+                                else None
+                            )
+                            test_case = LLMTestCase(
+                                input=golden.input,
+                                actual_output=out,
+                                expected_output=(
+                                    current_trace.expected_output
+                                    if current_trace
+                                    else None
+                                ),
+                                context=(
+                                    current_trace.context
+                                    if current_trace
+                                    else None
+                                ),
+                                retrieval_context=(
+                                    current_trace.retrieval_context
+                                    if current_trace
+                                    else None
+                                ),
+                                additional_metadata=golden.additional_metadata,
+                                tools_called=(
+                                    current_trace.tools_called
+                                    if current_trace
+                                    else None
+                                ),
+                                expected_tools=(
+                                    current_trace.expected_tools
+                                    if current_trace
+                                    else None
+                                ),
+                                comments=golden.comments,
+                                name=golden.name,
+                                _dataset_alias=golden._dataset_alias,
+                                _dataset_id=golden._dataset_id,
+                            )
+
+                    # Create a trace API if we have a trace
+                    if trace_api is None and current_trace is not None:
+                        trace_api = create_api_trace(current_trace, golden)
+
+                        api_test_case = create_api_test_case(
+                            test_case=test_case,
+                            trace=trace_api,
+                            index=count if not _is_assert_test else None,
+                        )
+
+                    if test_run is not None:
+                        test_run_manager.set_test_run(test_run)
+
+                    if api_test_case.success is None:
+                        api_test_case.update_status(False)
+
+                    # try to update metric data
+                    if current_trace is not None:
+                        if current_trace.metrics:
+                            for m in current_trace.metrics:
+                                if getattr(m, "skipped", False):
+                                    continue
+                                if id(m) in emitted_trace:
                                     continue
+                                api_test_case.update_metric_data(
+                                    create_metric_data(m)
+                                )

-                        api_test_case.update_status(
-                            metric_data.success
-                        )
-                        update_pbar(progress, pbar_eval_id)
+                    # Finalize duration and persist
+                    elapsed = time.perf_counter() - start_time
+                    api_test_case.update_run_duration(
+                        elapsed if elapsed >= 0 else deadline
+                    )

+                    if (
+                        api_test_case.metrics_data == []
+                        and api_test_case.trace is None
+                    ):
+                        api_test_case.metrics_data = None

-            # Update test run
-            api_test_case.update_run_duration(run_duration)
-            test_run_manager.update_test_run(api_test_case, test_case)
-            test_results.append(create_test_result(api_test_case))
-            test_results.extend(extract_trace_test_results(trace_api))
+                    test_run_manager.update_test_run(api_test_case, test_case)
+                    test_results.append(create_test_result(api_test_case))

+                    if trace_api is not None:
+                        test_results.extend(
+                            extract_trace_test_results(trace_api)
+                        )
+
+                    missing = 2 - pbar_case_increments
+                    if missing > 0:
+                        update_pbar(progress, pbar_id, advance=missing)
+
+                finally:
+                    # nothing to clean here, but keep symmetry with other paths
+                    pass

         if display_config.show_indicator and _use_bar_indicator:
             progress = Progress(
@@ -1319,9 +1682,9 @@ async def a_execute_agentic_test_cases(

     async def execute_with_semaphore(func: Callable, *args, **kwargs):
         async with semaphore:
-            timeout=
+            timeout = _per_task_timeout()
+            return await _await_with_outer_deadline(
+                func, *args, timeout=timeout, **kwargs
             )

     test_run_manager = global_test_run_manager
@@ -1374,7 +1737,7 @@ async def a_execute_agentic_test_cases(
                 asyncio.gather(*tasks),
                 timeout=_gather_timeout(),
             )
-        except asyncio.TimeoutError:
+        except (asyncio.TimeoutError, TimeoutError):
             # Cancel any still-pending tasks and drain them
             for t in tasks:
                 if not t.done():
@@ -1426,94 +1789,89 @@ async def _a_execute_agentic_test_case(
     progress: Optional[Progress] = None,
     pbar_id: Optional[int] = None,
 ):
-
-
-
-
-
-
-
+    test_start_time = time.perf_counter()
+    current_trace = None
+    trace_api = None
+    test_case = None
+    api_test_case = None
+    try:
+        if observed_callback:
+            total_tags = count_observe_decorators_in_module(observed_callback)
+            pbar_tags_id = add_pbar(
+                progress,
+                f" ⚡ Invoking observed callback (#{count})",
+                total=total_tags,
+            )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Call callback and extract trace
+            with Observer(
+                "custom",
+                func_name="Test Wrapper",
+                _progress=progress,
+                _pbar_callback_id=pbar_tags_id,
+            ):
+                # get current_trace right away, we need it even if cancelled
+                current_trace: Trace = current_trace_context.get()
+                if asyncio.iscoroutinefunction(observed_callback):
+                    await _await_with_outer_deadline(
+                        observed_callback,
+                        golden.input,
+                        timeout=_per_task_timeout(),
+                    )
+                else:
+                    observed_callback(golden.input)
 
-
-
+            update_pbar(progress, pbar_tags_id, advance=total_tags)
+            update_pbar(progress, pbar_id)
 
-
-
+        elif trace:
+            current_trace = trace
 
-
-            current_trace.metrics = trace_metrics
+        trace_level_metrics_count = 0
 
-
-
+        if trace_metrics:
+            current_trace.metrics = trace_metrics
 
-
-
-            )
+        # run evals through DFS
+        trace_api = create_api_trace(trace=current_trace, golden=golden)
 
-
-
-
-            total=count_metrics_in_trace(trace=current_trace)
-            + trace_level_metrics_count,
-        )
+        trace_level_metrics_count = (
+            len(current_trace.metrics) if current_trace.metrics else 0
+        )
 
-
-
-
-
-
-
-            ),
-            expected_output=current_trace.expected_output,
-            context=current_trace.context,
-            retrieval_context=current_trace.retrieval_context,
-            tools_called=current_trace.tools_called,
-            expected_tools=current_trace.expected_tools,
-            additional_metadata=golden.additional_metadata,
-            comments=golden.comments,
-            name=golden.name,
-            _dataset_alias=golden._dataset_alias,
-            _dataset_id=golden._dataset_id,
-        )
-        api_test_case = create_api_test_case(
-            test_case=test_case,
-            trace=trace_api,
-            index=count if not _is_assert_test else None,
-        )
+        pbar_eval_id = add_pbar(
+            progress,
+            f" 🎯 Evaluating component(s) (#{count})",
+            total=count_metrics_in_trace(trace=current_trace)
+            + trace_level_metrics_count,
+        )
 
-
-
-
-
-
-
-
-
-
-
-
-
+        test_case = LLMTestCase(
+            input=golden.input,
+            actual_output=(
+                str(current_trace.output)
+                if current_trace.output is not None
+                else None
+            ),
+            expected_output=current_trace.expected_output,
+            context=current_trace.context,
+            retrieval_context=current_trace.retrieval_context,
+            tools_called=current_trace.tools_called,
+            expected_tools=current_trace.expected_tools,
+            additional_metadata=golden.additional_metadata,
+            comments=golden.comments,
+            name=golden.name,
+            _dataset_alias=golden._dataset_alias,
+            _dataset_id=golden._dataset_id,
+        )
+        api_test_case = create_api_test_case(
+            test_case=test_case,
+            trace=trace_api,
+            index=count if not _is_assert_test else None,
+        )
 
-
-
-            span=span,
-            current_trace=trace,
+        await _a_execute_trace_test_case(
+            trace=current_trace,
             trace_api=trace_api,
             api_test_case=api_test_case,
             ignore_errors=ignore_errors,
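The rewritten setup above dispatches on whether the observed callback is a coroutine function: only the async path goes through the outer deadline, while a plain function is invoked inline. A reduced sketch of that dispatch, again with `asyncio.wait_for` standing in for `_await_with_outer_deadline`:

    import asyncio
    from typing import Any, Callable

    async def invoke_callback(callback: Callable[[str], Any], prompt: str,
                              timeout: float) -> None:
        if asyncio.iscoroutinefunction(callback):
            # Async callbacks run under a deadline.
            await asyncio.wait_for(callback(prompt), timeout=timeout)
        else:
            # Sync callbacks execute inline and block the event loop, so an
            # asyncio timeout cannot interrupt them mid-call.
            callback(prompt)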
@@ -1522,56 +1880,150 @@ async def _a_execute_agentic_test_case(
             verbose_mode=verbose_mode,
             progress=progress,
             pbar_eval_id=pbar_eval_id,
-            test_run_manager=test_run_manager,
             _use_bar_indicator=_use_bar_indicator,
         )
 
-
-
+        async def dfs(trace: Trace, span: BaseSpan):
+            await _a_execute_span_test_case(
+                span=span,
+                current_trace=trace,
+                trace_api=trace_api,
+                api_test_case=api_test_case,
+                ignore_errors=ignore_errors,
+                skip_on_missing_params=skip_on_missing_params,
+                show_indicator=show_indicator,
+                verbose_mode=verbose_mode,
+                progress=progress,
+                pbar_eval_id=pbar_eval_id,
+                test_run_manager=test_run_manager,
+                _use_bar_indicator=_use_bar_indicator,
+            )
 
-
-
-            ]
-            if child_tasks:
-                try:
-                    await asyncio.wait_for(
-                        asyncio.gather(*child_tasks),
-                        timeout=_gather_timeout(),
-                    )
-                except asyncio.TimeoutError:
-                    for t in child_tasks:
-                        if not t.done():
-                            t.cancel()
-                    await asyncio.gather(*child_tasks, return_exceptions=True)
-                    raise
+            if _skip_metrics_for_error(span=span, trace=trace):
+                return
 
-
+            child_tasks = [
+                asyncio.create_task(dfs(trace, child))
+                for child in span.children
+            ]
+            if child_tasks:
+                try:
+                    await asyncio.wait_for(
+                        asyncio.gather(*child_tasks),
+                        timeout=_gather_timeout(),
+                    )
+                except (asyncio.TimeoutError, TimeoutError):
+                    for t in child_tasks:
+                        if not t.done():
+                            t.cancel()
+                    await asyncio.gather(*child_tasks, return_exceptions=True)
+                    raise
 
-
-
-
-
-
-
-
-
-
-
-
+        if not _skip_metrics_for_error(trace=current_trace):
+            if current_trace and current_trace.root_spans:
+                await dfs(current_trace, current_trace.root_spans[0])
+            else:
+                if (
+                    logger.isEnabledFor(logging.DEBUG)
+                    and get_settings().DEEPEVAL_VERBOSE_MODE
+                ):
+                    logger.debug(
+                        "Skipping DFS: empty trace or no root spans (trace=%s)",
+                        current_trace.uuid if current_trace else None,
+                    )
+    except asyncio.CancelledError:
+        # mark any unfinished metrics as cancelled
+        cancel_msg = (
+            "Timed out/cancelled while evaluating agentic test case. "
+            "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
+            "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+        )
+
+        if trace_metrics:
+            for m in trace_metrics:
+                if getattr(m, "skipped", False):
+                    continue
+                if getattr(m, "success", None) is None and not getattr(
+                    m, "error", None
+                ):
+                    m.success = False
+                    m.error = cancel_msg
+
+        if trace is not None and trace.metrics:
+            for m in trace.metrics:
+                if getattr(m, "skipped", False):
+                    continue
+                if getattr(m, "success", None) is None and not getattr(
+                    m, "error", None
+                ):
+                    m.success = False
+                    m.error = cancel_msg
+        if not ignore_errors:
+            raise
+    finally:
+        try:
+            if api_test_case is None:
+                if test_case is None:
+                    test_case = LLMTestCase(
+                        input=golden.input,
+                        actual_output=None,
+                        expected_output=None,
+                        context=None,
+                        retrieval_context=None,
+                        additional_metadata=golden.additional_metadata,
+                        tools_called=None,
+                        expected_tools=None,
+                        comments=golden.comments,
+                        name=golden.name,
+                        _dataset_alias=golden._dataset_alias,
+                        _dataset_id=golden._dataset_id,
+                    )
+                if trace is not None and trace_api is None:
+                    trace_api = create_api_trace(trace, golden)
+
+                api_test_case = create_api_test_case(
+                    test_case=test_case,
+                    trace=trace_api,
+                    index=(count if not _is_assert_test else None),
                 )
 
-
-
+            # attach MetricData for any trace metrics we marked above
+            if trace_metrics:
+                for m in trace_metrics:
+                    if getattr(m, "skipped", False):
+                        continue
+                    api_test_case.update_metric_data(create_metric_data(m))
+
+            # If nothing set success yet, mark the case failed
+            if api_test_case.success is None:
+                api_test_case.update_status(False)
 
-
-
-
-
-        unique_trace_results = filter_duplicate_results(main_result, trace_results)
-        test_results.append(main_result)
-        test_results.extend(unique_trace_results)
+            # test_run_manager.update_test_run returns early if api_test_case.metrics_data is an empty list.
+            # Set it to None to ensure the test_case is added
+            if api_test_case.metrics_data == [] and api_test_case.trace is None:
+                api_test_case.metrics_data = None
 
-
+            # Duration & persist
+            test_end_time = time.perf_counter()
+            run_duration = test_end_time - test_start_time
+            api_test_case.update_run_duration(run_duration)
+            test_run_manager.update_test_run(api_test_case, test_case)
+
+            # Build results and de-duplicate against trace results
+            main_result = create_test_result(api_test_case)
+            trace_results = (
+                extract_trace_test_results(trace_api)
+                if trace_api is not None
+                else []
+            )
+            unique_trace_results = filter_duplicate_results(
+                main_result, trace_results
+            )
+            test_results.append(main_result)
+            test_results.extend(unique_trace_results)
+            update_pbar(progress, pbar_id)
+        finally:
+            pass
 
 
 async def _a_execute_span_test_case(
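The inner `dfs` coroutine added above evaluates a span, optionally short-circuits via `_skip_metrics_for_error`, then fans out to the span's children as concurrent tasks with the same timeout/cancel-and-drain handling at every level; on cancellation, the new `except asyncio.CancelledError` branch back-fills `success`/`error` on any metric that never produced a verdict, and the `finally` block still builds and persists an API test case. A simplified, self-contained version of the traversal only:

    import asyncio
    from dataclasses import dataclass, field

    @dataclass
    class Span:  # assumed minimal stand-in for deepeval's BaseSpan
        name: str
        children: list["Span"] = field(default_factory=list)

    async def evaluate_span(span: Span) -> None:
        await asyncio.sleep(0)  # placeholder for per-span metric evaluation

    async def dfs(span: Span, timeout: float) -> None:
        await evaluate_span(span)
        child_tasks = [
            asyncio.create_task(dfs(child, timeout)) for child in span.children
        ]
        if child_tasks:
            try:
                await asyncio.wait_for(asyncio.gather(*child_tasks), timeout=timeout)
            except (asyncio.TimeoutError, TimeoutError):
                for t in child_tasks:
                    if not t.done():
                        t.cancel()
                await asyncio.gather(*child_tasks, return_exceptions=True)
                raise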
@@ -2177,9 +2629,8 @@ def a_execute_agentic_test_cases_from_loop(
 
     async def execute_callback_with_semaphore(coroutine: Awaitable):
         async with semaphore:
-
-
-            )
+            timeout = _per_task_timeout()
+            return await _await_with_outer_deadline(coroutine, timeout=timeout)
 
     def evaluate_test_cases(
         progress: Optional[Progress] = None,
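Unlike the `func, *args` wrappers elsewhere in this file, this one receives an already-created coroutine object. The distinction matters for retries: a coroutine object can only be awaited once, while a callable can build a fresh coroutine per attempt. A sketch of the two shapes side by side, with `asyncio.wait_for` as the stand-in deadline:

    import asyncio
    from typing import Any, Awaitable, Callable

    async def run_callable(func: Callable[..., Awaitable[Any]], *args: Any,
                           timeout: float, **kwargs: Any) -> Any:
        # Retry-friendly: every call constructs a new coroutine.
        return await asyncio.wait_for(func(*args, **kwargs), timeout=timeout)

    async def run_coroutine(coroutine: Awaitable[Any], timeout: float) -> Any:
        # Single-shot: the awaitable is consumed here and cannot be re-awaited.
        return await asyncio.wait_for(coroutine, timeout=timeout)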
@@ -2328,16 +2779,25 @@ def a_execute_agentic_test_cases_from_loop(
                         meta,
                     )
                 elif exc is not None:
+
+                    show_trace = bool(
+                        get_settings().DEEPEVAL_LOG_STACK_TRACES
+                    )
+                    exc_info = (
+                        (
+                            type(exc),
+                            exc,
+                            getattr(exc, "__traceback__", None),
+                        )
+                        if show_trace
+                        else None
+                    )
                     logger.error(
                         "[deepeval] task ERROR %s after %.2fs meta=%r",
                         t.get_name(),
                         duration,
                         meta,
-                        exc_info=
-                            type(exc),
-                            exc,
-                            getattr(exc, "__traceback__", None),
-                        ),
+                        exc_info=exc_info,
                     )
                 else:
                     logger.info(
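Building `exc_info` ahead of the `logger.error` call and gating it on the `DEEPEVAL_LOG_STACK_TRACES` setting keeps full tracebacks out of the logs unless explicitly requested. Roughly, assuming a boolean flag is passed in:

    import logging

    logger = logging.getLogger("deepeval")

    def log_task_error(exc: BaseException, task_name: str, duration: float,
                       show_stack_traces: bool) -> None:
        # A (type, value, traceback) triple is passed only when stack traces are
        # wanted; exc_info=None logs just the one-line summary.
        exc_info = (
            (type(exc), exc, getattr(exc, "__traceback__", None))
            if show_stack_traces
            else None
        )
        logger.error(
            "[deepeval] task ERROR %s after %.2fs",
            task_name,
            duration,
            exc_info=exc_info,
        )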
@@ -2396,7 +2856,7 @@ def a_execute_agentic_test_cases_from_loop(
                 )
             )
 
-        except asyncio.TimeoutError:
+        except (asyncio.TimeoutError, TimeoutError):
             import traceback
 
             pending = [t for t in created_tasks if not t.done()]
@@ -2609,9 +3069,9 @@ async def _a_evaluate_traces(
 
     async def execute_evals_with_semaphore(func: Callable, *args, **kwargs):
         async with semaphore:
-
-
-            timeout=
+            timeout = _per_task_timeout()
+            return await _await_with_outer_deadline(
+                func, *args, timeout=timeout, **kwargs
             )
 
     eval_tasks = []
@@ -2661,7 +3121,7 @@ async def _a_evaluate_traces(
             asyncio.gather(*eval_tasks),
             timeout=_gather_timeout(),
         )
-    except asyncio.TimeoutError:
+    except (asyncio.TimeoutError, TimeoutError):
         for t in eval_tasks:
             if not t.done():
                 t.cancel()
@@ -2689,9 +3149,9 @@ async def _evaluate_test_case_pairs(
 
     async def execute_with_semaphore(func: Callable, *args, **kwargs):
         async with semaphore:
-
-
-            timeout=
+            timeout = _per_task_timeout()
+            return await _await_with_outer_deadline(
+                func, *args, timeout=timeout, **kwargs
             )
 
     tasks = []
@@ -2731,7 +3191,7 @@ async def _evaluate_test_case_pairs(
             asyncio.gather(*tasks),
             timeout=_gather_timeout(),
         )
-    except asyncio.TimeoutError:
+    except (asyncio.TimeoutError, TimeoutError):
         # Cancel any still-pending tasks and drain them
         for t in tasks:
             if not t.done():
@@ -2756,6 +3216,9 @@ def _execute_metric(
         )
     except MissingTestCaseParamsError as e:
         if error_config.skip_on_missing_params:
+            metric.skipped = True
+            metric.error = None
+            metric.success = None
             return "skip"
         else:
             if error_config.ignore_errors:
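Flagging the metric as skipped, and clearing `error`/`success`, is what lets the `getattr(m, "skipped", False)` checks earlier in this diff drop the metric cleanly instead of treating it as a failure. A compact sketch of that contract:

    class MissingTestCaseParamsError(Exception):
        """Stand-in for deepeval's error when required test case params are absent."""

    def execute_metric(metric, test_case, skip_on_missing_params: bool) -> str:
        try:
            metric.measure(test_case)
            return "ok"
        except MissingTestCaseParamsError:
            if skip_on_missing_params:
                # Mark as skipped so downstream aggregation ignores the metric
                # rather than counting it as failed.
                metric.skipped = True
                metric.error = None
                metric.success = None
                return "skip"
            raise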
@@ -2768,6 +3231,9 @@ def _execute_metric(
             metric.measure(test_case)
         except MissingTestCaseParamsError as e:
             if error_config.skip_on_missing_params:
+                metric.skipped = True
+                metric.error = None
+                metric.success = None
                 return "skip"
             else:
                 if error_config.ignore_errors: