deepeval 3.6.7__py3-none-any.whl → 3.6.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +104 -36
- deepeval/config/utils.py +5 -0
- deepeval/dataset/dataset.py +162 -30
- deepeval/dataset/utils.py +41 -13
- deepeval/errors.py +20 -2
- deepeval/evaluate/execute.py +1662 -688
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +13 -3
- deepeval/integrations/crewai/__init__.py +2 -1
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +8 -8
- deepeval/metrics/indicator.py +21 -1
- deepeval/metrics/mcp/mcp_task_completion.py +7 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/task_completion/task_completion.py +1 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/amazon_bedrock_model.py +20 -17
- deepeval/models/llms/openai_model.py +10 -1
- deepeval/models/retry_policy.py +103 -20
- deepeval/openai/extractors.py +61 -16
- deepeval/openai/patch.py +8 -12
- deepeval/openai/types.py +1 -1
- deepeval/openai/utils.py +108 -1
- deepeval/prompt/prompt.py +1 -0
- deepeval/prompt/utils.py +43 -14
- deepeval/simulator/conversation_simulator.py +25 -18
- deepeval/synthesizer/chunking/context_generator.py +9 -1
- deepeval/synthesizer/synthesizer.py +11 -10
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/test_run.py +190 -207
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +23 -4
- deepeval/tracing/trace_context.py +53 -38
- deepeval/tracing/tracing.py +23 -0
- deepeval/tracing/types.py +16 -14
- deepeval/utils.py +21 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/METADATA +1 -1
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/RECORD +85 -63
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- deepeval/tracing/message_types/__init__.py +0 -10
- deepeval/tracing/message_types/base.py +0 -6
- deepeval/tracing/message_types/messages.py +0 -14
- deepeval/tracing/message_types/tools.py +0 -18
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/WHEEL +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/entry_points.txt +0 -0
deepeval/_version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__: str = "3.6.7"
|
|
1
|
+
__version__: str = "3.6.9"
|
deepeval/config/settings.py
CHANGED
|
@@ -30,6 +30,7 @@ from typing import Any, Dict, List, Optional, NamedTuple
|
|
|
30
30
|
from deepeval.config.utils import (
|
|
31
31
|
parse_bool,
|
|
32
32
|
coerce_to_list,
|
|
33
|
+
constrain_between,
|
|
33
34
|
dedupe_preserve_order,
|
|
34
35
|
)
|
|
35
36
|
from deepeval.constants import SUPPORTED_PROVIDER_SLUGS, slugify
|
|
@@ -336,6 +337,7 @@ class Settings(BaseSettings):
|
|
|
336
337
|
IGNORE_DEEPEVAL_ERRORS: Optional[bool] = None
|
|
337
338
|
SKIP_DEEPEVAL_MISSING_PARAMS: Optional[bool] = None
|
|
338
339
|
DEEPEVAL_VERBOSE_MODE: Optional[bool] = None
|
|
340
|
+
DEEPEVAL_LOG_STACK_TRACES: Optional[bool] = None
|
|
339
341
|
ENABLE_DEEPEVAL_CACHE: Optional[bool] = None
|
|
340
342
|
|
|
341
343
|
CONFIDENT_TRACE_FLUSH: Optional[bool] = None
|
|
@@ -355,11 +357,19 @@ class Settings(BaseSettings):
|
|
|
355
357
|
#
|
|
356
358
|
MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS: float = 3.05
|
|
357
359
|
MEDIA_IMAGE_READ_TIMEOUT_SECONDS: float = 10.0
|
|
358
|
-
#
|
|
359
|
-
#
|
|
360
|
-
#
|
|
361
|
-
|
|
362
|
-
|
|
360
|
+
# DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE
|
|
361
|
+
# Per-attempt timeout (seconds) for provider calls used by the retry policy.
|
|
362
|
+
# This is an OVERRIDE setting. The effective value you should rely on at runtime is
|
|
363
|
+
# the computed property: DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS.
|
|
364
|
+
#
|
|
365
|
+
# If this is None or 0 the DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS is computed from either:
|
|
366
|
+
# - DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE: slice the outer budget
|
|
367
|
+
# across attempts after subtracting expected backoff and a small safety buffer
|
|
368
|
+
# - the default outer budget (180s) if no outer override is set.
|
|
369
|
+
#
|
|
370
|
+
# Tip: Set this OR the outer override, but generally not both
|
|
371
|
+
DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE: Optional[confloat(gt=0)] = (
|
|
372
|
+
None
|
|
363
373
|
)
|
|
364
374
|
|
|
365
375
|
#
|
|
@@ -373,76 +383,115 @@ class Settings(BaseSettings):
|
|
|
373
383
|
#
|
|
374
384
|
DEEPEVAL_TIMEOUT_THREAD_LIMIT: conint(ge=1) = 128
|
|
375
385
|
DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS: confloat(ge=0) = 5.0
|
|
376
|
-
#
|
|
377
|
-
#
|
|
378
|
-
#
|
|
379
|
-
# attempts * per_attempt_timeout +
|
|
380
|
-
#
|
|
381
|
-
#
|
|
382
|
-
#
|
|
386
|
+
# DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE
|
|
387
|
+
# Outer time budget (seconds) for a single metric/test-case, including retries and backoff.
|
|
388
|
+
# This is an OVERRIDE setting. If None or 0 the DEEPEVAL_PER_TASK_TIMEOUT_SECONDS field is computed:
|
|
389
|
+
# attempts * per_attempt_timeout + expected_backoff + 1s safety
|
|
390
|
+
# (When neither override is set 180s is used.)
|
|
391
|
+
#
|
|
392
|
+
# If > 0, we use the value exactly and log a warning if it is likely too small
|
|
393
|
+
# to accommodate the configured attempts/backoff.
|
|
383
394
|
#
|
|
384
|
-
#
|
|
385
|
-
#
|
|
386
|
-
#
|
|
387
|
-
DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE: Optional[
|
|
395
|
+
# usage:
|
|
396
|
+
# - set DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE along with DEEPEVAL_RETRY_MAX_ATTEMPTS, or
|
|
397
|
+
# - set DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE alone.
|
|
398
|
+
DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE: Optional[confloat(ge=0)] = None
|
|
388
399
|
|
|
389
400
|
# Buffer time for gathering results from all tasks, added to the longest task duration
|
|
390
401
|
# Increase if many tasks are running concurrently
|
|
391
|
-
DEEPEVAL_TASK_GATHER_BUFFER_SECONDS: confloat(ge=0) =
|
|
402
|
+
# DEEPEVAL_TASK_GATHER_BUFFER_SECONDS: confloat(ge=0) = (
|
|
403
|
+
# 30 # 15s seemed like not enough. we may make this computed later.
|
|
404
|
+
# )
|
|
405
|
+
DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE: Optional[confloat(ge=0)] = (
|
|
406
|
+
None
|
|
407
|
+
)
|
|
392
408
|
|
|
393
409
|
###################
|
|
394
410
|
# Computed Fields #
|
|
395
411
|
###################
|
|
396
412
|
|
|
397
|
-
def _calc_auto_outer_timeout(self) ->
|
|
413
|
+
def _calc_auto_outer_timeout(self) -> float:
|
|
398
414
|
"""Compute outer budget from per-attempt timeout + retries/backoff.
|
|
399
415
|
Never reference the computed property itself here.
|
|
400
416
|
"""
|
|
401
417
|
attempts = self.DEEPEVAL_RETRY_MAX_ATTEMPTS or 1
|
|
402
|
-
timeout_seconds = float(
|
|
418
|
+
timeout_seconds = float(
|
|
419
|
+
self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE or 0
|
|
420
|
+
)
|
|
403
421
|
if timeout_seconds <= 0:
|
|
404
422
|
# No per-attempt timeout set -> default outer budget
|
|
405
423
|
return 180
|
|
406
424
|
|
|
407
|
-
|
|
408
|
-
cur = float(self.DEEPEVAL_RETRY_INITIAL_SECONDS)
|
|
409
|
-
cap = float(self.DEEPEVAL_RETRY_CAP_SECONDS)
|
|
410
|
-
base = float(self.DEEPEVAL_RETRY_EXP_BASE)
|
|
411
|
-
jitter = float(self.DEEPEVAL_RETRY_JITTER)
|
|
412
|
-
|
|
413
|
-
backoff = 0.0
|
|
414
|
-
for _ in range(sleeps):
|
|
415
|
-
backoff += min(cap, cur)
|
|
416
|
-
cur *= base
|
|
417
|
-
backoff += sleeps * (jitter / 2.0) # expected jitter
|
|
418
|
-
|
|
425
|
+
backoff = self._expected_backoff(attempts)
|
|
419
426
|
safety_overhead = 1.0
|
|
420
|
-
return
|
|
427
|
+
return float(
|
|
421
428
|
math.ceil(attempts * timeout_seconds + backoff + safety_overhead)
|
|
422
429
|
)
|
|
423
430
|
|
|
424
431
|
@computed_field
|
|
425
432
|
@property
|
|
426
|
-
def
|
|
433
|
+
def DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS(self) -> float:
|
|
434
|
+
over = self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE
|
|
435
|
+
if over is not None and float(over) > 0:
|
|
436
|
+
return float(over)
|
|
437
|
+
|
|
438
|
+
attempts = int(self.DEEPEVAL_RETRY_MAX_ATTEMPTS or 1)
|
|
439
|
+
outer_over = self.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE
|
|
440
|
+
|
|
441
|
+
# If the user set an outer override, slice it up
|
|
442
|
+
if outer_over and float(outer_over) > 0 and attempts > 0:
|
|
443
|
+
backoff = self._expected_backoff(attempts)
|
|
444
|
+
safety = 1.0
|
|
445
|
+
usable = max(0.0, float(outer_over) - backoff - safety)
|
|
446
|
+
return 0.0 if usable <= 0 else (usable / attempts)
|
|
447
|
+
|
|
448
|
+
# NEW: when neither override is set, derive from the default outer (180s)
|
|
449
|
+
default_outer = 180.0
|
|
450
|
+
backoff = self._expected_backoff(attempts)
|
|
451
|
+
safety = 1.0
|
|
452
|
+
usable = max(0.0, default_outer - backoff - safety)
|
|
453
|
+
# Keep per-attempt sensible (cap to at least 1s)
|
|
454
|
+
return 0.0 if usable <= 0 else max(1.0, usable / attempts)
|
|
455
|
+
|
|
456
|
+
@computed_field
|
|
457
|
+
@property
|
|
458
|
+
def DEEPEVAL_PER_TASK_TIMEOUT_SECONDS(self) -> float:
|
|
427
459
|
"""If OVERRIDE is set (nonzero), return it; else return the derived budget."""
|
|
428
460
|
outer = self.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE
|
|
429
461
|
if outer not in (None, 0):
|
|
430
462
|
# Warn if user-provided outer is likely to truncate retries
|
|
431
463
|
if (self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0) > 0:
|
|
432
464
|
min_needed = self._calc_auto_outer_timeout()
|
|
433
|
-
if
|
|
465
|
+
if float(outer) < min_needed:
|
|
434
466
|
if self.DEEPEVAL_VERBOSE_MODE:
|
|
435
467
|
logger.warning(
|
|
436
468
|
"Metric timeout (outer=%ss) is less than attempts × per-attempt "
|
|
437
469
|
"timeout + backoff (≈%ss). Retries may be cut short.",
|
|
438
|
-
|
|
470
|
+
float(outer),
|
|
439
471
|
min_needed,
|
|
440
472
|
)
|
|
441
|
-
return
|
|
473
|
+
return float(outer)
|
|
442
474
|
|
|
443
475
|
# Auto mode
|
|
444
476
|
return self._calc_auto_outer_timeout()
|
|
445
477
|
|
|
478
|
+
@computed_field
|
|
479
|
+
@property
|
|
480
|
+
def DEEPEVAL_TASK_GATHER_BUFFER_SECONDS(self) -> float:
|
|
481
|
+
"""
|
|
482
|
+
Buffer time we add to the longest task’s duration to allow gather/drain
|
|
483
|
+
to complete. If an override is provided, use it; otherwise derive a
|
|
484
|
+
sensible default from the task-level budget:
|
|
485
|
+
buffer = constrain_between(0.15 * DEEPEVAL_PER_TASK_TIMEOUT_SECONDS, 10, 60)
|
|
486
|
+
"""
|
|
487
|
+
over = self.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE
|
|
488
|
+
if over is not None and float(over) >= 0:
|
|
489
|
+
return float(over)
|
|
490
|
+
|
|
491
|
+
outer = float(self.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS or 0.0)
|
|
492
|
+
base = 0.15 * outer
|
|
493
|
+
return constrain_between(base, 10.0, 60.0)
|
|
494
|
+
|
|
446
495
|
##############
|
|
447
496
|
# Validators #
|
|
448
497
|
##############
|
|
@@ -810,6 +859,25 @@ class Settings(BaseSettings):
|
|
|
810
859
|
ctx.switch_model_provider(target)
|
|
811
860
|
return ctx.result
|
|
812
861
|
|
|
862
|
+
def _expected_backoff(self, attempts: int) -> float:
|
|
863
|
+
"""Sum of expected sleeps for (attempts-1) retries, including jitter expectation."""
|
|
864
|
+
sleeps = max(0, attempts - 1)
|
|
865
|
+
cur = float(self.DEEPEVAL_RETRY_INITIAL_SECONDS)
|
|
866
|
+
cap = float(self.DEEPEVAL_RETRY_CAP_SECONDS)
|
|
867
|
+
base = float(self.DEEPEVAL_RETRY_EXP_BASE)
|
|
868
|
+
jitter = float(self.DEEPEVAL_RETRY_JITTER)
|
|
869
|
+
|
|
870
|
+
backoff = 0.0
|
|
871
|
+
for _ in range(sleeps):
|
|
872
|
+
backoff += min(cap, cur)
|
|
873
|
+
cur *= base
|
|
874
|
+
backoff += sleeps * (jitter / 2.0) # expected jitter
|
|
875
|
+
return backoff
|
|
876
|
+
|
|
877
|
+
def _constrain_between(self, value: float, lo: float, hi: float) -> float:
|
|
878
|
+
"""Return value constrained to the inclusive range [lo, hi]."""
|
|
879
|
+
return min(max(value, lo), hi)
|
|
880
|
+
|
|
813
881
|
|
|
814
882
|
_settings_singleton: Optional[Settings] = None
|
|
815
883
|
|
deepeval/config/utils.py
CHANGED
|
@@ -137,3 +137,8 @@ def dedupe_preserve_order(items: Iterable[str]) -> List[str]:
|
|
|
137
137
|
seen.add(x)
|
|
138
138
|
out.append(x)
|
|
139
139
|
return out
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def constrain_between(value: float, lo: float, hi: float) -> float:
|
|
143
|
+
"""Return value constrained to the inclusive range [lo, hi]."""
|
|
144
|
+
return min(max(value, lo), hi)
|
deepeval/dataset/dataset.py
CHANGED
|
@@ -951,6 +951,8 @@ class EvaluationDataset:
|
|
|
951
951
|
context=golden.context,
|
|
952
952
|
name=golden.name,
|
|
953
953
|
comments=golden.comments,
|
|
954
|
+
additional_metadata=golden.additional_metadata,
|
|
955
|
+
custom_column_key_values=golden.custom_column_key_values,
|
|
954
956
|
)
|
|
955
957
|
for golden in self.goldens
|
|
956
958
|
]
|
|
@@ -965,6 +967,10 @@ class EvaluationDataset:
|
|
|
965
967
|
name=golden.name,
|
|
966
968
|
comments=golden.comments,
|
|
967
969
|
source_file=golden.source_file,
|
|
970
|
+
tools_called=golden.tools_called,
|
|
971
|
+
expected_tools=golden.expected_tools,
|
|
972
|
+
additional_metadata=golden.additional_metadata,
|
|
973
|
+
custom_column_key_values=golden.custom_column_key_values,
|
|
968
974
|
)
|
|
969
975
|
for golden in self.goldens
|
|
970
976
|
]
|
|
@@ -995,36 +1001,68 @@ class EvaluationDataset:
|
|
|
995
1001
|
if file_type == "json":
|
|
996
1002
|
with open(full_file_path, "w", encoding="utf-8") as file:
|
|
997
1003
|
if self._multi_turn:
|
|
998
|
-
json_data = [
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1004
|
+
json_data = []
|
|
1005
|
+
for golden in goldens:
|
|
1006
|
+
# Serialize turns as structured list of dicts
|
|
1007
|
+
turns_list = (
|
|
1008
|
+
json.loads(format_turns(golden.turns))
|
|
1009
|
+
if golden.turns
|
|
1010
|
+
else None
|
|
1011
|
+
)
|
|
1012
|
+
json_data.append(
|
|
1013
|
+
{
|
|
1014
|
+
"scenario": golden.scenario,
|
|
1015
|
+
"turns": turns_list,
|
|
1016
|
+
"expected_outcome": golden.expected_outcome,
|
|
1017
|
+
"user_description": golden.user_description,
|
|
1018
|
+
"context": golden.context,
|
|
1019
|
+
"name": golden.name,
|
|
1020
|
+
"comments": golden.comments,
|
|
1021
|
+
"additional_metadata": golden.additional_metadata,
|
|
1022
|
+
"custom_column_key_values": golden.custom_column_key_values,
|
|
1023
|
+
}
|
|
1024
|
+
)
|
|
1014
1025
|
else:
|
|
1015
|
-
json_data = [
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1026
|
+
json_data = []
|
|
1027
|
+
for golden in goldens:
|
|
1028
|
+
# Convert ToolCall lists to list[dict]
|
|
1029
|
+
def _dump_tools(tools):
|
|
1030
|
+
if not tools:
|
|
1031
|
+
return None
|
|
1032
|
+
dumped = []
|
|
1033
|
+
for t in tools:
|
|
1034
|
+
if hasattr(t, "model_dump"):
|
|
1035
|
+
dumped.append(
|
|
1036
|
+
t.model_dump(
|
|
1037
|
+
by_alias=True, exclude_none=True
|
|
1038
|
+
)
|
|
1039
|
+
)
|
|
1040
|
+
elif hasattr(t, "dict"):
|
|
1041
|
+
dumped.append(t.dict(exclude_none=True))
|
|
1042
|
+
else:
|
|
1043
|
+
dumped.append(t)
|
|
1044
|
+
return dumped if len(dumped) > 0 else None
|
|
1045
|
+
|
|
1046
|
+
json_data.append(
|
|
1047
|
+
{
|
|
1048
|
+
"input": golden.input,
|
|
1049
|
+
"actual_output": golden.actual_output,
|
|
1050
|
+
"expected_output": golden.expected_output,
|
|
1051
|
+
"retrieval_context": golden.retrieval_context,
|
|
1052
|
+
"context": golden.context,
|
|
1053
|
+
"name": golden.name,
|
|
1054
|
+
"comments": golden.comments,
|
|
1055
|
+
"source_file": golden.source_file,
|
|
1056
|
+
"tools_called": _dump_tools(
|
|
1057
|
+
golden.tools_called
|
|
1058
|
+
),
|
|
1059
|
+
"expected_tools": _dump_tools(
|
|
1060
|
+
golden.expected_tools
|
|
1061
|
+
),
|
|
1062
|
+
"additional_metadata": golden.additional_metadata,
|
|
1063
|
+
"custom_column_key_values": golden.custom_column_key_values,
|
|
1064
|
+
}
|
|
1065
|
+
)
|
|
1028
1066
|
json.dump(json_data, file, indent=4, ensure_ascii=False)
|
|
1029
1067
|
elif file_type == "csv":
|
|
1030
1068
|
with open(
|
|
@@ -1041,6 +1079,8 @@ class EvaluationDataset:
|
|
|
1041
1079
|
"context",
|
|
1042
1080
|
"name",
|
|
1043
1081
|
"comments",
|
|
1082
|
+
"additional_metadata",
|
|
1083
|
+
"custom_column_key_values",
|
|
1044
1084
|
]
|
|
1045
1085
|
)
|
|
1046
1086
|
for golden in goldens:
|
|
@@ -1054,6 +1094,21 @@ class EvaluationDataset:
|
|
|
1054
1094
|
if golden.turns is not None
|
|
1055
1095
|
else None
|
|
1056
1096
|
)
|
|
1097
|
+
additional_metadata = (
|
|
1098
|
+
json.dumps(
|
|
1099
|
+
golden.additional_metadata, ensure_ascii=False
|
|
1100
|
+
)
|
|
1101
|
+
if golden.additional_metadata is not None
|
|
1102
|
+
else None
|
|
1103
|
+
)
|
|
1104
|
+
custom_cols = (
|
|
1105
|
+
json.dumps(
|
|
1106
|
+
golden.custom_column_key_values,
|
|
1107
|
+
ensure_ascii=False,
|
|
1108
|
+
)
|
|
1109
|
+
if golden.custom_column_key_values
|
|
1110
|
+
else None
|
|
1111
|
+
)
|
|
1057
1112
|
writer.writerow(
|
|
1058
1113
|
[
|
|
1059
1114
|
golden.scenario,
|
|
@@ -1063,6 +1118,8 @@ class EvaluationDataset:
|
|
|
1063
1118
|
context,
|
|
1064
1119
|
golden.name,
|
|
1065
1120
|
golden.comments,
|
|
1121
|
+
additional_metadata,
|
|
1122
|
+
custom_cols,
|
|
1066
1123
|
]
|
|
1067
1124
|
)
|
|
1068
1125
|
else:
|
|
@@ -1076,6 +1133,10 @@ class EvaluationDataset:
|
|
|
1076
1133
|
"name",
|
|
1077
1134
|
"comments",
|
|
1078
1135
|
"source_file",
|
|
1136
|
+
"tools_called",
|
|
1137
|
+
"expected_tools",
|
|
1138
|
+
"additional_metadata",
|
|
1139
|
+
"custom_column_key_values",
|
|
1079
1140
|
]
|
|
1080
1141
|
)
|
|
1081
1142
|
for golden in goldens:
|
|
@@ -1089,6 +1150,42 @@ class EvaluationDataset:
|
|
|
1089
1150
|
if golden.context is not None
|
|
1090
1151
|
else None
|
|
1091
1152
|
)
|
|
1153
|
+
|
|
1154
|
+
# Dump tools as JSON strings for CSV
|
|
1155
|
+
def _dump_tools_csv(tools):
|
|
1156
|
+
if not tools:
|
|
1157
|
+
return None
|
|
1158
|
+
dumped = []
|
|
1159
|
+
for t in tools:
|
|
1160
|
+
if hasattr(t, "model_dump"):
|
|
1161
|
+
dumped.append(
|
|
1162
|
+
t.model_dump(
|
|
1163
|
+
by_alias=True, exclude_none=True
|
|
1164
|
+
)
|
|
1165
|
+
)
|
|
1166
|
+
elif hasattr(t, "dict"):
|
|
1167
|
+
dumped.append(t.dict(exclude_none=True))
|
|
1168
|
+
else:
|
|
1169
|
+
dumped.append(t)
|
|
1170
|
+
return json.dumps(dumped, ensure_ascii=False)
|
|
1171
|
+
|
|
1172
|
+
tools_called = _dump_tools_csv(golden.tools_called)
|
|
1173
|
+
expected_tools = _dump_tools_csv(golden.expected_tools)
|
|
1174
|
+
additional_metadata = (
|
|
1175
|
+
json.dumps(
|
|
1176
|
+
golden.additional_metadata, ensure_ascii=False
|
|
1177
|
+
)
|
|
1178
|
+
if golden.additional_metadata is not None
|
|
1179
|
+
else None
|
|
1180
|
+
)
|
|
1181
|
+
custom_cols = (
|
|
1182
|
+
json.dumps(
|
|
1183
|
+
golden.custom_column_key_values,
|
|
1184
|
+
ensure_ascii=False,
|
|
1185
|
+
)
|
|
1186
|
+
if golden.custom_column_key_values
|
|
1187
|
+
else None
|
|
1188
|
+
)
|
|
1092
1189
|
writer.writerow(
|
|
1093
1190
|
[
|
|
1094
1191
|
golden.input,
|
|
@@ -1099,6 +1196,10 @@ class EvaluationDataset:
|
|
|
1099
1196
|
golden.name,
|
|
1100
1197
|
golden.comments,
|
|
1101
1198
|
golden.source_file,
|
|
1199
|
+
tools_called,
|
|
1200
|
+
expected_tools,
|
|
1201
|
+
additional_metadata,
|
|
1202
|
+
custom_cols,
|
|
1102
1203
|
]
|
|
1103
1204
|
)
|
|
1104
1205
|
elif file_type == "jsonl":
|
|
@@ -1106,7 +1207,9 @@ class EvaluationDataset:
|
|
|
1106
1207
|
for golden in goldens:
|
|
1107
1208
|
if self._multi_turn:
|
|
1108
1209
|
turns = (
|
|
1109
|
-
format_turns(golden.turns)
|
|
1210
|
+
json.loads(format_turns(golden.turns))
|
|
1211
|
+
if golden.turns
|
|
1212
|
+
else None
|
|
1110
1213
|
)
|
|
1111
1214
|
record = {
|
|
1112
1215
|
"scenario": golden.scenario,
|
|
@@ -1114,6 +1217,10 @@ class EvaluationDataset:
|
|
|
1114
1217
|
"expected_outcome": golden.expected_outcome,
|
|
1115
1218
|
"user_description": golden.user_description,
|
|
1116
1219
|
"context": golden.context,
|
|
1220
|
+
"name": golden.name,
|
|
1221
|
+
"comments": golden.comments,
|
|
1222
|
+
"additional_metadata": golden.additional_metadata,
|
|
1223
|
+
"custom_column_key_values": golden.custom_column_key_values,
|
|
1117
1224
|
}
|
|
1118
1225
|
else:
|
|
1119
1226
|
retrieval_context = (
|
|
@@ -1126,12 +1233,37 @@ class EvaluationDataset:
|
|
|
1126
1233
|
if golden.context is not None
|
|
1127
1234
|
else None
|
|
1128
1235
|
)
|
|
1236
|
+
|
|
1237
|
+
# Convert ToolCall lists to list[dict]
|
|
1238
|
+
def _dump_tools(tools):
|
|
1239
|
+
if not tools:
|
|
1240
|
+
return None
|
|
1241
|
+
dumped = []
|
|
1242
|
+
for t in tools:
|
|
1243
|
+
if hasattr(t, "model_dump"):
|
|
1244
|
+
dumped.append(
|
|
1245
|
+
t.model_dump(
|
|
1246
|
+
by_alias=True, exclude_none=True
|
|
1247
|
+
)
|
|
1248
|
+
)
|
|
1249
|
+
elif hasattr(t, "dict"):
|
|
1250
|
+
dumped.append(t.dict(exclude_none=True))
|
|
1251
|
+
else:
|
|
1252
|
+
dumped.append(t)
|
|
1253
|
+
return dumped if len(dumped) > 0 else None
|
|
1254
|
+
|
|
1129
1255
|
record = {
|
|
1130
1256
|
"input": golden.input,
|
|
1131
1257
|
"actual_output": golden.actual_output,
|
|
1132
1258
|
"expected_output": golden.expected_output,
|
|
1133
1259
|
"retrieval_context": retrieval_context,
|
|
1134
1260
|
"context": context,
|
|
1261
|
+
"tools_called": _dump_tools(golden.tools_called),
|
|
1262
|
+
"expected_tools": _dump_tools(
|
|
1263
|
+
golden.expected_tools
|
|
1264
|
+
),
|
|
1265
|
+
"additional_metadata": golden.additional_metadata,
|
|
1266
|
+
"custom_column_key_values": golden.custom_column_key_values,
|
|
1135
1267
|
}
|
|
1136
1268
|
|
|
1137
1269
|
file.write(json.dumps(record, ensure_ascii=False) + "\n")
|
deepeval/dataset/utils.py
CHANGED
|
@@ -111,12 +111,36 @@ def trimAndLoadJson(input_string: str) -> Any:
|
|
|
111
111
|
def format_turns(turns: List[Turn]) -> str:
|
|
112
112
|
res = []
|
|
113
113
|
for turn in turns:
|
|
114
|
+
# Safely convert nested Pydantic models (ToolCall/MCP calls) to dicts
|
|
115
|
+
def _dump_list(models):
|
|
116
|
+
if not models:
|
|
117
|
+
return None
|
|
118
|
+
dumped = []
|
|
119
|
+
for m in models:
|
|
120
|
+
if hasattr(m, "model_dump"):
|
|
121
|
+
dumped.append(
|
|
122
|
+
m.model_dump(by_alias=True, exclude_none=True)
|
|
123
|
+
)
|
|
124
|
+
elif hasattr(m, "dict"):
|
|
125
|
+
dumped.append(m.dict(exclude_none=True))
|
|
126
|
+
else:
|
|
127
|
+
dumped.append(m)
|
|
128
|
+
return dumped if len(dumped) > 0 else None
|
|
129
|
+
|
|
114
130
|
cur_turn = {
|
|
115
131
|
"role": turn.role,
|
|
116
132
|
"content": turn.content,
|
|
133
|
+
"user_id": turn.user_id if turn.user_id is not None else None,
|
|
117
134
|
"retrieval_context": (
|
|
118
135
|
turn.retrieval_context if turn.retrieval_context else None
|
|
119
136
|
),
|
|
137
|
+
"tools_called": _dump_list(turn.tools_called),
|
|
138
|
+
"mcp_tools_called": _dump_list(turn.mcp_tools_called),
|
|
139
|
+
"mcp_resources_called": _dump_list(turn.mcp_resources_called),
|
|
140
|
+
"mcp_prompts_called": _dump_list(turn.mcp_prompts_called),
|
|
141
|
+
"additional_metadata": (
|
|
142
|
+
turn.additional_metadata if turn.additional_metadata else None
|
|
143
|
+
),
|
|
120
144
|
}
|
|
121
145
|
res.append(cur_turn)
|
|
122
146
|
try:
|
|
@@ -125,11 +149,17 @@ def format_turns(turns: List[Turn]) -> str:
|
|
|
125
149
|
raise ValueError(f"Error serializing turns: {e}")
|
|
126
150
|
|
|
127
151
|
|
|
128
|
-
def parse_turns(turns_str:
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
152
|
+
def parse_turns(turns_str: Any) -> List[Turn]:
|
|
153
|
+
# Accept either a JSON string or a Python list
|
|
154
|
+
if isinstance(turns_str, str):
|
|
155
|
+
try:
|
|
156
|
+
parsed = json.loads(turns_str)
|
|
157
|
+
except json.JSONDecodeError as e:
|
|
158
|
+
raise ValueError(f"Invalid JSON: {e}")
|
|
159
|
+
elif isinstance(turns_str, list):
|
|
160
|
+
parsed = turns_str
|
|
161
|
+
else:
|
|
162
|
+
raise TypeError("Expected a JSON string or a list of turns.")
|
|
133
163
|
|
|
134
164
|
if not isinstance(parsed, list):
|
|
135
165
|
raise TypeError("Expected a list of turns.")
|
|
@@ -145,15 +175,13 @@ def parse_turns(turns_str: str) -> List[Turn]:
|
|
|
145
175
|
if "content" not in turn or not isinstance(turn["content"], str):
|
|
146
176
|
raise ValueError(f"Turn at index {i} is missing a valid 'content'.")
|
|
147
177
|
|
|
148
|
-
|
|
178
|
+
try:
|
|
179
|
+
# Pydantic v2
|
|
180
|
+
res.append(Turn.model_validate(turn))
|
|
181
|
+
except AttributeError:
|
|
182
|
+
# Pydantic v1 fallback
|
|
183
|
+
res.append(Turn.parse_obj(turn))
|
|
149
184
|
|
|
150
|
-
res.append(
|
|
151
|
-
Turn(
|
|
152
|
-
role=turn["role"],
|
|
153
|
-
content=turn["content"],
|
|
154
|
-
retrieval_context=retrieval_context,
|
|
155
|
-
)
|
|
156
|
-
)
|
|
157
185
|
return res
|
|
158
186
|
|
|
159
187
|
|
deepeval/errors.py
CHANGED
|
@@ -1,6 +1,24 @@
|
|
|
1
|
-
class
|
|
1
|
+
class DeepEvalError(Exception):
|
|
2
|
+
"""Base class for framework-originated errors.
|
|
3
|
+
If raised and not handled, it will abort the current operation.
|
|
4
|
+
We may also stringify instances of this class and attach them to traces or spans to surface
|
|
5
|
+
non-fatal diagnostics while allowing the run to continue.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class UserAppError(Exception):
|
|
10
|
+
"""Represents exceptions thrown by user LLM apps/tools.
|
|
11
|
+
We record these on traces or spans and keep the overall evaluation run alive.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class MissingTestCaseParamsError(DeepEvalError):
|
|
16
|
+
"""Required test case fields are missing."""
|
|
17
|
+
|
|
2
18
|
pass
|
|
3
19
|
|
|
4
20
|
|
|
5
|
-
class MismatchedTestCaseInputsError(
|
|
21
|
+
class MismatchedTestCaseInputsError(DeepEvalError):
|
|
22
|
+
"""Inputs provided to a metric or test case are inconsistent or invalid."""
|
|
23
|
+
|
|
6
24
|
pass
|