deepeval 3.6.7__py3-none-any.whl → 3.6.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +104 -36
  3. deepeval/config/utils.py +5 -0
  4. deepeval/dataset/dataset.py +162 -30
  5. deepeval/dataset/utils.py +41 -13
  6. deepeval/errors.py +20 -2
  7. deepeval/evaluate/execute.py +1662 -688
  8. deepeval/evaluate/types.py +1 -0
  9. deepeval/evaluate/utils.py +13 -3
  10. deepeval/integrations/crewai/__init__.py +2 -1
  11. deepeval/integrations/crewai/tool.py +71 -0
  12. deepeval/integrations/llama_index/__init__.py +0 -4
  13. deepeval/integrations/llama_index/handler.py +20 -21
  14. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  15. deepeval/metrics/__init__.py +13 -0
  16. deepeval/metrics/base_metric.py +1 -0
  17. deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
  18. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  19. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
  20. deepeval/metrics/dag/schema.py +1 -1
  21. deepeval/metrics/dag/templates.py +2 -2
  22. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  23. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  24. deepeval/metrics/goal_accuracy/schema.py +17 -0
  25. deepeval/metrics/goal_accuracy/template.py +235 -0
  26. deepeval/metrics/hallucination/hallucination.py +8 -8
  27. deepeval/metrics/indicator.py +21 -1
  28. deepeval/metrics/mcp/mcp_task_completion.py +7 -2
  29. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
  30. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
  31. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
  32. deepeval/metrics/plan_adherence/__init__.py +1 -0
  33. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  34. deepeval/metrics/plan_adherence/schema.py +11 -0
  35. deepeval/metrics/plan_adherence/template.py +170 -0
  36. deepeval/metrics/plan_quality/__init__.py +1 -0
  37. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  38. deepeval/metrics/plan_quality/schema.py +11 -0
  39. deepeval/metrics/plan_quality/template.py +101 -0
  40. deepeval/metrics/step_efficiency/__init__.py +1 -0
  41. deepeval/metrics/step_efficiency/schema.py +11 -0
  42. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  43. deepeval/metrics/step_efficiency/template.py +256 -0
  44. deepeval/metrics/task_completion/task_completion.py +1 -0
  45. deepeval/metrics/tool_correctness/schema.py +6 -0
  46. deepeval/metrics/tool_correctness/template.py +88 -0
  47. deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
  48. deepeval/metrics/tool_use/__init__.py +1 -0
  49. deepeval/metrics/tool_use/schema.py +19 -0
  50. deepeval/metrics/tool_use/template.py +220 -0
  51. deepeval/metrics/tool_use/tool_use.py +458 -0
  52. deepeval/metrics/topic_adherence/__init__.py +1 -0
  53. deepeval/metrics/topic_adherence/schema.py +16 -0
  54. deepeval/metrics/topic_adherence/template.py +162 -0
  55. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  56. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  57. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  58. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  59. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  60. deepeval/models/llms/amazon_bedrock_model.py +20 -17
  61. deepeval/models/llms/openai_model.py +10 -1
  62. deepeval/models/retry_policy.py +103 -20
  63. deepeval/openai/extractors.py +61 -16
  64. deepeval/openai/patch.py +8 -12
  65. deepeval/openai/types.py +1 -1
  66. deepeval/openai/utils.py +108 -1
  67. deepeval/prompt/prompt.py +1 -0
  68. deepeval/prompt/utils.py +43 -14
  69. deepeval/simulator/conversation_simulator.py +25 -18
  70. deepeval/synthesizer/chunking/context_generator.py +9 -1
  71. deepeval/synthesizer/synthesizer.py +11 -10
  72. deepeval/test_case/llm_test_case.py +6 -2
  73. deepeval/test_run/test_run.py +190 -207
  74. deepeval/tracing/__init__.py +2 -1
  75. deepeval/tracing/otel/exporter.py +3 -4
  76. deepeval/tracing/otel/utils.py +23 -4
  77. deepeval/tracing/trace_context.py +53 -38
  78. deepeval/tracing/tracing.py +23 -0
  79. deepeval/tracing/types.py +16 -14
  80. deepeval/utils.py +21 -0
  81. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/METADATA +1 -1
  82. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/RECORD +85 -63
  83. deepeval/integrations/llama_index/agent/patched.py +0 -68
  84. deepeval/tracing/message_types/__init__.py +0 -10
  85. deepeval/tracing/message_types/base.py +0 -6
  86. deepeval/tracing/message_types/messages.py +0 -14
  87. deepeval/tracing/message_types/tools.py +0 -18
  88. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/LICENSE.md +0 -0
  89. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/WHEEL +0 -0
  90. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/entry_points.txt +0 -0
deepeval/_version.py CHANGED
@@ -1 +1 @@
- __version__: str = "3.6.7"
+ __version__: str = "3.6.9"
deepeval/config/settings.py CHANGED
@@ -30,6 +30,7 @@ from typing import Any, Dict, List, Optional, NamedTuple
  from deepeval.config.utils import (
      parse_bool,
      coerce_to_list,
+     constrain_between,
      dedupe_preserve_order,
  )
  from deepeval.constants import SUPPORTED_PROVIDER_SLUGS, slugify
@@ -336,6 +337,7 @@ class Settings(BaseSettings):
      IGNORE_DEEPEVAL_ERRORS: Optional[bool] = None
      SKIP_DEEPEVAL_MISSING_PARAMS: Optional[bool] = None
      DEEPEVAL_VERBOSE_MODE: Optional[bool] = None
+     DEEPEVAL_LOG_STACK_TRACES: Optional[bool] = None
      ENABLE_DEEPEVAL_CACHE: Optional[bool] = None

      CONFIDENT_TRACE_FLUSH: Optional[bool] = None
@@ -355,11 +357,19 @@
      #
      MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS: float = 3.05
      MEDIA_IMAGE_READ_TIMEOUT_SECONDS: float = 10.0
-     # DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS: per-attempt timeout for provider calls enforced by our retry decorator.
-     # This timeout interacts with retry policy and the task-level budget (DEEPEVAL_PER_TASK_TIMEOUT_SECONDS) below.
-     # If you leave this at 0/None, the computed outer budget defaults to 180s.
-     DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS: Optional[confloat(ge=0)] = (
-         None  # per-attempt timeout. Set 0/None to disable
+     # DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE
+     # Per-attempt timeout (seconds) for provider calls used by the retry policy.
+     # This is an OVERRIDE setting. The effective value to rely on at runtime is
+     # the computed property DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS.
+     #
+     # If this is None or 0, DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS is computed from either:
+     #   - DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE: slice the outer budget
+     #     across attempts after subtracting expected backoff and a small safety buffer
+     #   - the default outer budget (180s) if no outer override is set.
+     #
+     # Tip: set this OR the outer override, but generally not both.
+     DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE: Optional[confloat(gt=0)] = (
+         None
      )

      #
@@ -373,76 +383,115 @@
      #
      DEEPEVAL_TIMEOUT_THREAD_LIMIT: conint(ge=1) = 128
      DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS: confloat(ge=0) = 5.0
-     # DEEPEVAL_PER_TASK_TIMEOUT_SECONDS is the outer time budget for one metric/task.
-     # It is computed from per-attempt timeout + retries/backoff unless you explicitly override it.
-     #   - OVERRIDE = None or 0 -> auto compute as:
-     #       attempts * per_attempt_timeout + sum(backoff_sleeps) + ~jitter/2 per sleep + 1s safety
-     #     (If per_attempt_timeout is 0/None, the auto outer budget defaults to 180s.)
-     #   - OVERRIDE > 0 -> use that exact value. A warning is logged if it is likely too small
-     #     to permit the configured attempts/backoff.
+     # DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE
+     # Outer time budget (seconds) for a single metric/test-case, including retries and backoff.
+     # This is an OVERRIDE setting. If None or 0, the DEEPEVAL_PER_TASK_TIMEOUT_SECONDS field is computed:
+     #     attempts * per_attempt_timeout + expected_backoff + 1s safety
+     # (When neither override is set, 180s is used.)
+     #
+     # If > 0, we use the value exactly and log a warning if it is likely too small
+     # to accommodate the configured attempts/backoff.
      #
-     # Tip:
-     #   Most users only need to set DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS and DEEPEVAL_RETRY_MAX_ATTEMPTS.
-     #   Leave the outer budget on auto unless you have very strict SLAs.
-     DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE: Optional[conint(ge=0)] = None
+     # Usage:
+     #   - set DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE along with DEEPEVAL_RETRY_MAX_ATTEMPTS, or
+     #   - set DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE alone.
+     DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE: Optional[confloat(ge=0)] = None

      # Buffer time for gathering results from all tasks, added to the longest task duration
      # Increase if many tasks are running concurrently
-     DEEPEVAL_TASK_GATHER_BUFFER_SECONDS: confloat(ge=0) = 60
+     # DEEPEVAL_TASK_GATHER_BUFFER_SECONDS: confloat(ge=0) = (
+     #     30  # 15s seemed like not enough. we may make this computed later.
+     # )
+     DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE: Optional[confloat(ge=0)] = (
+         None
+     )

      ###################
      # Computed Fields #
      ###################

-     def _calc_auto_outer_timeout(self) -> int:
+     def _calc_auto_outer_timeout(self) -> float:
          """Compute outer budget from per-attempt timeout + retries/backoff.
          Never reference the computed property itself here.
          """
          attempts = self.DEEPEVAL_RETRY_MAX_ATTEMPTS or 1
-         timeout_seconds = float(self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
+         timeout_seconds = float(
+             self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE or 0
+         )
          if timeout_seconds <= 0:
              # No per-attempt timeout set -> default outer budget
              return 180

-         sleeps = max(0, attempts - 1)
-         cur = float(self.DEEPEVAL_RETRY_INITIAL_SECONDS)
-         cap = float(self.DEEPEVAL_RETRY_CAP_SECONDS)
-         base = float(self.DEEPEVAL_RETRY_EXP_BASE)
-         jitter = float(self.DEEPEVAL_RETRY_JITTER)
-
-         backoff = 0.0
-         for _ in range(sleeps):
-             backoff += min(cap, cur)
-             cur *= base
-         backoff += sleeps * (jitter / 2.0)  # expected jitter
-
+         backoff = self._expected_backoff(attempts)
          safety_overhead = 1.0
-         return int(
+         return float(
              math.ceil(attempts * timeout_seconds + backoff + safety_overhead)
          )

      @computed_field
      @property
-     def DEEPEVAL_PER_TASK_TIMEOUT_SECONDS(self) -> int:
+     def DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS(self) -> float:
+         over = self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE
+         if over is not None and float(over) > 0:
+             return float(over)
+
+         attempts = int(self.DEEPEVAL_RETRY_MAX_ATTEMPTS or 1)
+         outer_over = self.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE
+
+         # If the user set an outer override, slice it up
+         if outer_over and float(outer_over) > 0 and attempts > 0:
+             backoff = self._expected_backoff(attempts)
+             safety = 1.0
+             usable = max(0.0, float(outer_over) - backoff - safety)
+             return 0.0 if usable <= 0 else (usable / attempts)
+
+         # NEW: when neither override is set, derive from the default outer (180s)
+         default_outer = 180.0
+         backoff = self._expected_backoff(attempts)
+         safety = 1.0
+         usable = max(0.0, default_outer - backoff - safety)
+         # Keep per-attempt sensible (cap to at least 1s)
+         return 0.0 if usable <= 0 else max(1.0, usable / attempts)
+
+     @computed_field
+     @property
+     def DEEPEVAL_PER_TASK_TIMEOUT_SECONDS(self) -> float:
          """If OVERRIDE is set (nonzero), return it; else return the derived budget."""
          outer = self.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE
          if outer not in (None, 0):
              # Warn if user-provided outer is likely to truncate retries
              if (self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0) > 0:
                  min_needed = self._calc_auto_outer_timeout()
-                 if int(outer) < min_needed:
+                 if float(outer) < min_needed:
                      if self.DEEPEVAL_VERBOSE_MODE:
                          logger.warning(
                              "Metric timeout (outer=%ss) is less than attempts × per-attempt "
                              "timeout + backoff (≈%ss). Retries may be cut short.",
-                             int(outer),
+                             float(outer),
                              min_needed,
                          )
-             return int(outer)
+             return float(outer)

          # Auto mode
          return self._calc_auto_outer_timeout()

+     @computed_field
+     @property
+     def DEEPEVAL_TASK_GATHER_BUFFER_SECONDS(self) -> float:
+         """
+         Buffer time we add to the longest task’s duration to allow gather/drain
+         to complete. If an override is provided, use it; otherwise derive a
+         sensible default from the task-level budget:
+             buffer = constrain_between(0.15 * DEEPEVAL_PER_TASK_TIMEOUT_SECONDS, 10, 60)
+         """
+         over = self.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE
+         if over is not None and float(over) >= 0:
+             return float(over)
+
+         outer = float(self.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS or 0.0)
+         base = 0.15 * outer
+         return constrain_between(base, 10.0, 60.0)
+
      ##############
      # Validators #
      ##############
@@ -810,6 +859,25 @@ class Settings(BaseSettings):
              ctx.switch_model_provider(target)
          return ctx.result

+     def _expected_backoff(self, attempts: int) -> float:
+         """Sum of expected sleeps for (attempts-1) retries, including jitter expectation."""
+         sleeps = max(0, attempts - 1)
+         cur = float(self.DEEPEVAL_RETRY_INITIAL_SECONDS)
+         cap = float(self.DEEPEVAL_RETRY_CAP_SECONDS)
+         base = float(self.DEEPEVAL_RETRY_EXP_BASE)
+         jitter = float(self.DEEPEVAL_RETRY_JITTER)
+
+         backoff = 0.0
+         for _ in range(sleeps):
+             backoff += min(cap, cur)
+             cur *= base
+         backoff += sleeps * (jitter / 2.0)  # expected jitter
+         return backoff
+
+     def _constrain_between(self, value: float, lo: float, hi: float) -> float:
+         """Return value constrained to the inclusive range [lo, hi]."""
+         return min(max(value, lo), hi)
+

  _settings_singleton: Optional[Settings] = None

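The comments above boil down to an arithmetic rule: the effective per-attempt timeout is whatever remains of the outer budget after expected backoff and a 1s safety margin, split evenly across attempts. A standalone sketch of that rule, using illustrative retry constants (initial=1s, cap=10s, exp_base=2, jitter=1s stand in for the DEEPEVAL_RETRY_* settings; they are not the library defaults):

```python
# Sketch of the budget-slicing rule behind DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS.
def expected_backoff(attempts: int, initial: float = 1.0, cap: float = 10.0,
                     exp_base: float = 2.0, jitter: float = 1.0) -> float:
    sleeps = max(0, attempts - 1)
    cur, backoff = initial, 0.0
    for _ in range(sleeps):
        backoff += min(cap, cur)  # each sleep is capped
        cur *= exp_base           # exponential growth between retries
    return backoff + sleeps * (jitter / 2.0)  # expected value of the jitter

def per_attempt_timeout(outer_budget: float = 120.0, attempts: int = 3) -> float:
    usable = max(0.0, outer_budget - expected_backoff(attempts) - 1.0)  # 1s safety
    return 0.0 if usable <= 0 else max(1.0, usable / attempts)

# attempts=3 -> backoff = 1s + 2s sleeps + 2 * 0.5s expected jitter = 4s,
# so per-attempt = (120 - 4 - 1) / 3 ≈ 38.33s
print(per_attempt_timeout(120.0, 3))
```

Since Settings extends BaseSettings, these overrides are typically supplied as same-named environment variables (e.g. DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE=120).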
deepeval/config/utils.py CHANGED
@@ -137,3 +137,8 @@ def dedupe_preserve_order(items: Iterable[str]) -> List[str]:
          seen.add(x)
          out.append(x)
      return out
+
+
+ def constrain_between(value: float, lo: float, hi: float) -> float:
+     """Return value constrained to the inclusive range [lo, hi]."""
+     return min(max(value, lo), hi)
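constrain_between is the clamp the new DEEPEVAL_TASK_GATHER_BUFFER_SECONDS property relies on; its behavior at the bounds:

```python
from deepeval.config.utils import constrain_between

constrain_between(5.0, 10.0, 60.0)   # 10.0  (below lo -> clamped up)
constrain_between(27.0, 10.0, 60.0)  # 27.0  (in range -> unchanged)
constrain_between(90.0, 10.0, 60.0)  # 60.0  (above hi -> clamped down)
```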
deepeval/dataset/dataset.py CHANGED
@@ -951,6 +951,8 @@ class EvaluationDataset:
              context=golden.context,
              name=golden.name,
              comments=golden.comments,
+             additional_metadata=golden.additional_metadata,
+             custom_column_key_values=golden.custom_column_key_values,
          )
          for golden in self.goldens
      ]
@@ -965,6 +967,10 @@
              name=golden.name,
              comments=golden.comments,
              source_file=golden.source_file,
+             tools_called=golden.tools_called,
+             expected_tools=golden.expected_tools,
+             additional_metadata=golden.additional_metadata,
+             custom_column_key_values=golden.custom_column_key_values,
          )
          for golden in self.goldens
      ]
@@ -995,36 +1001,68 @@
          if file_type == "json":
              with open(full_file_path, "w", encoding="utf-8") as file:
                  if self._multi_turn:
-                     json_data = [
-                         {
-                             "scenario": golden.scenario,
-                             "turns": (
-                                 format_turns(golden.turns)
-                                 if golden.turns
-                                 else None
-                             ),
-                             "expected_outcome": golden.expected_outcome,
-                             "user_description": golden.user_description,
-                             "context": golden.context,
-                             "name": golden.name,
-                             "comments": golden.comments,
-                         }
-                         for golden in goldens
-                     ]
+                     json_data = []
+                     for golden in goldens:
+                         # Serialize turns as structured list of dicts
+                         turns_list = (
+                             json.loads(format_turns(golden.turns))
+                             if golden.turns
+                             else None
+                         )
+                         json_data.append(
+                             {
+                                 "scenario": golden.scenario,
+                                 "turns": turns_list,
+                                 "expected_outcome": golden.expected_outcome,
+                                 "user_description": golden.user_description,
+                                 "context": golden.context,
+                                 "name": golden.name,
+                                 "comments": golden.comments,
+                                 "additional_metadata": golden.additional_metadata,
+                                 "custom_column_key_values": golden.custom_column_key_values,
+                             }
+                         )
                  else:
-                     json_data = [
-                         {
-                             "input": golden.input,
-                             "actual_output": golden.actual_output,
-                             "expected_output": golden.expected_output,
-                             "retrieval_context": golden.retrieval_context,
-                             "context": golden.context,
-                             "name": golden.name,
-                             "comments": golden.comments,
-                             "source_file": golden.source_file,
-                         }
-                         for golden in goldens
-                     ]
+                     json_data = []
+                     for golden in goldens:
+                         # Convert ToolCall lists to list[dict]
+                         def _dump_tools(tools):
+                             if not tools:
+                                 return None
+                             dumped = []
+                             for t in tools:
+                                 if hasattr(t, "model_dump"):
+                                     dumped.append(
+                                         t.model_dump(
+                                             by_alias=True, exclude_none=True
+                                         )
+                                     )
+                                 elif hasattr(t, "dict"):
+                                     dumped.append(t.dict(exclude_none=True))
+                                 else:
+                                     dumped.append(t)
+                             return dumped if len(dumped) > 0 else None
+
+                         json_data.append(
+                             {
+                                 "input": golden.input,
+                                 "actual_output": golden.actual_output,
+                                 "expected_output": golden.expected_output,
+                                 "retrieval_context": golden.retrieval_context,
+                                 "context": golden.context,
+                                 "name": golden.name,
+                                 "comments": golden.comments,
+                                 "source_file": golden.source_file,
+                                 "tools_called": _dump_tools(
+                                     golden.tools_called
+                                 ),
+                                 "expected_tools": _dump_tools(
+                                     golden.expected_tools
+                                 ),
+                                 "additional_metadata": golden.additional_metadata,
+                                 "custom_column_key_values": golden.custom_column_key_values,
+                             }
+                         )
                  json.dump(json_data, file, indent=4, ensure_ascii=False)
          elif file_type == "csv":
              with open(
@@ -1041,6 +1079,8 @@
                          "context",
                          "name",
                          "comments",
+                         "additional_metadata",
+                         "custom_column_key_values",
                      ]
                  )
                  for golden in goldens:
@@ -1054,6 +1094,21 @@
                          if golden.turns is not None
                          else None
                      )
+                     additional_metadata = (
+                         json.dumps(
+                             golden.additional_metadata, ensure_ascii=False
+                         )
+                         if golden.additional_metadata is not None
+                         else None
+                     )
+                     custom_cols = (
+                         json.dumps(
+                             golden.custom_column_key_values,
+                             ensure_ascii=False,
+                         )
+                         if golden.custom_column_key_values
+                         else None
+                     )
                      writer.writerow(
                          [
                              golden.scenario,
@@ -1063,6 +1118,8 @@
                              context,
                              golden.name,
                              golden.comments,
+                             additional_metadata,
+                             custom_cols,
                          ]
                      )
              else:
@@ -1076,6 +1133,10 @@
                          "name",
                          "comments",
                          "source_file",
+                         "tools_called",
+                         "expected_tools",
+                         "additional_metadata",
+                         "custom_column_key_values",
                      ]
                  )
                  for golden in goldens:
@@ -1089,6 +1150,42 @@
                          if golden.context is not None
                          else None
                      )
+
+                     # Dump tools as JSON strings for CSV
+                     def _dump_tools_csv(tools):
+                         if not tools:
+                             return None
+                         dumped = []
+                         for t in tools:
+                             if hasattr(t, "model_dump"):
+                                 dumped.append(
+                                     t.model_dump(
+                                         by_alias=True, exclude_none=True
+                                     )
+                                 )
+                             elif hasattr(t, "dict"):
+                                 dumped.append(t.dict(exclude_none=True))
+                             else:
+                                 dumped.append(t)
+                         return json.dumps(dumped, ensure_ascii=False)
+
+                     tools_called = _dump_tools_csv(golden.tools_called)
+                     expected_tools = _dump_tools_csv(golden.expected_tools)
+                     additional_metadata = (
+                         json.dumps(
+                             golden.additional_metadata, ensure_ascii=False
+                         )
+                         if golden.additional_metadata is not None
+                         else None
+                     )
+                     custom_cols = (
+                         json.dumps(
+                             golden.custom_column_key_values,
+                             ensure_ascii=False,
+                         )
+                         if golden.custom_column_key_values
+                         else None
+                     )
                      writer.writerow(
                          [
                              golden.input,
@@ -1099,6 +1196,10 @@
                              golden.name,
                              golden.comments,
                              golden.source_file,
+                             tools_called,
+                             expected_tools,
+                             additional_metadata,
+                             custom_cols,
                          ]
                      )
          elif file_type == "jsonl":
@@ -1106,7 +1207,9 @@
              for golden in goldens:
                  if self._multi_turn:
                      turns = (
-                         format_turns(golden.turns) if golden.turns else None
+                         json.loads(format_turns(golden.turns))
+                         if golden.turns
+                         else None
                      )
                      record = {
                          "scenario": golden.scenario,
@@ -1114,6 +1217,10 @@
                          "expected_outcome": golden.expected_outcome,
                          "user_description": golden.user_description,
                          "context": golden.context,
+                         "name": golden.name,
+                         "comments": golden.comments,
+                         "additional_metadata": golden.additional_metadata,
+                         "custom_column_key_values": golden.custom_column_key_values,
                      }
                  else:
                      retrieval_context = (
@@ -1126,12 +1233,37 @@
                          if golden.context is not None
                          else None
                      )
+
+                     # Convert ToolCall lists to list[dict]
+                     def _dump_tools(tools):
+                         if not tools:
+                             return None
+                         dumped = []
+                         for t in tools:
+                             if hasattr(t, "model_dump"):
+                                 dumped.append(
+                                     t.model_dump(
+                                         by_alias=True, exclude_none=True
+                                     )
+                                 )
+                             elif hasattr(t, "dict"):
+                                 dumped.append(t.dict(exclude_none=True))
+                             else:
+                                 dumped.append(t)
+                         return dumped if len(dumped) > 0 else None
+
                      record = {
                          "input": golden.input,
                          "actual_output": golden.actual_output,
                          "expected_output": golden.expected_output,
                          "retrieval_context": retrieval_context,
                          "context": context,
+                         "tools_called": _dump_tools(golden.tools_called),
+                         "expected_tools": _dump_tools(
+                             golden.expected_tools
+                         ),
+                         "additional_metadata": golden.additional_metadata,
+                         "custom_column_key_values": golden.custom_column_key_values,
                      }

                  file.write(json.dumps(record, ensure_ascii=False) + "\n")
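With these changes, save_as round-trips tool calls and metadata in all three formats. A minimal sketch of the JSON path (directory and golden values are illustrative; field names follow the diff above):

```python
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.test_case import ToolCall

dataset = EvaluationDataset(
    goldens=[
        Golden(
            input="What's the weather in Paris?",
            expected_output="Sunny, 22°C",
            expected_tools=[ToolCall(name="get_weather")],
            additional_metadata={"locale": "fr-FR"},
        )
    ]
)

# ToolCall objects are serialized via model_dump(by_alias=True, exclude_none=True);
# metadata is written as-is in JSON/JSONL and JSON-encoded into CSV columns.
dataset.save_as(file_type="json", directory="./goldens")
```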
deepeval/dataset/utils.py CHANGED
@@ -111,12 +111,36 @@ def trimAndLoadJson(input_string: str) -> Any:
  def format_turns(turns: List[Turn]) -> str:
      res = []
      for turn in turns:
+         # Safely convert nested Pydantic models (ToolCall/MCP calls) to dicts
+         def _dump_list(models):
+             if not models:
+                 return None
+             dumped = []
+             for m in models:
+                 if hasattr(m, "model_dump"):
+                     dumped.append(
+                         m.model_dump(by_alias=True, exclude_none=True)
+                     )
+                 elif hasattr(m, "dict"):
+                     dumped.append(m.dict(exclude_none=True))
+                 else:
+                     dumped.append(m)
+             return dumped if len(dumped) > 0 else None
+
          cur_turn = {
              "role": turn.role,
              "content": turn.content,
+             "user_id": turn.user_id if turn.user_id is not None else None,
              "retrieval_context": (
                  turn.retrieval_context if turn.retrieval_context else None
              ),
+             "tools_called": _dump_list(turn.tools_called),
+             "mcp_tools_called": _dump_list(turn.mcp_tools_called),
+             "mcp_resources_called": _dump_list(turn.mcp_resources_called),
+             "mcp_prompts_called": _dump_list(turn.mcp_prompts_called),
+             "additional_metadata": (
+                 turn.additional_metadata if turn.additional_metadata else None
+             ),
          }
          res.append(cur_turn)
      try:
@@ -125,11 +149,17 @@ def format_turns(turns: List[Turn]) -> str:
          raise ValueError(f"Error serializing turns: {e}")


- def parse_turns(turns_str: str) -> List[Turn]:
-     try:
-         parsed = json.loads(turns_str)
-     except json.JSONDecodeError as e:
-         raise ValueError(f"Invalid JSON: {e}")
+ def parse_turns(turns_str: Any) -> List[Turn]:
+     # Accept either a JSON string or a Python list
+     if isinstance(turns_str, str):
+         try:
+             parsed = json.loads(turns_str)
+         except json.JSONDecodeError as e:
+             raise ValueError(f"Invalid JSON: {e}")
+     elif isinstance(turns_str, list):
+         parsed = turns_str
+     else:
+         raise TypeError("Expected a JSON string or a list of turns.")

      if not isinstance(parsed, list):
          raise TypeError("Expected a list of turns.")
@@ -145,15 +175,13 @@ def parse_turns(turns_str: str) -> List[Turn]:
          if "content" not in turn or not isinstance(turn["content"], str):
              raise ValueError(f"Turn at index {i} is missing a valid 'content'.")

-         retrieval_context = turn.get("retrieval_context")
+         try:
+             # Pydantic v2
+             res.append(Turn.model_validate(turn))
+         except AttributeError:
+             # Pydantic v1 fallback
+             res.append(Turn.parse_obj(turn))

-         res.append(
-             Turn(
-                 role=turn["role"],
-                 content=turn["content"],
-                 retrieval_context=retrieval_context,
-             )
-         )
      return res

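Because parse_turns now validates each turn dict with Pydantic and accepts either a JSON string or an already-parsed list, serialization round-trips cleanly. A small sketch (import paths follow this diff; the turn content is illustrative):

```python
import json

from deepeval.dataset.utils import format_turns, parse_turns
from deepeval.test_case import Turn

turns = [Turn(role="user", content="Book a table for two.")]

serialized = format_turns(turns)                     # JSON string of turn dicts
restored = parse_turns(serialized)                   # from a JSON string...
also_restored = parse_turns(json.loads(serialized))  # ...or from a Python list
```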
deepeval/errors.py CHANGED
@@ -1,6 +1,24 @@
- class MissingTestCaseParamsError(Exception):
+ class DeepEvalError(Exception):
+     """Base class for framework-originated errors.
+     If raised and not handled, it will abort the current operation.
+     We may also stringify instances of this class and attach them to traces or spans to surface
+     non-fatal diagnostics while allowing the run to continue.
+     """
+
+
+ class UserAppError(Exception):
+     """Represents exceptions thrown by user LLM apps/tools.
+     We record these on traces or spans and keep the overall evaluation run alive.
+     """
+
+
+ class MissingTestCaseParamsError(DeepEvalError):
+     """Required test case fields are missing."""
+
      pass


- class MismatchedTestCaseInputsError(Exception):
+ class MismatchedTestCaseInputsError(DeepEvalError):
+     """Inputs provided to a metric or test case are inconsistent or invalid."""
+
      pass
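The split between DeepEvalError and UserAppError gives callers a simple policy hook: record user-app failures and keep the run alive, but let framework errors abort. A minimal sketch (the runner below is illustrative, not deepeval API):

```python
from deepeval.errors import DeepEvalError, UserAppError

def run_case(evaluate_fn, test_case):
    try:
        return evaluate_fn(test_case)
    except UserAppError as exc:
        # Fault inside the user's LLM app/tool: record it, keep the run alive
        print(f"user app failed, continuing: {exc}")
        return None
    except DeepEvalError:
        # Framework-originated: abort the current operation
        raise
```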