deepeval 3.6.8__py3-none-any.whl → 3.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. deepeval/_version.py +1 -1
  2. deepeval/anthropic/__init__.py +19 -0
  3. deepeval/anthropic/extractors.py +94 -0
  4. deepeval/anthropic/patch.py +169 -0
  5. deepeval/anthropic/utils.py +225 -0
  6. deepeval/benchmarks/drop/drop.py +40 -14
  7. deepeval/benchmarks/ifeval/ifeval.py +2 -2
  8. deepeval/confident/types.py +4 -2
  9. deepeval/config/settings.py +258 -47
  10. deepeval/config/settings_manager.py +4 -0
  11. deepeval/config/utils.py +5 -0
  12. deepeval/dataset/dataset.py +162 -30
  13. deepeval/dataset/utils.py +41 -13
  14. deepeval/evaluate/execute.py +1099 -633
  15. deepeval/integrations/crewai/handler.py +36 -0
  16. deepeval/integrations/langchain/callback.py +27 -2
  17. deepeval/integrations/llama_index/handler.py +58 -4
  18. deepeval/integrations/llama_index/utils.py +24 -0
  19. deepeval/metrics/__init__.py +5 -0
  20. deepeval/metrics/exact_match/__init__.py +0 -0
  21. deepeval/metrics/exact_match/exact_match.py +94 -0
  22. deepeval/metrics/indicator.py +21 -1
  23. deepeval/metrics/pattern_match/__init__.py +0 -0
  24. deepeval/metrics/pattern_match/pattern_match.py +103 -0
  25. deepeval/metrics/task_completion/task_completion.py +9 -2
  26. deepeval/model_integrations/__init__.py +0 -0
  27. deepeval/model_integrations/utils.py +116 -0
  28. deepeval/models/base_model.py +3 -1
  29. deepeval/models/llms/amazon_bedrock_model.py +20 -17
  30. deepeval/models/llms/openai_model.py +10 -1
  31. deepeval/models/retry_policy.py +103 -20
  32. deepeval/openai/__init__.py +3 -1
  33. deepeval/openai/extractors.py +2 -2
  34. deepeval/openai/utils.py +7 -31
  35. deepeval/prompt/api.py +11 -10
  36. deepeval/prompt/prompt.py +5 -4
  37. deepeval/simulator/conversation_simulator.py +25 -18
  38. deepeval/synthesizer/chunking/context_generator.py +9 -1
  39. deepeval/telemetry.py +3 -3
  40. deepeval/test_case/llm_test_case.py +3 -2
  41. deepeval/test_run/api.py +3 -2
  42. deepeval/test_run/cache.py +4 -3
  43. deepeval/test_run/test_run.py +24 -5
  44. deepeval/tracing/api.py +11 -10
  45. deepeval/tracing/otel/exporter.py +11 -0
  46. deepeval/tracing/patchers.py +102 -1
  47. deepeval/tracing/trace_context.py +13 -4
  48. deepeval/tracing/tracing.py +10 -1
  49. deepeval/tracing/types.py +8 -8
  50. deepeval/tracing/utils.py +9 -0
  51. deepeval/utils.py +44 -2
  52. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/METADATA +2 -2
  53. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/RECORD +57 -47
  54. /deepeval/{openai → model_integrations}/types.py +0 -0
  55. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/LICENSE.md +0 -0
  56. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/WHEEL +0 -0
  57. {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/entry_points.txt +0 -0
@@ -951,6 +951,8 @@ class EvaluationDataset:
951
951
  context=golden.context,
952
952
  name=golden.name,
953
953
  comments=golden.comments,
954
+ additional_metadata=golden.additional_metadata,
955
+ custom_column_key_values=golden.custom_column_key_values,
954
956
  )
955
957
  for golden in self.goldens
956
958
  ]
@@ -965,6 +967,10 @@ class EvaluationDataset:
965
967
  name=golden.name,
966
968
  comments=golden.comments,
967
969
  source_file=golden.source_file,
970
+ tools_called=golden.tools_called,
971
+ expected_tools=golden.expected_tools,
972
+ additional_metadata=golden.additional_metadata,
973
+ custom_column_key_values=golden.custom_column_key_values,
968
974
  )
969
975
  for golden in self.goldens
970
976
  ]
@@ -995,36 +1001,68 @@ class EvaluationDataset:
995
1001
  if file_type == "json":
996
1002
  with open(full_file_path, "w", encoding="utf-8") as file:
997
1003
  if self._multi_turn:
998
- json_data = [
999
- {
1000
- "scenario": golden.scenario,
1001
- "turns": (
1002
- format_turns(golden.turns)
1003
- if golden.turns
1004
- else None
1005
- ),
1006
- "expected_outcome": golden.expected_outcome,
1007
- "user_description": golden.user_description,
1008
- "context": golden.context,
1009
- "name": golden.name,
1010
- "comments": golden.comments,
1011
- }
1012
- for golden in goldens
1013
- ]
1004
+ json_data = []
1005
+ for golden in goldens:
1006
+ # Serialize turns as structured list of dicts
1007
+ turns_list = (
1008
+ json.loads(format_turns(golden.turns))
1009
+ if golden.turns
1010
+ else None
1011
+ )
1012
+ json_data.append(
1013
+ {
1014
+ "scenario": golden.scenario,
1015
+ "turns": turns_list,
1016
+ "expected_outcome": golden.expected_outcome,
1017
+ "user_description": golden.user_description,
1018
+ "context": golden.context,
1019
+ "name": golden.name,
1020
+ "comments": golden.comments,
1021
+ "additional_metadata": golden.additional_metadata,
1022
+ "custom_column_key_values": golden.custom_column_key_values,
1023
+ }
1024
+ )
1014
1025
  else:
1015
- json_data = [
1016
- {
1017
- "input": golden.input,
1018
- "actual_output": golden.actual_output,
1019
- "expected_output": golden.expected_output,
1020
- "retrieval_context": golden.retrieval_context,
1021
- "context": golden.context,
1022
- "name": golden.name,
1023
- "comments": golden.comments,
1024
- "source_file": golden.source_file,
1025
- }
1026
- for golden in goldens
1027
- ]
1026
+ json_data = []
1027
+ for golden in goldens:
1028
+ # Convert ToolCall lists to list[dict]
1029
+ def _dump_tools(tools):
1030
+ if not tools:
1031
+ return None
1032
+ dumped = []
1033
+ for t in tools:
1034
+ if hasattr(t, "model_dump"):
1035
+ dumped.append(
1036
+ t.model_dump(
1037
+ by_alias=True, exclude_none=True
1038
+ )
1039
+ )
1040
+ elif hasattr(t, "dict"):
1041
+ dumped.append(t.dict(exclude_none=True))
1042
+ else:
1043
+ dumped.append(t)
1044
+ return dumped if len(dumped) > 0 else None
1045
+
1046
+ json_data.append(
1047
+ {
1048
+ "input": golden.input,
1049
+ "actual_output": golden.actual_output,
1050
+ "expected_output": golden.expected_output,
1051
+ "retrieval_context": golden.retrieval_context,
1052
+ "context": golden.context,
1053
+ "name": golden.name,
1054
+ "comments": golden.comments,
1055
+ "source_file": golden.source_file,
1056
+ "tools_called": _dump_tools(
1057
+ golden.tools_called
1058
+ ),
1059
+ "expected_tools": _dump_tools(
1060
+ golden.expected_tools
1061
+ ),
1062
+ "additional_metadata": golden.additional_metadata,
1063
+ "custom_column_key_values": golden.custom_column_key_values,
1064
+ }
1065
+ )
1028
1066
  json.dump(json_data, file, indent=4, ensure_ascii=False)
1029
1067
  elif file_type == "csv":
1030
1068
  with open(
@@ -1041,6 +1079,8 @@ class EvaluationDataset:
1041
1079
  "context",
1042
1080
  "name",
1043
1081
  "comments",
1082
+ "additional_metadata",
1083
+ "custom_column_key_values",
1044
1084
  ]
1045
1085
  )
1046
1086
  for golden in goldens:
@@ -1054,6 +1094,21 @@ class EvaluationDataset:
1054
1094
  if golden.turns is not None
1055
1095
  else None
1056
1096
  )
1097
+ additional_metadata = (
1098
+ json.dumps(
1099
+ golden.additional_metadata, ensure_ascii=False
1100
+ )
1101
+ if golden.additional_metadata is not None
1102
+ else None
1103
+ )
1104
+ custom_cols = (
1105
+ json.dumps(
1106
+ golden.custom_column_key_values,
1107
+ ensure_ascii=False,
1108
+ )
1109
+ if golden.custom_column_key_values
1110
+ else None
1111
+ )
1057
1112
  writer.writerow(
1058
1113
  [
1059
1114
  golden.scenario,
@@ -1063,6 +1118,8 @@ class EvaluationDataset:
1063
1118
  context,
1064
1119
  golden.name,
1065
1120
  golden.comments,
1121
+ additional_metadata,
1122
+ custom_cols,
1066
1123
  ]
1067
1124
  )
1068
1125
  else:
@@ -1076,6 +1133,10 @@ class EvaluationDataset:
1076
1133
  "name",
1077
1134
  "comments",
1078
1135
  "source_file",
1136
+ "tools_called",
1137
+ "expected_tools",
1138
+ "additional_metadata",
1139
+ "custom_column_key_values",
1079
1140
  ]
1080
1141
  )
1081
1142
  for golden in goldens:
@@ -1089,6 +1150,42 @@ class EvaluationDataset:
1089
1150
  if golden.context is not None
1090
1151
  else None
1091
1152
  )
1153
+
1154
+ # Dump tools as JSON strings for CSV
1155
+ def _dump_tools_csv(tools):
1156
+ if not tools:
1157
+ return None
1158
+ dumped = []
1159
+ for t in tools:
1160
+ if hasattr(t, "model_dump"):
1161
+ dumped.append(
1162
+ t.model_dump(
1163
+ by_alias=True, exclude_none=True
1164
+ )
1165
+ )
1166
+ elif hasattr(t, "dict"):
1167
+ dumped.append(t.dict(exclude_none=True))
1168
+ else:
1169
+ dumped.append(t)
1170
+ return json.dumps(dumped, ensure_ascii=False)
1171
+
1172
+ tools_called = _dump_tools_csv(golden.tools_called)
1173
+ expected_tools = _dump_tools_csv(golden.expected_tools)
1174
+ additional_metadata = (
1175
+ json.dumps(
1176
+ golden.additional_metadata, ensure_ascii=False
1177
+ )
1178
+ if golden.additional_metadata is not None
1179
+ else None
1180
+ )
1181
+ custom_cols = (
1182
+ json.dumps(
1183
+ golden.custom_column_key_values,
1184
+ ensure_ascii=False,
1185
+ )
1186
+ if golden.custom_column_key_values
1187
+ else None
1188
+ )
1092
1189
  writer.writerow(
1093
1190
  [
1094
1191
  golden.input,
@@ -1099,6 +1196,10 @@ class EvaluationDataset:
1099
1196
  golden.name,
1100
1197
  golden.comments,
1101
1198
  golden.source_file,
1199
+ tools_called,
1200
+ expected_tools,
1201
+ additional_metadata,
1202
+ custom_cols,
1102
1203
  ]
1103
1204
  )
1104
1205
  elif file_type == "jsonl":
@@ -1106,7 +1207,9 @@ class EvaluationDataset:
1106
1207
  for golden in goldens:
1107
1208
  if self._multi_turn:
1108
1209
  turns = (
1109
- format_turns(golden.turns) if golden.turns else None
1210
+ json.loads(format_turns(golden.turns))
1211
+ if golden.turns
1212
+ else None
1110
1213
  )
1111
1214
  record = {
1112
1215
  "scenario": golden.scenario,
@@ -1114,6 +1217,10 @@ class EvaluationDataset:
1114
1217
  "expected_outcome": golden.expected_outcome,
1115
1218
  "user_description": golden.user_description,
1116
1219
  "context": golden.context,
1220
+ "name": golden.name,
1221
+ "comments": golden.comments,
1222
+ "additional_metadata": golden.additional_metadata,
1223
+ "custom_column_key_values": golden.custom_column_key_values,
1117
1224
  }
1118
1225
  else:
1119
1226
  retrieval_context = (
@@ -1126,12 +1233,37 @@ class EvaluationDataset:
1126
1233
  if golden.context is not None
1127
1234
  else None
1128
1235
  )
1236
+
1237
+ # Convert ToolCall lists to list[dict]
1238
+ def _dump_tools(tools):
1239
+ if not tools:
1240
+ return None
1241
+ dumped = []
1242
+ for t in tools:
1243
+ if hasattr(t, "model_dump"):
1244
+ dumped.append(
1245
+ t.model_dump(
1246
+ by_alias=True, exclude_none=True
1247
+ )
1248
+ )
1249
+ elif hasattr(t, "dict"):
1250
+ dumped.append(t.dict(exclude_none=True))
1251
+ else:
1252
+ dumped.append(t)
1253
+ return dumped if len(dumped) > 0 else None
1254
+
1129
1255
  record = {
1130
1256
  "input": golden.input,
1131
1257
  "actual_output": golden.actual_output,
1132
1258
  "expected_output": golden.expected_output,
1133
1259
  "retrieval_context": retrieval_context,
1134
1260
  "context": context,
1261
+ "tools_called": _dump_tools(golden.tools_called),
1262
+ "expected_tools": _dump_tools(
1263
+ golden.expected_tools
1264
+ ),
1265
+ "additional_metadata": golden.additional_metadata,
1266
+ "custom_column_key_values": golden.custom_column_key_values,
1135
1267
  }
1136
1268
 
1137
1269
  file.write(json.dumps(record, ensure_ascii=False) + "\n")
deepeval/dataset/utils.py CHANGED
@@ -111,12 +111,36 @@ def trimAndLoadJson(input_string: str) -> Any:
111
111
  def format_turns(turns: List[Turn]) -> str:
112
112
  res = []
113
113
  for turn in turns:
114
+ # Safely convert nested Pydantic models (ToolCall/MCP calls) to dicts
115
+ def _dump_list(models):
116
+ if not models:
117
+ return None
118
+ dumped = []
119
+ for m in models:
120
+ if hasattr(m, "model_dump"):
121
+ dumped.append(
122
+ m.model_dump(by_alias=True, exclude_none=True)
123
+ )
124
+ elif hasattr(m, "dict"):
125
+ dumped.append(m.dict(exclude_none=True))
126
+ else:
127
+ dumped.append(m)
128
+ return dumped if len(dumped) > 0 else None
129
+
114
130
  cur_turn = {
115
131
  "role": turn.role,
116
132
  "content": turn.content,
133
+ "user_id": turn.user_id if turn.user_id is not None else None,
117
134
  "retrieval_context": (
118
135
  turn.retrieval_context if turn.retrieval_context else None
119
136
  ),
137
+ "tools_called": _dump_list(turn.tools_called),
138
+ "mcp_tools_called": _dump_list(turn.mcp_tools_called),
139
+ "mcp_resources_called": _dump_list(turn.mcp_resources_called),
140
+ "mcp_prompts_called": _dump_list(turn.mcp_prompts_called),
141
+ "additional_metadata": (
142
+ turn.additional_metadata if turn.additional_metadata else None
143
+ ),
120
144
  }
121
145
  res.append(cur_turn)
122
146
  try:
@@ -125,11 +149,17 @@ def format_turns(turns: List[Turn]) -> str:
125
149
  raise ValueError(f"Error serializing turns: {e}")
126
150
 
127
151
 
128
- def parse_turns(turns_str: str) -> List[Turn]:
129
- try:
130
- parsed = json.loads(turns_str)
131
- except json.JSONDecodeError as e:
132
- raise ValueError(f"Invalid JSON: {e}")
152
+ def parse_turns(turns_str: Any) -> List[Turn]:
153
+ # Accept either a JSON string or a Python list
154
+ if isinstance(turns_str, str):
155
+ try:
156
+ parsed = json.loads(turns_str)
157
+ except json.JSONDecodeError as e:
158
+ raise ValueError(f"Invalid JSON: {e}")
159
+ elif isinstance(turns_str, list):
160
+ parsed = turns_str
161
+ else:
162
+ raise TypeError("Expected a JSON string or a list of turns.")
133
163
 
134
164
  if not isinstance(parsed, list):
135
165
  raise TypeError("Expected a list of turns.")
@@ -145,15 +175,13 @@ def parse_turns(turns_str: str) -> List[Turn]:
145
175
  if "content" not in turn or not isinstance(turn["content"], str):
146
176
  raise ValueError(f"Turn at index {i} is missing a valid 'content'.")
147
177
 
148
- retrieval_context = turn.get("retrieval_context")
178
+ try:
179
+ # Pydantic v2
180
+ res.append(Turn.model_validate(turn))
181
+ except AttributeError:
182
+ # Pydantic v1 fallback
183
+ res.append(Turn.parse_obj(turn))
149
184
 
150
- res.append(
151
- Turn(
152
- role=turn["role"],
153
- content=turn["content"],
154
- retrieval_context=retrieval_context,
155
- )
156
- )
157
185
  return res
158
186
 
159
187