deepeval 3.6.8__py3-none-any.whl → 3.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/anthropic/__init__.py +19 -0
- deepeval/anthropic/extractors.py +94 -0
- deepeval/anthropic/patch.py +169 -0
- deepeval/anthropic/utils.py +225 -0
- deepeval/benchmarks/drop/drop.py +40 -14
- deepeval/benchmarks/ifeval/ifeval.py +2 -2
- deepeval/confident/types.py +4 -2
- deepeval/config/settings.py +258 -47
- deepeval/config/settings_manager.py +4 -0
- deepeval/config/utils.py +5 -0
- deepeval/dataset/dataset.py +162 -30
- deepeval/dataset/utils.py +41 -13
- deepeval/evaluate/execute.py +1099 -633
- deepeval/integrations/crewai/handler.py +36 -0
- deepeval/integrations/langchain/callback.py +27 -2
- deepeval/integrations/llama_index/handler.py +58 -4
- deepeval/integrations/llama_index/utils.py +24 -0
- deepeval/metrics/__init__.py +5 -0
- deepeval/metrics/exact_match/__init__.py +0 -0
- deepeval/metrics/exact_match/exact_match.py +94 -0
- deepeval/metrics/indicator.py +21 -1
- deepeval/metrics/pattern_match/__init__.py +0 -0
- deepeval/metrics/pattern_match/pattern_match.py +103 -0
- deepeval/metrics/task_completion/task_completion.py +9 -2
- deepeval/model_integrations/__init__.py +0 -0
- deepeval/model_integrations/utils.py +116 -0
- deepeval/models/base_model.py +3 -1
- deepeval/models/llms/amazon_bedrock_model.py +20 -17
- deepeval/models/llms/openai_model.py +10 -1
- deepeval/models/retry_policy.py +103 -20
- deepeval/openai/__init__.py +3 -1
- deepeval/openai/extractors.py +2 -2
- deepeval/openai/utils.py +7 -31
- deepeval/prompt/api.py +11 -10
- deepeval/prompt/prompt.py +5 -4
- deepeval/simulator/conversation_simulator.py +25 -18
- deepeval/synthesizer/chunking/context_generator.py +9 -1
- deepeval/telemetry.py +3 -3
- deepeval/test_case/llm_test_case.py +3 -2
- deepeval/test_run/api.py +3 -2
- deepeval/test_run/cache.py +4 -3
- deepeval/test_run/test_run.py +24 -5
- deepeval/tracing/api.py +11 -10
- deepeval/tracing/otel/exporter.py +11 -0
- deepeval/tracing/patchers.py +102 -1
- deepeval/tracing/trace_context.py +13 -4
- deepeval/tracing/tracing.py +10 -1
- deepeval/tracing/types.py +8 -8
- deepeval/tracing/utils.py +9 -0
- deepeval/utils.py +44 -2
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/METADATA +2 -2
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/RECORD +57 -47
- deepeval/{openai → model_integrations}/types.py +0 -0
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/WHEEL +0 -0
- {deepeval-3.6.8.dist-info → deepeval-3.7.0.dist-info}/entry_points.txt +0 -0
deepeval/dataset/dataset.py
CHANGED
|
@@ -951,6 +951,8 @@ class EvaluationDataset:
|
|
|
951
951
|
context=golden.context,
|
|
952
952
|
name=golden.name,
|
|
953
953
|
comments=golden.comments,
|
|
954
|
+
additional_metadata=golden.additional_metadata,
|
|
955
|
+
custom_column_key_values=golden.custom_column_key_values,
|
|
954
956
|
)
|
|
955
957
|
for golden in self.goldens
|
|
956
958
|
]
|
|
@@ -965,6 +967,10 @@ class EvaluationDataset:
|
|
|
965
967
|
name=golden.name,
|
|
966
968
|
comments=golden.comments,
|
|
967
969
|
source_file=golden.source_file,
|
|
970
|
+
tools_called=golden.tools_called,
|
|
971
|
+
expected_tools=golden.expected_tools,
|
|
972
|
+
additional_metadata=golden.additional_metadata,
|
|
973
|
+
custom_column_key_values=golden.custom_column_key_values,
|
|
968
974
|
)
|
|
969
975
|
for golden in self.goldens
|
|
970
976
|
]
|
|
@@ -995,36 +1001,68 @@ class EvaluationDataset:
|
|
|
995
1001
|
if file_type == "json":
|
|
996
1002
|
with open(full_file_path, "w", encoding="utf-8") as file:
|
|
997
1003
|
if self._multi_turn:
|
|
998
|
-
json_data = [
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1004
|
+
json_data = []
|
|
1005
|
+
for golden in goldens:
|
|
1006
|
+
# Serialize turns as structured list of dicts
|
|
1007
|
+
turns_list = (
|
|
1008
|
+
json.loads(format_turns(golden.turns))
|
|
1009
|
+
if golden.turns
|
|
1010
|
+
else None
|
|
1011
|
+
)
|
|
1012
|
+
json_data.append(
|
|
1013
|
+
{
|
|
1014
|
+
"scenario": golden.scenario,
|
|
1015
|
+
"turns": turns_list,
|
|
1016
|
+
"expected_outcome": golden.expected_outcome,
|
|
1017
|
+
"user_description": golden.user_description,
|
|
1018
|
+
"context": golden.context,
|
|
1019
|
+
"name": golden.name,
|
|
1020
|
+
"comments": golden.comments,
|
|
1021
|
+
"additional_metadata": golden.additional_metadata,
|
|
1022
|
+
"custom_column_key_values": golden.custom_column_key_values,
|
|
1023
|
+
}
|
|
1024
|
+
)
|
|
1014
1025
|
else:
|
|
1015
|
-
json_data = [
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1026
|
+
json_data = []
|
|
1027
|
+
for golden in goldens:
|
|
1028
|
+
# Convert ToolCall lists to list[dict]
|
|
1029
|
+
def _dump_tools(tools):
|
|
1030
|
+
if not tools:
|
|
1031
|
+
return None
|
|
1032
|
+
dumped = []
|
|
1033
|
+
for t in tools:
|
|
1034
|
+
if hasattr(t, "model_dump"):
|
|
1035
|
+
dumped.append(
|
|
1036
|
+
t.model_dump(
|
|
1037
|
+
by_alias=True, exclude_none=True
|
|
1038
|
+
)
|
|
1039
|
+
)
|
|
1040
|
+
elif hasattr(t, "dict"):
|
|
1041
|
+
dumped.append(t.dict(exclude_none=True))
|
|
1042
|
+
else:
|
|
1043
|
+
dumped.append(t)
|
|
1044
|
+
return dumped if len(dumped) > 0 else None
|
|
1045
|
+
|
|
1046
|
+
json_data.append(
|
|
1047
|
+
{
|
|
1048
|
+
"input": golden.input,
|
|
1049
|
+
"actual_output": golden.actual_output,
|
|
1050
|
+
"expected_output": golden.expected_output,
|
|
1051
|
+
"retrieval_context": golden.retrieval_context,
|
|
1052
|
+
"context": golden.context,
|
|
1053
|
+
"name": golden.name,
|
|
1054
|
+
"comments": golden.comments,
|
|
1055
|
+
"source_file": golden.source_file,
|
|
1056
|
+
"tools_called": _dump_tools(
|
|
1057
|
+
golden.tools_called
|
|
1058
|
+
),
|
|
1059
|
+
"expected_tools": _dump_tools(
|
|
1060
|
+
golden.expected_tools
|
|
1061
|
+
),
|
|
1062
|
+
"additional_metadata": golden.additional_metadata,
|
|
1063
|
+
"custom_column_key_values": golden.custom_column_key_values,
|
|
1064
|
+
}
|
|
1065
|
+
)
|
|
1028
1066
|
json.dump(json_data, file, indent=4, ensure_ascii=False)
|
|
1029
1067
|
elif file_type == "csv":
|
|
1030
1068
|
with open(
|
|
@@ -1041,6 +1079,8 @@ class EvaluationDataset:
|
|
|
1041
1079
|
"context",
|
|
1042
1080
|
"name",
|
|
1043
1081
|
"comments",
|
|
1082
|
+
"additional_metadata",
|
|
1083
|
+
"custom_column_key_values",
|
|
1044
1084
|
]
|
|
1045
1085
|
)
|
|
1046
1086
|
for golden in goldens:
|
|
@@ -1054,6 +1094,21 @@ class EvaluationDataset:
|
|
|
1054
1094
|
if golden.turns is not None
|
|
1055
1095
|
else None
|
|
1056
1096
|
)
|
|
1097
|
+
additional_metadata = (
|
|
1098
|
+
json.dumps(
|
|
1099
|
+
golden.additional_metadata, ensure_ascii=False
|
|
1100
|
+
)
|
|
1101
|
+
if golden.additional_metadata is not None
|
|
1102
|
+
else None
|
|
1103
|
+
)
|
|
1104
|
+
custom_cols = (
|
|
1105
|
+
json.dumps(
|
|
1106
|
+
golden.custom_column_key_values,
|
|
1107
|
+
ensure_ascii=False,
|
|
1108
|
+
)
|
|
1109
|
+
if golden.custom_column_key_values
|
|
1110
|
+
else None
|
|
1111
|
+
)
|
|
1057
1112
|
writer.writerow(
|
|
1058
1113
|
[
|
|
1059
1114
|
golden.scenario,
|
|
@@ -1063,6 +1118,8 @@ class EvaluationDataset:
|
|
|
1063
1118
|
context,
|
|
1064
1119
|
golden.name,
|
|
1065
1120
|
golden.comments,
|
|
1121
|
+
additional_metadata,
|
|
1122
|
+
custom_cols,
|
|
1066
1123
|
]
|
|
1067
1124
|
)
|
|
1068
1125
|
else:
|
|
@@ -1076,6 +1133,10 @@ class EvaluationDataset:
|
|
|
1076
1133
|
"name",
|
|
1077
1134
|
"comments",
|
|
1078
1135
|
"source_file",
|
|
1136
|
+
"tools_called",
|
|
1137
|
+
"expected_tools",
|
|
1138
|
+
"additional_metadata",
|
|
1139
|
+
"custom_column_key_values",
|
|
1079
1140
|
]
|
|
1080
1141
|
)
|
|
1081
1142
|
for golden in goldens:
|
|
@@ -1089,6 +1150,42 @@ class EvaluationDataset:
|
|
|
1089
1150
|
if golden.context is not None
|
|
1090
1151
|
else None
|
|
1091
1152
|
)
|
|
1153
|
+
|
|
1154
|
+
# Dump tools as JSON strings for CSV
|
|
1155
|
+
def _dump_tools_csv(tools):
|
|
1156
|
+
if not tools:
|
|
1157
|
+
return None
|
|
1158
|
+
dumped = []
|
|
1159
|
+
for t in tools:
|
|
1160
|
+
if hasattr(t, "model_dump"):
|
|
1161
|
+
dumped.append(
|
|
1162
|
+
t.model_dump(
|
|
1163
|
+
by_alias=True, exclude_none=True
|
|
1164
|
+
)
|
|
1165
|
+
)
|
|
1166
|
+
elif hasattr(t, "dict"):
|
|
1167
|
+
dumped.append(t.dict(exclude_none=True))
|
|
1168
|
+
else:
|
|
1169
|
+
dumped.append(t)
|
|
1170
|
+
return json.dumps(dumped, ensure_ascii=False)
|
|
1171
|
+
|
|
1172
|
+
tools_called = _dump_tools_csv(golden.tools_called)
|
|
1173
|
+
expected_tools = _dump_tools_csv(golden.expected_tools)
|
|
1174
|
+
additional_metadata = (
|
|
1175
|
+
json.dumps(
|
|
1176
|
+
golden.additional_metadata, ensure_ascii=False
|
|
1177
|
+
)
|
|
1178
|
+
if golden.additional_metadata is not None
|
|
1179
|
+
else None
|
|
1180
|
+
)
|
|
1181
|
+
custom_cols = (
|
|
1182
|
+
json.dumps(
|
|
1183
|
+
golden.custom_column_key_values,
|
|
1184
|
+
ensure_ascii=False,
|
|
1185
|
+
)
|
|
1186
|
+
if golden.custom_column_key_values
|
|
1187
|
+
else None
|
|
1188
|
+
)
|
|
1092
1189
|
writer.writerow(
|
|
1093
1190
|
[
|
|
1094
1191
|
golden.input,
|
|
@@ -1099,6 +1196,10 @@ class EvaluationDataset:
|
|
|
1099
1196
|
golden.name,
|
|
1100
1197
|
golden.comments,
|
|
1101
1198
|
golden.source_file,
|
|
1199
|
+
tools_called,
|
|
1200
|
+
expected_tools,
|
|
1201
|
+
additional_metadata,
|
|
1202
|
+
custom_cols,
|
|
1102
1203
|
]
|
|
1103
1204
|
)
|
|
1104
1205
|
elif file_type == "jsonl":
|
|
@@ -1106,7 +1207,9 @@ class EvaluationDataset:
|
|
|
1106
1207
|
for golden in goldens:
|
|
1107
1208
|
if self._multi_turn:
|
|
1108
1209
|
turns = (
|
|
1109
|
-
format_turns(golden.turns)
|
|
1210
|
+
json.loads(format_turns(golden.turns))
|
|
1211
|
+
if golden.turns
|
|
1212
|
+
else None
|
|
1110
1213
|
)
|
|
1111
1214
|
record = {
|
|
1112
1215
|
"scenario": golden.scenario,
|
|
@@ -1114,6 +1217,10 @@ class EvaluationDataset:
|
|
|
1114
1217
|
"expected_outcome": golden.expected_outcome,
|
|
1115
1218
|
"user_description": golden.user_description,
|
|
1116
1219
|
"context": golden.context,
|
|
1220
|
+
"name": golden.name,
|
|
1221
|
+
"comments": golden.comments,
|
|
1222
|
+
"additional_metadata": golden.additional_metadata,
|
|
1223
|
+
"custom_column_key_values": golden.custom_column_key_values,
|
|
1117
1224
|
}
|
|
1118
1225
|
else:
|
|
1119
1226
|
retrieval_context = (
|
|
@@ -1126,12 +1233,37 @@ class EvaluationDataset:
|
|
|
1126
1233
|
if golden.context is not None
|
|
1127
1234
|
else None
|
|
1128
1235
|
)
|
|
1236
|
+
|
|
1237
|
+
# Convert ToolCall lists to list[dict]
|
|
1238
|
+
def _dump_tools(tools):
|
|
1239
|
+
if not tools:
|
|
1240
|
+
return None
|
|
1241
|
+
dumped = []
|
|
1242
|
+
for t in tools:
|
|
1243
|
+
if hasattr(t, "model_dump"):
|
|
1244
|
+
dumped.append(
|
|
1245
|
+
t.model_dump(
|
|
1246
|
+
by_alias=True, exclude_none=True
|
|
1247
|
+
)
|
|
1248
|
+
)
|
|
1249
|
+
elif hasattr(t, "dict"):
|
|
1250
|
+
dumped.append(t.dict(exclude_none=True))
|
|
1251
|
+
else:
|
|
1252
|
+
dumped.append(t)
|
|
1253
|
+
return dumped if len(dumped) > 0 else None
|
|
1254
|
+
|
|
1129
1255
|
record = {
|
|
1130
1256
|
"input": golden.input,
|
|
1131
1257
|
"actual_output": golden.actual_output,
|
|
1132
1258
|
"expected_output": golden.expected_output,
|
|
1133
1259
|
"retrieval_context": retrieval_context,
|
|
1134
1260
|
"context": context,
|
|
1261
|
+
"tools_called": _dump_tools(golden.tools_called),
|
|
1262
|
+
"expected_tools": _dump_tools(
|
|
1263
|
+
golden.expected_tools
|
|
1264
|
+
),
|
|
1265
|
+
"additional_metadata": golden.additional_metadata,
|
|
1266
|
+
"custom_column_key_values": golden.custom_column_key_values,
|
|
1135
1267
|
}
|
|
1136
1268
|
|
|
1137
1269
|
file.write(json.dumps(record, ensure_ascii=False) + "\n")
|
deepeval/dataset/utils.py
CHANGED
|
@@ -111,12 +111,36 @@ def trimAndLoadJson(input_string: str) -> Any:
|
|
|
111
111
|
def format_turns(turns: List[Turn]) -> str:
|
|
112
112
|
res = []
|
|
113
113
|
for turn in turns:
|
|
114
|
+
# Safely convert nested Pydantic models (ToolCall/MCP calls) to dicts
|
|
115
|
+
def _dump_list(models):
|
|
116
|
+
if not models:
|
|
117
|
+
return None
|
|
118
|
+
dumped = []
|
|
119
|
+
for m in models:
|
|
120
|
+
if hasattr(m, "model_dump"):
|
|
121
|
+
dumped.append(
|
|
122
|
+
m.model_dump(by_alias=True, exclude_none=True)
|
|
123
|
+
)
|
|
124
|
+
elif hasattr(m, "dict"):
|
|
125
|
+
dumped.append(m.dict(exclude_none=True))
|
|
126
|
+
else:
|
|
127
|
+
dumped.append(m)
|
|
128
|
+
return dumped if len(dumped) > 0 else None
|
|
129
|
+
|
|
114
130
|
cur_turn = {
|
|
115
131
|
"role": turn.role,
|
|
116
132
|
"content": turn.content,
|
|
133
|
+
"user_id": turn.user_id if turn.user_id is not None else None,
|
|
117
134
|
"retrieval_context": (
|
|
118
135
|
turn.retrieval_context if turn.retrieval_context else None
|
|
119
136
|
),
|
|
137
|
+
"tools_called": _dump_list(turn.tools_called),
|
|
138
|
+
"mcp_tools_called": _dump_list(turn.mcp_tools_called),
|
|
139
|
+
"mcp_resources_called": _dump_list(turn.mcp_resources_called),
|
|
140
|
+
"mcp_prompts_called": _dump_list(turn.mcp_prompts_called),
|
|
141
|
+
"additional_metadata": (
|
|
142
|
+
turn.additional_metadata if turn.additional_metadata else None
|
|
143
|
+
),
|
|
120
144
|
}
|
|
121
145
|
res.append(cur_turn)
|
|
122
146
|
try:
|
|
@@ -125,11 +149,17 @@ def format_turns(turns: List[Turn]) -> str:
|
|
|
125
149
|
raise ValueError(f"Error serializing turns: {e}")
|
|
126
150
|
|
|
127
151
|
|
|
128
|
-
def parse_turns(turns_str: str) -> List[Turn]:
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
152
|
+
def parse_turns(turns_str: Any) -> List[Turn]:
|
|
153
|
+
# Accept either a JSON string or a Python list
|
|
154
|
+
if isinstance(turns_str, str):
|
|
155
|
+
try:
|
|
156
|
+
parsed = json.loads(turns_str)
|
|
157
|
+
except json.JSONDecodeError as e:
|
|
158
|
+
raise ValueError(f"Invalid JSON: {e}")
|
|
159
|
+
elif isinstance(turns_str, list):
|
|
160
|
+
parsed = turns_str
|
|
161
|
+
else:
|
|
162
|
+
raise TypeError("Expected a JSON string or a list of turns.")
|
|
133
163
|
|
|
134
164
|
if not isinstance(parsed, list):
|
|
135
165
|
raise TypeError("Expected a list of turns.")
|
|
@@ -145,15 +175,13 @@ def parse_turns(turns_str: str) -> List[Turn]:
|
|
|
145
175
|
if "content" not in turn or not isinstance(turn["content"], str):
|
|
146
176
|
raise ValueError(f"Turn at index {i} is missing a valid 'content'.")
|
|
147
177
|
|
|
148
|
-
|
|
178
|
+
try:
|
|
179
|
+
# Pydantic v2
|
|
180
|
+
res.append(Turn.model_validate(turn))
|
|
181
|
+
except AttributeError:
|
|
182
|
+
# Pydantic v1 fallback
|
|
183
|
+
res.append(Turn.parse_obj(turn))
|
|
149
184
|
|
|
150
|
-
res.append(
|
|
151
|
-
Turn(
|
|
152
|
-
role=turn["role"],
|
|
153
|
-
content=turn["content"],
|
|
154
|
-
retrieval_context=retrieval_context,
|
|
155
|
-
)
|
|
156
|
-
)
|
|
157
185
|
return res
|
|
158
186
|
|
|
159
187
|
|