eval-ai-library 0.2.2__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of eval-ai-library might be problematic.
- eval_ai_library-0.3.1.dist-info/METADATA +1042 -0
- eval_ai_library-0.3.1.dist-info/RECORD +34 -0
- eval_lib/__init__.py +19 -6
- eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py +9 -3
- eval_lib/agent_metrics/role_adherence_metric/role_adherence.py +13 -4
- eval_lib/agent_metrics/task_success_metric/task_success_rate.py +24 -23
- eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py +8 -2
- eval_lib/datagenerator/datagenerator.py +208 -12
- eval_lib/datagenerator/document_loader.py +29 -29
- eval_lib/evaluate.py +0 -22
- eval_lib/llm_client.py +221 -78
- eval_lib/metric_pattern.py +208 -152
- eval_lib/metrics/answer_precision_metric/answer_precision.py +8 -3
- eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py +8 -2
- eval_lib/metrics/bias_metric/bias.py +12 -2
- eval_lib/metrics/contextual_precision_metric/contextual_precision.py +9 -4
- eval_lib/metrics/contextual_recall_metric/contextual_recall.py +7 -3
- eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py +9 -2
- eval_lib/metrics/custom_metric/custom_eval.py +238 -204
- eval_lib/metrics/faithfulness_metric/faithfulness.py +7 -2
- eval_lib/metrics/geval/geval.py +8 -2
- eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py +7 -3
- eval_lib/metrics/toxicity_metric/toxicity.py +8 -2
- eval_lib/utils.py +44 -29
- eval_ai_library-0.2.2.dist-info/METADATA +0 -779
- eval_ai_library-0.2.2.dist-info/RECORD +0 -34
- {eval_ai_library-0.2.2.dist-info → eval_ai_library-0.3.1.dist-info}/WHEEL +0 -0
- {eval_ai_library-0.2.2.dist-info → eval_ai_library-0.3.1.dist-info}/licenses/LICENSE +0 -0
- {eval_ai_library-0.2.2.dist-info → eval_ai_library-0.3.1.dist-info}/top_level.txt +0 -0
eval_ai_library-0.3.1.dist-info/RECORD
@@ -0,0 +1,34 @@
+eval_ai_library-0.3.1.dist-info/licenses/LICENSE,sha256=rK9uLDgWNrCHNdp-Zma_XghDE7Fs0u0kDi3WMcmYx6w,1074
+eval_lib/__init__.py,sha256=Jayvtz47_-0POIspT_LJKZ6jmWyf0fQc9fqQ5KvdPRI,3029
+eval_lib/evaluate.py,sha256=GjlXZb5dnl44LCaJwdkyGCYcC50zoNZn3NrofzNAVJ0,11490
+eval_lib/evaluation_schema.py,sha256=7IDd_uozqewhh7k0p1hKut_20udvRxxkV6thclxKUg0,1904
+eval_lib/llm_client.py,sha256=3eMcarKLkDLDVh4AOxgWbaIzXlzpqsmEfJXNTBonNic,13633
+eval_lib/metric_pattern.py,sha256=wULgMNDeAqJC_Qjglo7bYzY2eGhA_PmY_hA_qGfg0sI,11730
+eval_lib/price.py,sha256=jbmkkUTxPuXrkSHuaJYPl7jSzfDIzQ9p_swWWs26UJ0,1986
+eval_lib/py.typed,sha256=8PjyZ1aVoQpRVvt71muvuq5qE-jTFZkK-GLHkhdebmc,26
+eval_lib/testcases_schema.py,sha256=qI4o6kX0jH1DR3sHGXUnu3Cyt2oq7rGlsMlOaXSt6F4,696
+eval_lib/utils.py,sha256=-hwagFFn3_QjgyLqF8Qx7JIkpgOEI8-F14eycog3bgc,3141
+eval_lib/agent_metrics/__init__.py,sha256=20Y4BsicD2s7OkOBQPBvB2JKStBDtplv52_q6q35Vgo,525
+eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py,sha256=7j89HOTsu0rMoFnznTjMl-tqQpnZlS6ZIdrHPueEbb8,8289
+eval_lib/agent_metrics/role_adherence_metric/role_adherence.py,sha256=kJsYj9H3W3Mw2iBqj3Br_glP8gU6_diFPiJhRSnHGxg,9225
+eval_lib/agent_metrics/task_success_metric/task_success_rate.py,sha256=v5cO07cymo9GWSZ34ryAx3ya4DDBiRWih9w0bm_j_R8,12497
+eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py,sha256=qbVMtD6EWKah27FogLEiEh6pBX-k2wwKbwM_kFkvYeQ,4220
+eval_lib/datagenerator/datagenerator.py,sha256=NQZIQuSCmryxIT3lTLS1PpJjENmGqARtR-zTiQ8OvRk,15513
+eval_lib/datagenerator/document_loader.py,sha256=vnQUz_Dxb3SxcVPUmMXZe-rgfPp6OfHb6D2Ie9iqPms,17025
+eval_lib/datagenerator/prompts.py,sha256=iQAYitAbLud3vWJnXGh_OCF4syWYS_S71zZEPI6qYAU,7213
+eval_lib/metrics/__init__.py,sha256=3qClCCjPXt5i0c38g5krfuQnqlAXEl-jhAHy1C_ICMY,1213
+eval_lib/metrics/answer_precision_metric/answer_precision.py,sha256=AxPmwzGFU7tnTrrZuQZ7ow4nNSD-blDHdAGwhMHMxjM,15040
+eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py,sha256=-Xb9I-BVMDf5E55FbJzP6IyvD6IVTUPBI-uCrRnEboc,8522
+eval_lib/metrics/bias_metric/bias.py,sha256=BVH8xlTUTRfVG_F1kauwpGAkVKBkUWhM9rUsrrLhpRU,4020
+eval_lib/metrics/contextual_precision_metric/contextual_precision.py,sha256=CQOb6uR2KeffTkhPSqZae56sX5tXMr0pJVM5W_wU1fU,3993
+eval_lib/metrics/contextual_recall_metric/contextual_recall.py,sha256=iw73_hGLWklHZSBkCRkPDNUt1xD5dknA_7CZ6Efkf5w,3913
+eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py,sha256=G1cYlA95YNcpEqQsALVi6ZbyNzWr9ccO2DATcsES5pk,6546
+eval_lib/metrics/custom_metric/custom_eval.py,sha256=Ov3-i6IytaJXlzcMgp46SRSeb8scyhqyuR2BqUtDFoM,11385
+eval_lib/metrics/faithfulness_metric/faithfulness.py,sha256=OqamlhTOps7d-NOStSIK7Tq-UAJXNql8VKjgtuqlDhA,5930
+eval_lib/metrics/geval/geval.py,sha256=mNciHXnqU2drOJsWlYmbwftGiKM89-Ykw2f6XneIGBM,10629
+eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py,sha256=4QqYgGMcp6W9Lw-v4s0AlUhMSOKvBOEgnLvhqVXaT9I,4286
+eval_lib/metrics/toxicity_metric/toxicity.py,sha256=rBE1_fvpbCRdBpBep1y1LTIhofKR8GD4Eh76EOYzxL0,4076
+eval_ai_library-0.3.1.dist-info/METADATA,sha256=UytyyuWVrL3CuvK7hQC_y-AqoabHEPI0euolxhmfZrQ,37706
+eval_ai_library-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+eval_ai_library-0.3.1.dist-info/top_level.txt,sha256=uQHpEd2XI0oZgq1eCww9zMvVgDJgwXMWkCD45fYUzEg,9
+eval_ai_library-0.3.1.dist-info/RECORD,,
eval_lib/__init__.py
CHANGED
@@ -1,3 +1,5 @@
+# eval_lib/__init__.py
+
 """
 Eval AI Library - Comprehensive AI Model Evaluation Framework

@@ -5,12 +7,12 @@ A powerful library for evaluating AI models with support for multiple LLM provid
 and a wide range of evaluation metrics for RAG systems and AI agents.
 """

-__version__ = "0.1
-__author__ = "Aleksandr
+__version__ = "0.3.1"
+__author__ = "Aleksandr Meshkov"

 # Core evaluation functions
 from eval_lib.evaluate import evaluate, evaluate_conversations
-from eval_lib.utils import score_agg
+from eval_lib.utils import score_agg, extract_json_block

 # Test case schemas
 from eval_lib.testcases_schema import (
@@ -63,9 +65,20 @@ from eval_lib.agent_metrics import (
     KnowledgeRetentionMetric
 )

-
-
-
+
+def __getattr__(name):
+    """
+    Lazy import for modules with heavy dependencies.
+    DataGenerator is imported only when it is actually used.
+    """
+    if name == "DataGenerator":
+        from eval_lib.datagenerator.datagenerator import DataGenerator
+        return DataGenerator
+    if name == "DocumentLoader":
+        from eval_lib.datagenerator.document_loader import DocumentLoader
+        return DocumentLoader
+    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
+

 __all__ = [
     # Version
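The new module-level __getattr__ is the PEP 562 lazy-import pattern: importing eval_lib no longer pulls in the data-generation stack, which only loads on first attribute access. A minimal sketch of the observable behaviour, assuming eval-ai-library 0.3.1 is installed and that nothing else in eval_lib imports the datagenerator subpackage eagerly:

import sys

import eval_lib  # cheap: the datagenerator subpackage is not imported yet

print("eval_lib.datagenerator.datagenerator" in sys.modules)  # expected: False

DataGenerator = eval_lib.DataGenerator  # first access triggers eval_lib.__getattr__
print("eval_lib.datagenerator.datagenerator" in sys.modules)  # expected: True

try:
    eval_lib.SomethingMissing  # unknown names still fail loudly
except AttributeError as err:
    print(err)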
eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py
CHANGED
@@ -36,6 +36,7 @@ class KnowledgeRetentionMetric(ConversationalMetricPattern):
         model: str,
         threshold: float = 0.7,
         temperature: float = 0.5,
+        verbose: bool = False
     ):
         """
         Initialize Knowledge Retention metric.
@@ -45,7 +46,7 @@ class KnowledgeRetentionMetric(ConversationalMetricPattern):
             threshold: Success threshold (0.0-1.0)
             temperature: Score aggregation temperature for softmax
         """
-        super().__init__(model=model, threshold=threshold)
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
         self.temperature = temperature

     # ==================== HELPER METHODS ====================
@@ -214,18 +215,23 @@ Verdicts:
             "verdict_weights": {v["verdict"]: VERDICT_WEIGHTS.get(v["verdict"], 0.0) for v in verdicts},
             "comment_verdict_weights": "Numeric weights assigned to each verdict for score calculation.",
             "final_score": final_score,
-            "comment_final_score": f"
+            "comment_final_score": f"Weighted average of verdict scores using softmax aggregation (temperature={self.temperature}).",
             "threshold": self.threshold,
+            "temperature": self.temperature,
             "success": success,
             "comment_success": "Whether the retention score meets the required threshold.",
             "final_reason": summary,
             "comment_reasoning": "Concise explanation of the assistant's knowledge retention performance."
         }

-        return {
+        result = {
+            "name": self.name,
             "score": final_score,
             "success": success,
             "reason": summary,
             "evaluation_cost": round(total_cost, 6),
             "evaluation_log": evaluation_log
         }
+        self.print_result(result)
+
+        return result
eval_lib/agent_metrics/role_adherence_metric/role_adherence.py
CHANGED
@@ -36,6 +36,8 @@ class RoleAdherenceMetric(ConversationalMetricPattern):
         model: str,
         threshold: float = 0.7,
         temperature: float = 0.5,
+        verbose: bool = False,
+        chatbot_role: str = ""
     ):
         """
         Initialize Role Adherence metric.
@@ -45,8 +47,9 @@ class RoleAdherenceMetric(ConversationalMetricPattern):
             threshold: Success threshold (0.0-1.0)
             temperature: Score aggregation temperature for softmax
         """
-        super().__init__(model=model, threshold=threshold)
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
         self.temperature = temperature
+        self.role_description = chatbot_role

     # ==================== HELPER METHODS ====================

@@ -201,7 +204,7 @@ Return JSON array:
         total_cost = 0.0

         # Step 1: Extract role
-        role_description = test_case.chatbot_role or "No role specified"
+        role_description = test_case.chatbot_role or self.chatbot_role or "No role specified"

         # Step 2: Format dialogue
         dialogue_text = self._render_dialogue(test_case.turns)
@@ -234,18 +237,24 @@ Return JSON array:
             "verdict_weights": {v["verdict"]: VERDICT_WEIGHTS.get(v["verdict"], 0.0) for v in verdicts},
             "comment_verdict_weights": "Numeric weights assigned to each verdict for score calculation.",
             "final_score": final_score,
-            "comment_final_score": f"
+            "comment_final_score": f"Weighted average of verdict scores using softmax aggregation (temperature={self.temperature}).",
             "threshold": self.threshold,
+            "temperature": self.temperature,
             "success": success,
             "comment_success": "Whether the role adherence score meets the required threshold.",
             "final_reason": summary,
             "comment_reasoning": "Concise explanation of how well the assistant maintained its assigned role."
         }

-        return {
+        result = {
+            "name": self.name,
             "score": final_score,
             "success": success,
             "reason": summary,
             "evaluation_cost": round(total_cost, 6),
             "evaluation_log": evaluation_log
         }
+
+        self.print_result(result)
+
+        return result
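Both agent metrics above now take a verbose flag, which is forwarded to the shared ConversationalMetricPattern and makes the metric print its result dict via print_result; RoleAdherenceMetric additionally accepts a chatbot_role fallback used when the test case carries no role. A construction-only sketch: the model string is illustrative, RoleAdherenceMetric is assumed to be re-exported from the package root like KnowledgeRetentionMetric, and running the metrics afterwards is unchanged from 0.2.x:

from eval_lib import KnowledgeRetentionMetric, RoleAdherenceMetric

retention = KnowledgeRetentionMetric(
    model="openai:gpt-4o-mini",  # illustrative model identifier
    threshold=0.7,
    temperature=0.5,
    verbose=True,                # new in 0.3.1: result is also pretty-printed
)

adherence = RoleAdherenceMetric(
    model="openai:gpt-4o-mini",
    threshold=0.7,
    verbose=True,
    chatbot_role="Polite banking support assistant",  # fallback role, new in 0.3.1
)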
eval_lib/agent_metrics/task_success_metric/task_success_rate.py
CHANGED
@@ -11,6 +11,13 @@ from eval_lib.testcases_schema import ConversationalEvalTestCase
 from eval_lib.metric_pattern import ConversationalMetricPattern
 from eval_lib.llm_client import chat_complete
 from eval_lib.utils import score_agg, extract_json_block
+import re
+
+
+def _contains_links(dialogue: str) -> bool:
+    """Check if dialogue contains any URLs/links"""
+    url_pattern = r'https?://[^\s]+|www\.[^\s]+|\[.*?\]\(.*?\)'
+    return bool(re.search(url_pattern, dialogue))


 # Verdict weights for task completion levels
@@ -23,7 +30,7 @@ VERDICT_WEIGHTS = {
 }

 # Configuration constants
-MAX_CRITERIA =
+MAX_CRITERIA = 2
 LINK_CRITERION = "The user got the link to the requested resource."


@@ -34,13 +41,13 @@ class TaskSuccessRateMetric(ConversationalMetricPattern):
     """

     name = "taskSuccessRateMetric"
-    template_cls = None

     def __init__(
         self,
         model: str,
         threshold: float = 0.7,
-        temperature: float =
+        temperature: float = 0.5,
+        verbose: bool = False
     ):
         """
         Initialize Task Success Rate metric.
@@ -50,7 +57,7 @@ class TaskSuccessRateMetric(ConversationalMetricPattern):
             threshold: Success threshold (0.0-1.0)
             temperature: Score aggregation temperature for softmax
         """
-        super().__init__(model=model, threshold=threshold)
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
         self.temperature = temperature

     # ==================== HELPER METHODS ====================
@@ -118,24 +125,20 @@ Criteria: [

         return text.strip(), cost or 0.0

-    async def _generate_success_criteria(self, goal: str) -> Tuple[List[str], float]:
+    async def _generate_success_criteria(self, goal: str, dialogue: str) -> Tuple[List[str], float]:
         """
         Generate concrete success criteria for the user's goal.

         Args:
             goal: The inferred user goal
-
-        Returns:
-            Tuple of (criteria_list, llm_cost)
+            dialogue: Full conversation text (needed to check for links)
         """
         prompt = (
             f"{self._prompt_criteria_few_shot()}\n\n"
             f"Now do the same for the next case.\n\n"
             f"User goal: {goal}\n\n"
             f"List up to {MAX_CRITERIA} concrete SUCCESS CRITERIA that could realistically be satisfied "
-            f"within a brief chat of 2–5 turns
-            "Then **add** this exact sentence: "
-            f"\"{LINK_CRITERION}\"\n\n"
+            f"within a brief chat of 2–5 turns.\n\n"
             "Each criterion must be a short, observable statement.\n"
             "Return only a JSON array of strings."
         )
@@ -153,17 +156,10 @@ Criteria: [
         if not isinstance(criteria, list):
             raise ValueError("Expected JSON array of criteria")

-        #
-        if LINK_CRITERION not in criteria:
+        # Add LINK_CRITERION only if dialogue contains links
+        if _contains_links(dialogue) and LINK_CRITERION not in criteria:
             criteria.append(LINK_CRITERION)

-        # Keep LINK_CRITERION first and limit to MAX_CRITERIA
-        if len(criteria) > MAX_CRITERIA:
-            criteria = (
-                [LINK_CRITERION] +
-                [c for c in criteria if c != LINK_CRITERION][:MAX_CRITERIA - 1]
-            )
-
         # Truncate to MAX_CRITERIA
         criteria = criteria[:MAX_CRITERIA]

@@ -296,7 +292,7 @@ Criteria: [
         total_cost += cost

         # Step 3: Generate success criteria
-        success_criteria, cost = await self._generate_success_criteria(user_goal)
+        success_criteria, cost = await self._generate_success_criteria(user_goal, dialogue_text)
         total_cost += cost

         # Step 4: Generate verdicts for each criterion
@@ -330,18 +326,23 @@
             "verdict_weights": {i: VERDICT_WEIGHTS.get(v["verdict"], 0.0) for i, v in enumerate(verdicts)},
             "comment_verdict_weights": "Numeric weights assigned to each verdict for score calculation.",
             "final_score": final_score,
-            "comment_final_score": f"
+            "comment_final_score": f"Weighted average of verdict scores using softmax aggregation (temperature={self.temperature}).",
             "threshold": self.threshold,
+            "temperature": self.temperature,
             "success": success,
             "comment_success": "Whether the task success score meets the required threshold.",
             "final_reason": summary,
             "comment_reasoning": "Concise explanation of the overall task completion assessment."
         }

-        return {
+        result = {
+            "name": self.name,
             "score": final_score,
             "success": success,
             "reason": summary,
             "evaluation_cost": round(total_cost, 6),
             "evaluation_log": evaluation_log
         }
+        self.print_result(result)
+
+        return result
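LINK_CRITERION is now appended only when _contains_links actually finds a link in the conversation. A standalone probe of that check; the pattern is copied from the hunk above, the sample strings are invented:

import re

# Pattern from _contains_links: http(s) URLs, bare www. hosts, markdown [text](target) links.
URL_PATTERN = r'https?://[^\s]+|www\.[^\s]+|\[.*?\]\(.*?\)'

samples = [
    "Sure, here you go: https://example.com/reset-password",  # http(s) URL
    "Try www.example.org for the docs",                       # bare www host
    "See [the guide](docs/setup.md) for details",             # markdown-style link
    "I cannot share that information.",                       # no link
]

for text in samples:
    print(bool(re.search(URL_PATTERN, text)), "-", text)
# Expected: True, True, True, False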
eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py
CHANGED
@@ -15,11 +15,12 @@ class ToolCorrectnessMetric(MetricPattern):
     def __init__(
         self,
         threshold: float = 0.5,
+        verbose: bool = False,
         evaluation_params: List[str] = [],
         should_exact_match: bool = False,
         should_consider_ordering: bool = False
     ):
-        super().__init__(model=None, threshold=threshold)
+        super().__init__(model=None, threshold=threshold, verbose=verbose)
         self.evaluation_params = evaluation_params
         self.should_exact_match = should_exact_match
         self.should_consider_ordering = should_consider_ordering
@@ -31,13 +32,18 @@ class ToolCorrectnessMetric(MetricPattern):
         score = self.calculate_score()
         reason = self.generate_reason()

-        return {
+        result = {
+            "name": self.name,
             "score": score,
             "success": score >= self.threshold,
             "reason": reason,
             "evaluation_cost": 0.0  # No LLM cost for this metric
         }

+        self.print_result(result)
+
+        return result
+
     def generate_reason(self) -> str:
         called_names = self.tools_called
         expected_names = self.expected_tools
eval_lib/datagenerator/datagenerator.py
CHANGED
@@ -9,16 +9,27 @@ from eval_lib.utils import extract_json_block
 import asyncio
 import random
 import json
+import time
+
+# Colors for beautiful console output
+
+
+class Colors:
+    HEADER = '\033[95m'
+    BLUE = '\033[94m'
+    CYAN = '\033[96m'
+    GREEN = '\033[92m'
+    YELLOW = '\033[93m'
+    RED = '\033[91m'
+    ENDC = '\033[0m'
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+    DIM = '\033[2m'


 async def retry_async(fn, *args, retries=4, base_delay=0.6, max_delay=6.0,
                       retriable_statuses=(429, 500, 502, 503, 504),
                       **kwargs):
-    """
-    fn is a coroutine that may raise exceptions such as:
-        - an HTTPException-like error carrying .status_code
-        - an Exception whose message mentions 'Service Unavailable', etc.
-    """
     attempt = 0
     while True:
         try:
@@ -34,7 +45,6 @@ async def retry_async(fn, *args, retries=4, base_delay=0.6, max_delay=6.0,
             if attempt > retries or not retriable:
                 raise

-            # exponential backoff + jitter
             delay = min(max_delay, base_delay * (2 ** (attempt - 1)))
             delay += random.uniform(0, 0.4)
             await asyncio.sleep(delay)
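retry_async keeps its exponential backoff with jitter; only the Russian docstring and comment were dropped. With the defaults above, the wait before retry n is min(6.0, 0.6 * 2**(n - 1)) seconds plus up to 0.4 s of jitter, which works out as follows:

import random

retries, base_delay, max_delay = 4, 0.6, 6.0  # defaults from retry_async above

for attempt in range(1, retries + 1):
    delay = min(max_delay, base_delay * (2 ** (attempt - 1)))  # exponential, capped at max_delay
    delay += random.uniform(0, 0.4)                            # same jitter range as the code
    print(f"retry {attempt}: sleep ~{delay:.2f}s")
# Base waits before jitter: 0.6s, 1.2s, 2.4s, 4.8s.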
@@ -61,6 +71,7 @@ class DatasetGenerator:
         max_chunks: int = 30,
         relevance_margin: float = 1.5,
         embedding_model: str = "openai:text-embedding-3-small",
+        verbose: bool = False,
     ):
         self.model = model
         self.input_format = input_format
@@ -78,8 +89,132 @@ class DatasetGenerator:
         self.max_chunks = max_chunks
         self.relevance_margin = relevance_margin
         self.embedding_model = embedding_model
+        self.verbose = verbose
+
+    def _log(self, message: str, color: str = Colors.CYAN):
+        """Log message with color if verbose mode is enabled"""
+        if self.verbose:
+            print(f"{color}{message}{Colors.ENDC}")
+
+    def _log_step(self, step_name: str, step_num: int = None):
+        """Log generation step"""
+        if self.verbose:
+            prefix = f"[{step_num}] " if step_num else ""
+            print(f"{Colors.DIM}  {prefix}{step_name}...{Colors.ENDC}")
+
+    def _log_progress(self, current: int, total: int, label: str = "Progress"):
+        """Log progress bar"""
+        if self.verbose:
+            percentage = (current / total) * 100 if total > 0 else 0
+            bar_length = 30
+            filled = int(bar_length * current / total) if total > 0 else 0
+            bar = '█' * filled + '░' * (bar_length - filled)
+            print(
+                f"{Colors.CYAN}  {label}: [{bar}] {current}/{total} ({percentage:.0f}%){Colors.ENDC}")
+
+    def _print_header(self, title: str):
+        """Print beautiful header"""
+        if self.verbose:
+            import shutil
+            terminal_width = shutil.get_terminal_size().columns
+            WIDTH = terminal_width // 2
+            WIDTH = max(WIDTH, 60)
+
+            border = "═" * WIDTH
+            title_text = f"🎯 {title}"
+            padding = WIDTH - len(title_text)
+            left_pad = padding // 2
+            right_pad = padding - left_pad
+            centered_title = " " * left_pad + title_text + " " * right_pad
+
+            print(f"\n{Colors.BOLD}{Colors.CYAN}╔{border}╗{Colors.ENDC}")
+            print(
+                f"{Colors.BOLD}{Colors.CYAN}║{Colors.ENDC}{centered_title}{Colors.BOLD}{Colors.CYAN}║{Colors.ENDC}")
+            print(f"{Colors.BOLD}{Colors.CYAN}╚{border}╝{Colors.ENDC}\n")
+
+    def _print_summary(self, dataset: List[dict], elapsed_time: float, total_cost: float = 0.0):
+        """Print generation summary with full dataset in readable format"""
+        if not self.verbose:
+            return
+
+        import shutil
+        import textwrap
+        terminal_width = shutil.get_terminal_size().columns
+        WIDTH = terminal_width - 10
+        WIDTH = max(WIDTH, 80)
+
+        print(
+            f"\n{Colors.BOLD}{Colors.GREEN}✅ Dataset Generation Complete{Colors.ENDC}\n")
+        print(f"{Colors.BOLD}Summary:{Colors.ENDC}")
+        print(
+            f"  📊 Total rows generated: {Colors.YELLOW}{len(dataset)}{Colors.ENDC}")
+        print(
+            f"  ⏱️  Time elapsed: {Colors.YELLOW}{elapsed_time:.2f}s{Colors.ENDC}")
+        if total_cost > 0:
+            print(
+                f"  💰 Total cost: {Colors.BLUE}${total_cost:.6f}{Colors.ENDC}")
+
+        # Show full dataset
+        if dataset:
+            print(f"\n{Colors.BOLD}Generated Dataset:{Colors.ENDC}\n")
+
+            for idx, row in enumerate(dataset, 1):
+                # Header
+                print(f"{Colors.CYAN}{'─' * WIDTH}{Colors.ENDC}")
+                print(
+                    f"{Colors.CYAN}{Colors.BOLD}Row {idx}/{len(dataset)}:{Colors.ENDC}")
+                print(f"{Colors.CYAN}{'─' * WIDTH}{Colors.ENDC}")
+
+                # Fields
+                for key, value in row.items():
+                    value_str = str(value)
+
+                    # Key with proper formatting
+                    print(f"{Colors.BOLD}{key}:{Colors.ENDC}", end=" ")
+
+                    # Wrap long text to fit terminal width
+                    # Calculate available width (WIDTH - key length - 2 for ": ")
+                    available_width = WIDTH - len(key) - 2
+
+                    if len(value_str) <= available_width:
+                        # Short value - print on same line
+                        print(value_str)
+                    else:
+                        # Long value - wrap to multiple lines with proper indentation
+                        print()  # New line after key
+                        wrapped = textwrap.fill(
+                            value_str,
+                            width=WIDTH - 2,
+                            initial_indent="  ",
+                            subsequent_indent="  ",
+                            break_long_words=False,
+                            break_on_hyphens=False
+                        )
+                        print(f"{Colors.DIM}{wrapped}{Colors.ENDC}")

+                print()  # Spacing after row
+
+                print(f"{Colors.CYAN}{'─' * WIDTH}{Colors.ENDC}\n")
+
+                # Add spacing between rows
+                if idx < len(dataset):
+                    print()

     async def generate_from_scratch(self) -> List[dict]:
+
+        start_time = time.time()
+
+        if self.verbose:
+            self._print_header("Dataset Generation from Scratch")
+            self._log(f"Configuration:", Colors.BOLD)
+            self._log(f"  Model: {self.model}")
+            self._log(f"  Max rows: {self.max_rows}")
+            self._log(f"  Test types: {', '.join(self.test_types)}")
+            self._log(f"  Language: {self.language}")
+            self._log("")
+
+        self._log_step("Generating prompt", 1)
+
         prompt = dataset_generation_from_scratch_prompt(
             max_rows=self.max_rows,
             agent_description=self.agent_description,
@@ -92,23 +227,52 @@ class DatasetGenerator:
             language=self.language
         )

-
+        self._log_step("Calling LLM to generate dataset", 2)
+
+        raw, cost = await chat_complete(
             llm=self.model,
             messages=[{"role": "user", "content": prompt}],
             temperature=self.temperature,
         )

+        self._log_step("Parsing response", 3)
+
         try:
             raw_json = extract_json_block(raw)
             data = json.loads(raw_json)
             assert isinstance(data, list), "not a JSON array"
+            elapsed_time = time.time() - start_time
+            self._print_summary(data, elapsed_time, cost or 0.0)
+
             return data
         except Exception as exc:
+            if self.verbose:
+                self._log(f"❌ Failed to parse dataset", Colors.RED)
             raise RuntimeError(f"Failed to parse dataset:\n{exc}\n\n{raw}")

     async def generate_from_documents(self, file_paths: List[str]) -> List[dict]:
-
+        """Generate dataset from documents"""
+        start_time = time.time()
+        total_cost = 0.0
+
+        if self.verbose:
+            self._print_header("Dataset Generation from Documents")
+            self._log(f"Configuration:", Colors.BOLD)
+            self._log(f"  Model: {self.model}")
+            self._log(f"  Max rows: {self.max_rows}")
+            self._log(f"  Documents: {len(file_paths)}")
+            self._log(f"  Chunk size: {self.chunk_size}")
+            self._log(f"  Test types: {', '.join(self.test_types)}")
+            self._log("")
+
+        self._log_step("Loading documents", 1)
         docs = load_documents(file_paths)
+
+        if self.verbose:
+            self._log(
+                f"  ✅ Loaded {len(file_paths)} file(s) → {len(docs)} page(s)/document(s)", Colors.GREEN)
+
+        self._log_step("Chunking documents", 2)
         doc_chunks = chunk_documents(docs,
                                      chunk_size=self.chunk_size,
                                      chunk_overlap=self.chunk_overlap)
@@ -117,8 +281,15 @@ class DatasetGenerator:
         if not chunks_text:
             raise ValueError("No text extracted from documents.")

+        if self.verbose:
+            self._log(f"  ✅ Created {len(chunks_text)} chunks", Colors.GREEN)
+
+        self._log_step("Ranking chunks by relevance", 3)
         ranked_chunks = await self._rank_chunks_by_relevance(chunks_text)

+        if self.verbose:
+            self._log(f"  ✅ Ranked {len(ranked_chunks)} chunks", Colors.GREEN)
+
         total_chunks = len(ranked_chunks)
         rows_per_chunk = max(1, math.ceil(self.max_rows / total_chunks))

@@ -127,11 +298,21 @@ class DatasetGenerator:
                     self.max_chunks)
         selected_chunks = ranked_chunks[:top_k]

+        if self.verbose:
+            self._log(
+                f"  📌 Selected top {len(selected_chunks)} chunks for generation", Colors.YELLOW)
+            self._log("")
+
         dataset: list[dict] = []

         MAX_PROMPT_CHARS = 24_000

-
+        self._log_step(f"Generating dataset from chunks", 4)
+
+        for i, chunk in enumerate(selected_chunks):
+            if self.verbose:
+                self._log_progress(
+                    i + 1, len(selected_chunks), "Processing chunks")

             safe_chunk = chunk if len(
                 chunk) <= MAX_PROMPT_CHARS else chunk[:MAX_PROMPT_CHARS]
@@ -149,24 +330,39 @@ class DatasetGenerator:
                 language=self.language
             )

-            raw,
+            raw, cost = await retry_async(
                 chat_complete,
                 llm=self.model,
                 messages=[{"role": "user", "content": prompt}],
                 temperature=self.temperature,
             )

+            total_cost += cost or 0.0
+
             try:
                 chunk_data = json.loads(extract_json_block(raw))
                 assert isinstance(chunk_data, list)
                 dataset.extend(chunk_data)
+
+                if self.verbose:
+                    self._log(
+                        f"  ✅ Generated {len(chunk_data)} rows from chunk {i+1}", Colors.GREEN)
+
             except Exception as exc:
-
+                if self.verbose:
+                    self._log(
+                        f"  ⚠️  Chunk {i+1} parsing failed, skipping", Colors.YELLOW)
+                continue

             if len(dataset) >= self.max_rows:
                 break

-
+        final_dataset = dataset[: self.max_rows]
+        elapsed_time = time.time() - start_time
+
+        self._print_summary(final_dataset, elapsed_time, total_cost)
+
+        return final_dataset

     async def _rank_chunks_by_relevance(self, chunks: list[str]) -> list[str]:
         """