eval-ai-library 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of eval-ai-library might be problematic; see the registry page for details.

Files changed (29)
  1. eval_ai_library-0.3.0.dist-info/METADATA +1042 -0
  2. eval_ai_library-0.3.0.dist-info/RECORD +34 -0
  3. eval_lib/__init__.py +19 -6
  4. eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py +8 -3
  5. eval_lib/agent_metrics/role_adherence_metric/role_adherence.py +12 -4
  6. eval_lib/agent_metrics/task_success_metric/task_success_rate.py +23 -23
  7. eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py +8 -2
  8. eval_lib/datagenerator/datagenerator.py +208 -12
  9. eval_lib/datagenerator/document_loader.py +29 -29
  10. eval_lib/evaluate.py +0 -22
  11. eval_lib/llm_client.py +223 -78
  12. eval_lib/metric_pattern.py +208 -152
  13. eval_lib/metrics/answer_precision_metric/answer_precision.py +8 -3
  14. eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py +7 -2
  15. eval_lib/metrics/bias_metric/bias.py +12 -2
  16. eval_lib/metrics/contextual_precision_metric/contextual_precision.py +9 -4
  17. eval_lib/metrics/contextual_recall_metric/contextual_recall.py +7 -3
  18. eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py +8 -2
  19. eval_lib/metrics/custom_metric/custom_eval.py +237 -204
  20. eval_lib/metrics/faithfulness_metric/faithfulness.py +7 -2
  21. eval_lib/metrics/geval/geval.py +8 -2
  22. eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py +7 -3
  23. eval_lib/metrics/toxicity_metric/toxicity.py +8 -2
  24. eval_lib/utils.py +44 -29
  25. eval_ai_library-0.2.1.dist-info/METADATA +0 -753
  26. eval_ai_library-0.2.1.dist-info/RECORD +0 -34
  27. {eval_ai_library-0.2.1.dist-info → eval_ai_library-0.3.0.dist-info}/WHEEL +0 -0
  28. {eval_ai_library-0.2.1.dist-info → eval_ai_library-0.3.0.dist-info}/licenses/LICENSE +0 -0
  29. {eval_ai_library-0.2.1.dist-info → eval_ai_library-0.3.0.dist-info}/top_level.txt +0 -0
eval_ai_library-0.3.0.dist-info/RECORD ADDED
@@ -0,0 +1,34 @@
+ eval_ai_library-0.3.0.dist-info/licenses/LICENSE,sha256=rK9uLDgWNrCHNdp-Zma_XghDE7Fs0u0kDi3WMcmYx6w,1074
+ eval_lib/__init__.py,sha256=BA0vmdi5_3Zd1ib8nLyq6pESBVnFX7aKdvRAbR_I9bQ,3029
+ eval_lib/evaluate.py,sha256=GjlXZb5dnl44LCaJwdkyGCYcC50zoNZn3NrofzNAVJ0,11490
+ eval_lib/evaluation_schema.py,sha256=7IDd_uozqewhh7k0p1hKut_20udvRxxkV6thclxKUg0,1904
+ eval_lib/llm_client.py,sha256=6uNLIAfSdj4u_n4SAk3UkYmxcfO2Y2y-9Kp0T4bSjPM,13845
+ eval_lib/metric_pattern.py,sha256=wULgMNDeAqJC_Qjglo7bYzY2eGhA_PmY_hA_qGfg0sI,11730
+ eval_lib/price.py,sha256=jbmkkUTxPuXrkSHuaJYPl7jSzfDIzQ9p_swWWs26UJ0,1986
+ eval_lib/py.typed,sha256=8PjyZ1aVoQpRVvt71muvuq5qE-jTFZkK-GLHkhdebmc,26
+ eval_lib/testcases_schema.py,sha256=qI4o6kX0jH1DR3sHGXUnu3Cyt2oq7rGlsMlOaXSt6F4,696
+ eval_lib/utils.py,sha256=-hwagFFn3_QjgyLqF8Qx7JIkpgOEI8-F14eycog3bgc,3141
+ eval_lib/agent_metrics/__init__.py,sha256=20Y4BsicD2s7OkOBQPBvB2JKStBDtplv52_q6q35Vgo,525
+ eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py,sha256=_9MRWbv8oi6goY-ZXWu7FaNV0vQX1UD1w1Ar7CPVino,8244
+ eval_lib/agent_metrics/role_adherence_metric/role_adherence.py,sha256=D9MuKUI8ujh766vF8VQO73m-fPSQxy5u2Tkcr3wHbVk,9180
+ eval_lib/agent_metrics/task_success_metric/task_success_rate.py,sha256=mt1PRHadup2k64gF_OyL8d-eQ5zm2EeNwztcE-aGOe0,12452
+ eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py,sha256=qbVMtD6EWKah27FogLEiEh6pBX-k2wwKbwM_kFkvYeQ,4220
+ eval_lib/datagenerator/datagenerator.py,sha256=NQZIQuSCmryxIT3lTLS1PpJjENmGqARtR-zTiQ8OvRk,15513
+ eval_lib/datagenerator/document_loader.py,sha256=vnQUz_Dxb3SxcVPUmMXZe-rgfPp6OfHb6D2Ie9iqPms,17025
+ eval_lib/datagenerator/prompts.py,sha256=iQAYitAbLud3vWJnXGh_OCF4syWYS_S71zZEPI6qYAU,7213
+ eval_lib/metrics/__init__.py,sha256=3qClCCjPXt5i0c38g5krfuQnqlAXEl-jhAHy1C_ICMY,1213
+ eval_lib/metrics/answer_precision_metric/answer_precision.py,sha256=AxPmwzGFU7tnTrrZuQZ7ow4nNSD-blDHdAGwhMHMxjM,15040
+ eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py,sha256=LHf7FI9dYzZcKBdxUH9VpsOceRLh0NMb79qliZRE3Uo,8477
+ eval_lib/metrics/bias_metric/bias.py,sha256=BVH8xlTUTRfVG_F1kauwpGAkVKBkUWhM9rUsrrLhpRU,4020
+ eval_lib/metrics/contextual_precision_metric/contextual_precision.py,sha256=CQOb6uR2KeffTkhPSqZae56sX5tXMr0pJVM5W_wU1fU,3993
+ eval_lib/metrics/contextual_recall_metric/contextual_recall.py,sha256=iw73_hGLWklHZSBkCRkPDNUt1xD5dknA_7CZ6Efkf5w,3913
+ eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py,sha256=pjyQPVDzWfTnqTgy5D1OYq1KdsCP1bVeUg3rcV8El4I,6501
+ eval_lib/metrics/custom_metric/custom_eval.py,sha256=GrOo39DxHzdMUGnBnCiHfT7Y8VabFFgmYQGcnkyGm1w,11340
+ eval_lib/metrics/faithfulness_metric/faithfulness.py,sha256=OqamlhTOps7d-NOStSIK7Tq-UAJXNql8VKjgtuqlDhA,5930
+ eval_lib/metrics/geval/geval.py,sha256=mNciHXnqU2drOJsWlYmbwftGiKM89-Ykw2f6XneIGBM,10629
+ eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py,sha256=4QqYgGMcp6W9Lw-v4s0AlUhMSOKvBOEgnLvhqVXaT9I,4286
+ eval_lib/metrics/toxicity_metric/toxicity.py,sha256=rBE1_fvpbCRdBpBep1y1LTIhofKR8GD4Eh76EOYzxL0,4076
+ eval_ai_library-0.3.0.dist-info/METADATA,sha256=s7bRUm49crFB9ICrQaPnJvWHVQ28YUBMlTtlL0L2dWQ,37706
+ eval_ai_library-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ eval_ai_library-0.3.0.dist-info/top_level.txt,sha256=uQHpEd2XI0oZgq1eCww9zMvVgDJgwXMWkCD45fYUzEg,9
+ eval_ai_library-0.3.0.dist-info/RECORD,,
eval_lib/__init__.py CHANGED
@@ -1,3 +1,5 @@
+ # eval_lib/__init__.py
+
  """
  Eval AI Library - Comprehensive AI Model Evaluation Framework

@@ -5,12 +7,12 @@ A powerful library for evaluating AI models with support for multiple LLM provid
  and a wide range of evaluation metrics for RAG systems and AI agents.
  """

- __version__ = "0.1.0"
- __author__ = "Aleksandr Meskov"
+ __version__ = "0.3.0"
+ __author__ = "Aleksandr Meshkov"

  # Core evaluation functions
  from eval_lib.evaluate import evaluate, evaluate_conversations
- from eval_lib.utils import score_agg
+ from eval_lib.utils import score_agg, extract_json_block

  # Test case schemas
  from eval_lib.testcases_schema import (
@@ -63,9 +65,20 @@ from eval_lib.agent_metrics import (
  KnowledgeRetentionMetric
  )

- # Data generator
- from eval_lib.datagenerator.datagenerator import DataGenerator
- from eval_lib.datagenerator.document_loader import DocumentLoader
+
+ def __getattr__(name):
+ """
+ Lazy import for modules with heavy dependencies.
+ DataGenerator is imported only when it is actually used.
+ """
+ if name == "DataGenerator":
+ from eval_lib.datagenerator.datagenerator import DataGenerator
+ return DataGenerator
+ if name == "DocumentLoader":
+ from eval_lib.datagenerator.document_loader import DocumentLoader
+ return DocumentLoader
+ raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
+

  __all__ = [
  # Version
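
Reviewer note: the eager DataGenerator/DocumentLoader imports are replaced by a module-level __getattr__, the PEP 562 lazy-import hook, so `import eval_lib` no longer pays for the datagenerator's heavy dependencies until one of those names is actually accessed. A minimal standalone sketch of the same pattern (the `lazy_pkg`/`HeavyThing` names are illustrative, not from the package):

```python
# lazy_pkg/__init__.py - illustrative stand-in, not eval_lib itself
import importlib

_LAZY = {
    # attribute name -> (module path, attribute inside that module)
    "HeavyThing": ("lazy_pkg.heavy_module", "HeavyThing"),
}

def __getattr__(name):
    """PEP 562 hook: called only when normal attribute lookup on the module fails."""
    if name in _LAZY:
        module_path, attr = _LAZY[name]
        module = importlib.import_module(module_path)  # imported on first access only
        value = getattr(module, attr)
        globals()[name] = value  # cache so the hook is not hit again
        return value
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```

Caching the resolved object in `globals()` means the hook only runs on the first access; eval_lib's version skips the cache and simply re-imports, which is also fine since repeated imports are cheap.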
eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py CHANGED
@@ -36,6 +36,7 @@ class KnowledgeRetentionMetric(ConversationalMetricPattern):
  model: str,
  threshold: float = 0.7,
  temperature: float = 0.5,
+ verbose: bool = False
  ):
  """
  Initialize Knowledge Retention metric.
@@ -45,7 +46,7 @@ class KnowledgeRetentionMetric(ConversationalMetricPattern):
  threshold: Success threshold (0.0-1.0)
  temperature: Score aggregation temperature for softmax
  """
- super().__init__(model=model, threshold=threshold)
+ super().__init__(model=model, threshold=threshold, verbose=verbose)
  self.temperature = temperature

  # ==================== HELPER METHODS ====================
@@ -214,7 +215,7 @@ Verdicts:
  "verdict_weights": {v["verdict"]: VERDICT_WEIGHTS.get(v["verdict"], 0.0) for v in verdicts},
  "comment_verdict_weights": "Numeric weights assigned to each verdict for score calculation.",
  "final_score": final_score,
- "comment_final_score": f"Softmax aggregation of verdict weights (temperature={self.temperature}).",
+ "comment_final_score": f"Weighted average of verdict scores using softmax aggregation (temperature={self.temperature}).",
  "threshold": self.threshold,
  "success": success,
  "comment_success": "Whether the retention score meets the required threshold.",
@@ -222,10 +223,14 @@ Verdicts:
  "comment_reasoning": "Concise explanation of the assistant's knowledge retention performance."
  }

- return {
+ result = {
+ "name": self.name,
  "score": final_score,
  "success": success,
  "reason": summary,
  "evaluation_cost": round(total_cost, 6),
  "evaluation_log": evaluation_log
  }
+ self.print_result(result)
+
+ return result
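
Reviewer note: the reworded `comment_final_score` string describes what `score_agg` does with the verdict weights: a softmax-weighted average controlled by `temperature`. `score_agg` itself is not shown in this diff, so the snippet below is only a plausible sketch of that aggregation, not the package's implementation:

```python
import math
from typing import List


def softmax_aggregate(weights: List[float], temperature: float = 0.5) -> float:
    """Softmax-weighted average of verdict weights (illustrative only)."""
    exps = [math.exp(w / temperature) for w in weights]
    total = sum(exps)
    return sum(w * e / total for w, e in zip(weights, exps))


# Verdict labels might map to weights such as 1.0 (retained), 0.5 (partial), 0.0 (lost);
# the real labels and weights live in VERDICT_WEIGHTS inside the module.
print(round(softmax_aggregate([1.0, 0.5, 0.0], temperature=0.5), 3))
```

In this sketch, lower temperatures pull the aggregate toward the highest verdict weight, while higher temperatures approach a plain average.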
eval_lib/agent_metrics/role_adherence_metric/role_adherence.py CHANGED
@@ -36,6 +36,8 @@ class RoleAdherenceMetric(ConversationalMetricPattern):
  model: str,
  threshold: float = 0.7,
  temperature: float = 0.5,
+ verbose: bool = False,
+ chatbot_role: str = ""
  ):
  """
  Initialize Role Adherence metric.
@@ -45,8 +47,9 @@ class RoleAdherenceMetric(ConversationalMetricPattern):
  threshold: Success threshold (0.0-1.0)
  temperature: Score aggregation temperature for softmax
  """
- super().__init__(model=model, threshold=threshold)
+ super().__init__(model=model, threshold=threshold, verbose=verbose)
  self.temperature = temperature
+ self.role_description = chatbot_role

  # ==================== HELPER METHODS ====================

@@ -201,7 +204,7 @@ Return JSON array:
  total_cost = 0.0

  # Step 1: Extract role
- role_description = test_case.chatbot_role or "No role specified"
+ role_description = test_case.chatbot_role or self.chatbot_role or "No role specified"

  # Step 2: Format dialogue
  dialogue_text = self._render_dialogue(test_case.turns)
@@ -234,7 +237,7 @@ Return JSON array:
  "verdict_weights": {v["verdict"]: VERDICT_WEIGHTS.get(v["verdict"], 0.0) for v in verdicts},
  "comment_verdict_weights": "Numeric weights assigned to each verdict for score calculation.",
  "final_score": final_score,
- "comment_final_score": f"Softmax aggregation of verdict weights (temperature={self.temperature}).",
+ "comment_final_score": f"Weighted average of verdict scores using softmax aggregation (temperature={self.temperature}).",
  "threshold": self.threshold,
  "success": success,
  "comment_success": "Whether the role adherence score meets the required threshold.",
@@ -242,10 +245,15 @@ Return JSON array:
  "comment_reasoning": "Concise explanation of how well the assistant maintained its assigned role."
  }

- return {
+ result = {
+ "name": self.name,
  "score": final_score,
  "success": success,
  "reason": summary,
  "evaluation_cost": round(total_cost, 6),
  "evaluation_log": evaluation_log
  }
+
+ self.print_result(result)
+
+ return result
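
Reviewer note: RoleAdherenceMetric can now take the chatbot role at construction time and fall back to it when the test case does not define one. Note that the constructor stores the value as `self.role_description` while the evaluation code reads `self.chatbot_role`, which looks worth verifying. A hedged usage sketch; the import path comes from the file list above, the model string is a placeholder, and only the keyword arguments shown in the hunks are confirmed:

```python
# Sketch only: assumes the class is used directly from its module;
# the model string is a placeholder, not a documented value.
from eval_lib.agent_metrics.role_adherence_metric.role_adherence import RoleAdherenceMetric

metric = RoleAdherenceMetric(
    model="openai:gpt-4o-mini",   # placeholder; any model string accepted by chat_complete
    threshold=0.7,
    temperature=0.5,
    verbose=True,                  # new in 0.3.0: result is passed through print_result()
    chatbot_role="Polite banking support assistant",  # new constructor-level fallback
)
# During evaluation the role resolves as:
#   test_case.chatbot_role or <constructor value> or "No role specified"
```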
eval_lib/agent_metrics/task_success_metric/task_success_rate.py CHANGED
@@ -11,6 +11,13 @@ from eval_lib.testcases_schema import ConversationalEvalTestCase
  from eval_lib.metric_pattern import ConversationalMetricPattern
  from eval_lib.llm_client import chat_complete
  from eval_lib.utils import score_agg, extract_json_block
+ import re
+
+
+ def _contains_links(dialogue: str) -> bool:
+ """Check if dialogue contains any URLs/links"""
+ url_pattern = r'https?://[^\s]+|www\.[^\s]+|\[.*?\]\(.*?\)'
+ return bool(re.search(url_pattern, dialogue))


  # Verdict weights for task completion levels
@@ -23,7 +30,7 @@ VERDICT_WEIGHTS = {
  }

  # Configuration constants
- MAX_CRITERIA = 10
+ MAX_CRITERIA = 2
  LINK_CRITERION = "The user got the link to the requested resource."


@@ -34,13 +41,13 @@ class TaskSuccessRateMetric(ConversationalMetricPattern):
  """

  name = "taskSuccessRateMetric"
- template_cls = None

  def __init__(
  self,
  model: str,
  threshold: float = 0.7,
- temperature: float = 1.1,
+ temperature: float = 0.5,
+ verbose: bool = False
  ):
  """
  Initialize Task Success Rate metric.
@@ -50,7 +57,7 @@ class TaskSuccessRateMetric(ConversationalMetricPattern):
  threshold: Success threshold (0.0-1.0)
  temperature: Score aggregation temperature for softmax
  """
- super().__init__(model=model, threshold=threshold)
+ super().__init__(model=model, threshold=threshold, verbose=verbose)
  self.temperature = temperature

  # ==================== HELPER METHODS ====================
@@ -118,24 +125,20 @@ Criteria: [

  return text.strip(), cost or 0.0

- async def _generate_success_criteria(self, goal: str) -> Tuple[List[str], float]:
+ async def _generate_success_criteria(self, goal: str, dialogue: str) -> Tuple[List[str], float]:
  """
  Generate concrete success criteria for the user's goal.

  Args:
  goal: The inferred user goal
-
- Returns:
- Tuple of (criteria_list, llm_cost)
+ dialogue: Full conversation text (needed to check for links)
  """
  prompt = (
  f"{self._prompt_criteria_few_shot()}\n\n"
  f"Now do the same for the next case.\n\n"
  f"User goal: {goal}\n\n"
  f"List up to {MAX_CRITERIA} concrete SUCCESS CRITERIA that could realistically be satisfied "
- f"within a brief chat of 2–5 turns. "
- "Then **add** this exact sentence: "
- f"\"{LINK_CRITERION}\"\n\n"
+ f"within a brief chat of 2–5 turns.\n\n"
  "Each criterion must be a short, observable statement.\n"
  "Return only a JSON array of strings."
  )
@@ -153,17 +156,10 @@ Criteria: [
  if not isinstance(criteria, list):
  raise ValueError("Expected JSON array of criteria")

- # Ensure LINK_CRITERION is included
- if LINK_CRITERION not in criteria:
+ # Add LINK_CRITERION only if dialogue contains links
+ if _contains_links(dialogue) and LINK_CRITERION not in criteria:
  criteria.append(LINK_CRITERION)

- # Keep LINK_CRITERION first and limit to MAX_CRITERIA
- if len(criteria) > MAX_CRITERIA:
- criteria = (
- [LINK_CRITERION] +
- [c for c in criteria if c != LINK_CRITERION][:MAX_CRITERIA - 1]
- )
-
  # Truncate to MAX_CRITERIA
  criteria = criteria[:MAX_CRITERIA]

@@ -296,7 +292,7 @@ Criteria: [
  total_cost += cost

  # Step 3: Generate success criteria
- success_criteria, cost = await self._generate_success_criteria(user_goal)
+ success_criteria, cost = await self._generate_success_criteria(user_goal, dialogue_text)
  total_cost += cost

  # Step 4: Generate verdicts for each criterion
@@ -330,7 +326,7 @@ Criteria: [
  "verdict_weights": {i: VERDICT_WEIGHTS.get(v["verdict"], 0.0) for i, v in enumerate(verdicts)},
  "comment_verdict_weights": "Numeric weights assigned to each verdict for score calculation.",
  "final_score": final_score,
- "comment_final_score": f"Softmax aggregation of verdict weights (temperature={self.temperature}).",
+ "comment_final_score": f"Weighted average of verdict scores using softmax aggregation (temperature={self.temperature}).",
  "threshold": self.threshold,
  "success": success,
  "comment_success": "Whether the task success score meets the required threshold.",
@@ -338,10 +334,14 @@ Criteria: [
  "comment_reasoning": "Concise explanation of the overall task completion assessment."
  }

- return {
+ result = {
+ "name": self.name,
  "score": final_score,
  "success": success,
  "reason": summary,
  "evaluation_cost": round(total_cost, 6),
  "evaluation_log": evaluation_log
  }
+ self.print_result(result)
+
+ return result
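
Reviewer note: the link-related success criterion is no longer forced into every evaluation. `_contains_links` gates it on the dialogue actually containing a URL (plain http(s)/www links or markdown-style `[text](url)`), and `MAX_CRITERIA` drops from 10 to 2. A quick standalone check of the same regex, copied from the hunk above:

```python
import re

# Same pattern as _contains_links in task_success_rate.py (copied from the diff above)
URL_PATTERN = r'https?://[^\s]+|www\.[^\s]+|\[.*?\]\(.*?\)'

def contains_links(dialogue: str) -> bool:
    return bool(re.search(URL_PATTERN, dialogue))

print(contains_links("Sure, here you go: https://example.com/docs"))  # True
print(contains_links("See [the guide](docs/setup.md) for details"))   # True (markdown link)
print(contains_links("I have sent the form to your email address"))   # False
```

Only when this returns True is LINK_CRITERION appended, and the criteria list is still truncated to MAX_CRITERIA afterwards.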
eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py CHANGED
@@ -15,11 +15,12 @@ class ToolCorrectnessMetric(MetricPattern):
  def __init__(
  self,
  threshold: float = 0.5,
+ verbose: bool = False,
  evaluation_params: List[str] = [],
  should_exact_match: bool = False,
  should_consider_ordering: bool = False
  ):
- super().__init__(model=None, threshold=threshold)
+ super().__init__(model=None, threshold=threshold, verbose=verbose)
  self.evaluation_params = evaluation_params
  self.should_exact_match = should_exact_match
  self.should_consider_ordering = should_consider_ordering
@@ -31,13 +32,18 @@ class ToolCorrectnessMetric(MetricPattern):
  score = self.calculate_score()
  reason = self.generate_reason()

- return {
+ result = {
+ "name": self.name,
  "score": score,
  "success": score >= self.threshold,
  "reason": reason,
  "evaluation_cost": 0.0 # No LLM cost for this metric
  }

+ self.print_result(result)
+
+ return result
+
  def generate_reason(self) -> str:
  called_names = self.tools_called
  expected_names = self.expected_tools
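
Reviewer note: the non-LLM ToolCorrectnessMetric follows the same 0.3.0 pattern as the LLM-backed metrics: the result dict gains a `name` field, is routed through `self.print_result(...)` (presumably a no-op unless `verbose=True`, judging by the constructor change), and is then returned. The values below are illustrative only; LLM-backed metrics additionally include an `evaluation_log`:

```python
# Illustrative shape of the dict every metric now returns (values are made up).
result = {
    "name": "toolCorrectnessMetric",  # new in 0.3.0; the actual name attribute is not shown in this diff
    "score": 0.67,
    "success": True,                  # score >= threshold
    "reason": "2 of 3 expected tools were called",
    "evaluation_cost": 0.0,           # no LLM cost for this metric
}
```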
eval_lib/datagenerator/datagenerator.py CHANGED
@@ -9,16 +9,27 @@ from eval_lib.utils import extract_json_block
  import asyncio
  import random
  import json
+ import time
+
+ # Colors for beautiful console output
+
+
+ class Colors:
+ HEADER = '\033[95m'
+ BLUE = '\033[94m'
+ CYAN = '\033[96m'
+ GREEN = '\033[92m'
+ YELLOW = '\033[93m'
+ RED = '\033[91m'
+ ENDC = '\033[0m'
+ BOLD = '\033[1m'
+ UNDERLINE = '\033[4m'
+ DIM = '\033[2m'


  async def retry_async(fn, *args, retries=4, base_delay=0.6, max_delay=6.0,
  retriable_statuses=(429, 500, 502, 503, 504),
  **kwargs):
- """
- fn is a coroutine that may raise exceptions such as:
- HTTPException-like errors with .status_code
- Exception with a message containing 'Service Unavailable', etc.
- """
  attempt = 0
  while True:
  try:
@@ -34,7 +45,6 @@ async def retry_async(fn, *args, retries=4, base_delay=0.6, max_delay=6.0,
  if attempt > retries or not retriable:
  raise

- # exponential backoff + jitter
  delay = min(max_delay, base_delay * (2 ** (attempt - 1)))
  delay += random.uniform(0, 0.4)
  await asyncio.sleep(delay)
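
Reviewer note: `retry_async` keeps its exponential backoff with jitter: delay = min(max_delay, base_delay * 2**(attempt - 1)) plus up to 0.4 s of random jitter, retrying only on the listed HTTP statuses. With the defaults (retries=4, base_delay=0.6, max_delay=6.0) the schedule works out as sketched below (a standalone reproduction, not the package code):

```python
import random

# Reproduces the delay schedule from retry_async with its default arguments.
base_delay, max_delay, retries = 0.6, 6.0, 4

for attempt in range(1, retries + 1):
    delay = min(max_delay, base_delay * (2 ** (attempt - 1)))
    delay += random.uniform(0, 0.4)  # jitter, as in the hunk above
    print(f"attempt {attempt}: sleep ~{delay:.2f}s")

# Base delays before jitter: 0.6, 1.2, 2.4, 4.8 (the 6.0 cap is not reached in 4 retries)
```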
@@ -61,6 +71,7 @@ class DatasetGenerator:
  max_chunks: int = 30,
  relevance_margin: float = 1.5,
  embedding_model: str = "openai:text-embedding-3-small",
+ verbose: bool = False,
  ):
  self.model = model
  self.input_format = input_format
@@ -78,8 +89,132 @@ class DatasetGenerator:
  self.max_chunks = max_chunks
  self.relevance_margin = relevance_margin
  self.embedding_model = embedding_model
+ self.verbose = verbose
+
+ def _log(self, message: str, color: str = Colors.CYAN):
+ """Log message with color if verbose mode is enabled"""
+ if self.verbose:
+ print(f"{color}{message}{Colors.ENDC}")
+
+ def _log_step(self, step_name: str, step_num: int = None):
+ """Log generation step"""
+ if self.verbose:
+ prefix = f"[{step_num}] " if step_num else ""
+ print(f"{Colors.DIM} {prefix}{step_name}...{Colors.ENDC}")
+
+ def _log_progress(self, current: int, total: int, label: str = "Progress"):
+ """Log progress bar"""
+ if self.verbose:
+ percentage = (current / total) * 100 if total > 0 else 0
+ bar_length = 30
+ filled = int(bar_length * current / total) if total > 0 else 0
+ bar = '█' * filled + '░' * (bar_length - filled)
+ print(
+ f"{Colors.CYAN} {label}: [{bar}] {current}/{total} ({percentage:.0f}%){Colors.ENDC}")
+
+ def _print_header(self, title: str):
+ """Print beautiful header"""
+ if self.verbose:
+ import shutil
+ terminal_width = shutil.get_terminal_size().columns
+ WIDTH = terminal_width // 2
+ WIDTH = max(WIDTH, 60)
+
+ border = "═" * WIDTH
+ title_text = f"🎯 {title}"
+ padding = WIDTH - len(title_text)
+ left_pad = padding // 2
+ right_pad = padding - left_pad
+ centered_title = " " * left_pad + title_text + " " * right_pad
+
+ print(f"\n{Colors.BOLD}{Colors.CYAN}╔{border}╗{Colors.ENDC}")
+ print(
+ f"{Colors.BOLD}{Colors.CYAN}║{Colors.ENDC}{centered_title}{Colors.BOLD}{Colors.CYAN}║{Colors.ENDC}")
+ print(f"{Colors.BOLD}{Colors.CYAN}╚{border}╝{Colors.ENDC}\n")
+
+ def _print_summary(self, dataset: List[dict], elapsed_time: float, total_cost: float = 0.0):
+ """Print generation summary with full dataset in readable format"""
+ if not self.verbose:
+ return
+
+ import shutil
+ import textwrap
+ terminal_width = shutil.get_terminal_size().columns
+ WIDTH = terminal_width - 10
+ WIDTH = max(WIDTH, 80)
+
+ print(
+ f"\n{Colors.BOLD}{Colors.GREEN}✅ Dataset Generation Complete{Colors.ENDC}\n")
+ print(f"{Colors.BOLD}Summary:{Colors.ENDC}")
+ print(
+ f" 📊 Total rows generated: {Colors.YELLOW}{len(dataset)}{Colors.ENDC}")
+ print(
+ f" ⏱️ Time elapsed: {Colors.YELLOW}{elapsed_time:.2f}s{Colors.ENDC}")
+ if total_cost > 0:
+ print(
+ f" 💰 Total cost: {Colors.BLUE}${total_cost:.6f}{Colors.ENDC}")
+
+ # Show full dataset
+ if dataset:
+ print(f"\n{Colors.BOLD}Generated Dataset:{Colors.ENDC}\n")
+
+ for idx, row in enumerate(dataset, 1):
+ # Header
+ print(f"{Colors.CYAN}{'─' * WIDTH}{Colors.ENDC}")
+ print(
+ f"{Colors.CYAN}{Colors.BOLD}Row {idx}/{len(dataset)}:{Colors.ENDC}")
+ print(f"{Colors.CYAN}{'─' * WIDTH}{Colors.ENDC}")
+
+ # Fields
+ for key, value in row.items():
+ value_str = str(value)
+
+ # Key with proper formatting
+ print(f"{Colors.BOLD}{key}:{Colors.ENDC}", end=" ")
+
+ # Wrap long text to fit terminal width
+ # Calculate available width (WIDTH - key length - 2 for ": ")
+ available_width = WIDTH - len(key) - 2
+
+ if len(value_str) <= available_width:
+ # Short value - print on same line
+ print(value_str)
+ else:
+ # Long value - wrap to multiple lines with proper indentation
+ print() # New line after key
+ wrapped = textwrap.fill(
+ value_str,
+ width=WIDTH - 2,
+ initial_indent=" ",
+ subsequent_indent=" ",
+ break_long_words=False,
+ break_on_hyphens=False
+ )
+ print(f"{Colors.DIM}{wrapped}{Colors.ENDC}")
+
+ print() # Spacing after row
+
+ print(f"{Colors.CYAN}{'─' * WIDTH}{Colors.ENDC}\n")
+
+ # Add spacing between rows
+ if idx < len(dataset):
+ print()

  async def generate_from_scratch(self) -> List[dict]:
+
+ start_time = time.time()
+
+ if self.verbose:
+ self._print_header("Dataset Generation from Scratch")
+ self._log(f"Configuration:", Colors.BOLD)
+ self._log(f" Model: {self.model}")
+ self._log(f" Max rows: {self.max_rows}")
+ self._log(f" Test types: {', '.join(self.test_types)}")
+ self._log(f" Language: {self.language}")
+ self._log("")
+
+ self._log_step("Generating prompt", 1)
+
  prompt = dataset_generation_from_scratch_prompt(
  max_rows=self.max_rows,
  agent_description=self.agent_description,
@@ -92,23 +227,52 @@ class DatasetGenerator:
  language=self.language
  )

- raw, _ = await chat_complete(
+ self._log_step("Calling LLM to generate dataset", 2)
+
+ raw, cost = await chat_complete(
  llm=self.model,
  messages=[{"role": "user", "content": prompt}],
  temperature=self.temperature,
  )

+ self._log_step("Parsing response", 3)
+
  try:
  raw_json = extract_json_block(raw)
  data = json.loads(raw_json)
  assert isinstance(data, list), "not a JSON array"
+ elapsed_time = time.time() - start_time
+ self._print_summary(data, elapsed_time, cost or 0.0)
+
  return data
  except Exception as exc:
+ if self.verbose:
+ self._log(f"❌ Failed to parse dataset", Colors.RED)
  raise RuntimeError(f"Failed to parse dataset:\n{exc}\n\n{raw}")

  async def generate_from_documents(self, file_paths: List[str]) -> List[dict]:
-
+ """Generate dataset from documents"""
+ start_time = time.time()
+ total_cost = 0.0
+
+ if self.verbose:
+ self._print_header("Dataset Generation from Documents")
+ self._log(f"Configuration:", Colors.BOLD)
+ self._log(f" Model: {self.model}")
+ self._log(f" Max rows: {self.max_rows}")
+ self._log(f" Documents: {len(file_paths)}")
+ self._log(f" Chunk size: {self.chunk_size}")
+ self._log(f" Test types: {', '.join(self.test_types)}")
+ self._log("")
+
+ self._log_step("Loading documents", 1)
  docs = load_documents(file_paths)
+
+ if self.verbose:
+ self._log(
+ f" ✅ Loaded {len(file_paths)} file(s) → {len(docs)} page(s)/document(s)", Colors.GREEN)
+
+ self._log_step("Chunking documents", 2)
  doc_chunks = chunk_documents(docs,
  chunk_size=self.chunk_size,
  chunk_overlap=self.chunk_overlap)
@@ -117,8 +281,15 @@ class DatasetGenerator:
  if not chunks_text:
  raise ValueError("No text extracted from documents.")

+ if self.verbose:
+ self._log(f" ✅ Created {len(chunks_text)} chunks", Colors.GREEN)
+
+ self._log_step("Ranking chunks by relevance", 3)
  ranked_chunks = await self._rank_chunks_by_relevance(chunks_text)

+ if self.verbose:
+ self._log(f" ✅ Ranked {len(ranked_chunks)} chunks", Colors.GREEN)
+
  total_chunks = len(ranked_chunks)
  rows_per_chunk = max(1, math.ceil(self.max_rows / total_chunks))

@@ -127,11 +298,21 @@ class DatasetGenerator:
  self.max_chunks)
  selected_chunks = ranked_chunks[:top_k]

+ if self.verbose:
+ self._log(
+ f" 📌 Selected top {len(selected_chunks)} chunks for generation", Colors.YELLOW)
+ self._log("")
+
  dataset: list[dict] = []

  MAX_PROMPT_CHARS = 24_000

- for chunk in selected_chunks:
+ self._log_step(f"Generating dataset from chunks", 4)
+
+ for i, chunk in enumerate(selected_chunks):
+ if self.verbose:
+ self._log_progress(
+ i + 1, len(selected_chunks), "Processing chunks")

  safe_chunk = chunk if len(
  chunk) <= MAX_PROMPT_CHARS else chunk[:MAX_PROMPT_CHARS]
@@ -149,24 +330,39 @@ class DatasetGenerator:
  language=self.language
  )

- raw, _ = await retry_async(
+ raw, cost = await retry_async(
  chat_complete,
  llm=self.model,
  messages=[{"role": "user", "content": prompt}],
  temperature=self.temperature,
  )

+ total_cost += cost or 0.0
+
  try:
  chunk_data = json.loads(extract_json_block(raw))
  assert isinstance(chunk_data, list)
  dataset.extend(chunk_data)
+
+ if self.verbose:
+ self._log(
+ f" ✅ Generated {len(chunk_data)} rows from chunk {i+1}", Colors.GREEN)
+
  except Exception as exc:
- raise RuntimeError(f"Chunk parsing error:\n{exc}\n\n{raw}")
+ if self.verbose:
+ self._log(
+ f" ⚠️ Chunk {i+1} parsing failed, skipping", Colors.YELLOW)
+ continue

  if len(dataset) >= self.max_rows:
  break

- return dataset[: self.max_rows]
+ final_dataset = dataset[: self.max_rows]
+ elapsed_time = time.time() - start_time
+
+ self._print_summary(final_dataset, elapsed_time, total_cost)
+
+ return final_dataset

  async def _rank_chunks_by_relevance(self, chunks: list[str]) -> list[str]:
  """