eval-ai-library 0.2.2__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of eval-ai-library might be problematic.
- eval_ai_library-0.3.1.dist-info/METADATA +1042 -0
- eval_ai_library-0.3.1.dist-info/RECORD +34 -0
- eval_lib/__init__.py +19 -6
- eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py +9 -3
- eval_lib/agent_metrics/role_adherence_metric/role_adherence.py +13 -4
- eval_lib/agent_metrics/task_success_metric/task_success_rate.py +24 -23
- eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py +8 -2
- eval_lib/datagenerator/datagenerator.py +208 -12
- eval_lib/datagenerator/document_loader.py +29 -29
- eval_lib/evaluate.py +0 -22
- eval_lib/llm_client.py +221 -78
- eval_lib/metric_pattern.py +208 -152
- eval_lib/metrics/answer_precision_metric/answer_precision.py +8 -3
- eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py +8 -2
- eval_lib/metrics/bias_metric/bias.py +12 -2
- eval_lib/metrics/contextual_precision_metric/contextual_precision.py +9 -4
- eval_lib/metrics/contextual_recall_metric/contextual_recall.py +7 -3
- eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py +9 -2
- eval_lib/metrics/custom_metric/custom_eval.py +238 -204
- eval_lib/metrics/faithfulness_metric/faithfulness.py +7 -2
- eval_lib/metrics/geval/geval.py +8 -2
- eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py +7 -3
- eval_lib/metrics/toxicity_metric/toxicity.py +8 -2
- eval_lib/utils.py +44 -29
- eval_ai_library-0.2.2.dist-info/METADATA +0 -779
- eval_ai_library-0.2.2.dist-info/RECORD +0 -34
- {eval_ai_library-0.2.2.dist-info → eval_ai_library-0.3.1.dist-info}/WHEEL +0 -0
- {eval_ai_library-0.2.2.dist-info → eval_ai_library-0.3.1.dist-info}/licenses/LICENSE +0 -0
- {eval_ai_library-0.2.2.dist-info → eval_ai_library-0.3.1.dist-info}/top_level.txt +0 -0
eval_ai_library-0.3.1.dist-info/RECORD
@@ -0,0 +1,34 @@
+eval_ai_library-0.3.1.dist-info/licenses/LICENSE,sha256=rK9uLDgWNrCHNdp-Zma_XghDE7Fs0u0kDi3WMcmYx6w,1074
+eval_lib/__init__.py,sha256=Jayvtz47_-0POIspT_LJKZ6jmWyf0fQc9fqQ5KvdPRI,3029
+eval_lib/evaluate.py,sha256=GjlXZb5dnl44LCaJwdkyGCYcC50zoNZn3NrofzNAVJ0,11490
+eval_lib/evaluation_schema.py,sha256=7IDd_uozqewhh7k0p1hKut_20udvRxxkV6thclxKUg0,1904
+eval_lib/llm_client.py,sha256=3eMcarKLkDLDVh4AOxgWbaIzXlzpqsmEfJXNTBonNic,13633
+eval_lib/metric_pattern.py,sha256=wULgMNDeAqJC_Qjglo7bYzY2eGhA_PmY_hA_qGfg0sI,11730
+eval_lib/price.py,sha256=jbmkkUTxPuXrkSHuaJYPl7jSzfDIzQ9p_swWWs26UJ0,1986
+eval_lib/py.typed,sha256=8PjyZ1aVoQpRVvt71muvuq5qE-jTFZkK-GLHkhdebmc,26
+eval_lib/testcases_schema.py,sha256=qI4o6kX0jH1DR3sHGXUnu3Cyt2oq7rGlsMlOaXSt6F4,696
+eval_lib/utils.py,sha256=-hwagFFn3_QjgyLqF8Qx7JIkpgOEI8-F14eycog3bgc,3141
+eval_lib/agent_metrics/__init__.py,sha256=20Y4BsicD2s7OkOBQPBvB2JKStBDtplv52_q6q35Vgo,525
+eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py,sha256=7j89HOTsu0rMoFnznTjMl-tqQpnZlS6ZIdrHPueEbb8,8289
+eval_lib/agent_metrics/role_adherence_metric/role_adherence.py,sha256=kJsYj9H3W3Mw2iBqj3Br_glP8gU6_diFPiJhRSnHGxg,9225
+eval_lib/agent_metrics/task_success_metric/task_success_rate.py,sha256=v5cO07cymo9GWSZ34ryAx3ya4DDBiRWih9w0bm_j_R8,12497
+eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py,sha256=qbVMtD6EWKah27FogLEiEh6pBX-k2wwKbwM_kFkvYeQ,4220
+eval_lib/datagenerator/datagenerator.py,sha256=NQZIQuSCmryxIT3lTLS1PpJjENmGqARtR-zTiQ8OvRk,15513
+eval_lib/datagenerator/document_loader.py,sha256=vnQUz_Dxb3SxcVPUmMXZe-rgfPp6OfHb6D2Ie9iqPms,17025
+eval_lib/datagenerator/prompts.py,sha256=iQAYitAbLud3vWJnXGh_OCF4syWYS_S71zZEPI6qYAU,7213
+eval_lib/metrics/__init__.py,sha256=3qClCCjPXt5i0c38g5krfuQnqlAXEl-jhAHy1C_ICMY,1213
+eval_lib/metrics/answer_precision_metric/answer_precision.py,sha256=AxPmwzGFU7tnTrrZuQZ7ow4nNSD-blDHdAGwhMHMxjM,15040
+eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py,sha256=-Xb9I-BVMDf5E55FbJzP6IyvD6IVTUPBI-uCrRnEboc,8522
+eval_lib/metrics/bias_metric/bias.py,sha256=BVH8xlTUTRfVG_F1kauwpGAkVKBkUWhM9rUsrrLhpRU,4020
+eval_lib/metrics/contextual_precision_metric/contextual_precision.py,sha256=CQOb6uR2KeffTkhPSqZae56sX5tXMr0pJVM5W_wU1fU,3993
+eval_lib/metrics/contextual_recall_metric/contextual_recall.py,sha256=iw73_hGLWklHZSBkCRkPDNUt1xD5dknA_7CZ6Efkf5w,3913
+eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py,sha256=G1cYlA95YNcpEqQsALVi6ZbyNzWr9ccO2DATcsES5pk,6546
+eval_lib/metrics/custom_metric/custom_eval.py,sha256=Ov3-i6IytaJXlzcMgp46SRSeb8scyhqyuR2BqUtDFoM,11385
+eval_lib/metrics/faithfulness_metric/faithfulness.py,sha256=OqamlhTOps7d-NOStSIK7Tq-UAJXNql8VKjgtuqlDhA,5930
+eval_lib/metrics/geval/geval.py,sha256=mNciHXnqU2drOJsWlYmbwftGiKM89-Ykw2f6XneIGBM,10629
+eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py,sha256=4QqYgGMcp6W9Lw-v4s0AlUhMSOKvBOEgnLvhqVXaT9I,4286
+eval_lib/metrics/toxicity_metric/toxicity.py,sha256=rBE1_fvpbCRdBpBep1y1LTIhofKR8GD4Eh76EOYzxL0,4076
+eval_ai_library-0.3.1.dist-info/METADATA,sha256=UytyyuWVrL3CuvK7hQC_y-AqoabHEPI0euolxhmfZrQ,37706
+eval_ai_library-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+eval_ai_library-0.3.1.dist-info/top_level.txt,sha256=uQHpEd2XI0oZgq1eCww9zMvVgDJgwXMWkCD45fYUzEg,9
+eval_ai_library-0.3.1.dist-info/RECORD,,
eval_lib/__init__.py
CHANGED
@@ -1,3 +1,5 @@
+# eval_lib/__init__.py
+
 """
 Eval AI Library - Comprehensive AI Model Evaluation Framework

@@ -5,12 +7,12 @@ A powerful library for evaluating AI models with support for multiple LLM provid
 and a wide range of evaluation metrics for RAG systems and AI agents.
 """

-__version__ = "0.1
-__author__ = "Aleksandr
+__version__ = "0.3.1"
+__author__ = "Aleksandr Meshkov"

 # Core evaluation functions
 from eval_lib.evaluate import evaluate, evaluate_conversations
-from eval_lib.utils import score_agg
+from eval_lib.utils import score_agg, extract_json_block

 # Test case schemas
 from eval_lib.testcases_schema import (
@@ -63,9 +65,20 @@ from eval_lib.agent_metrics import (
     KnowledgeRetentionMetric
 )

-
-
-
+
+def __getattr__(name):
+    """
+    Lazy import for modules with heavy dependencies.
+    DataGenerator is imported only when it is actually used.
+    """
+    if name == "DataGenerator":
+        from eval_lib.datagenerator.datagenerator import DataGenerator
+        return DataGenerator
+    if name == "DocumentLoader":
+        from eval_lib.datagenerator.document_loader import DocumentLoader
+        return DocumentLoader
+    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
+

 __all__ = [
     # Version
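The new module-level __getattr__ is the PEP 562 lazy-import pattern: importing eval_lib no longer pulls in the data-generation stack, which only loads on first attribute access. A minimal sketch of the observable behaviour, assuming eval-ai-library 0.3.1 is installed and that nothing else in eval_lib imports the datagenerator subpackage eagerly:

import sys

import eval_lib  # cheap: the datagenerator subpackage is not imported yet

print("eval_lib.datagenerator.datagenerator" in sys.modules)  # expected: False

DataGenerator = eval_lib.DataGenerator  # first access triggers eval_lib.__getattr__
print("eval_lib.datagenerator.datagenerator" in sys.modules)  # expected: True

try:
    eval_lib.SomethingMissing  # unknown names still fail loudly
except AttributeError as err:
    print(err)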
eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py
CHANGED
@@ -36,6 +36,7 @@ class KnowledgeRetentionMetric(ConversationalMetricPattern):
         model: str,
         threshold: float = 0.7,
         temperature: float = 0.5,
+        verbose: bool = False
     ):
         """
         Initialize Knowledge Retention metric.
@@ -45,7 +46,7 @@ class KnowledgeRetentionMetric(ConversationalMetricPattern):
             threshold: Success threshold (0.0-1.0)
             temperature: Score aggregation temperature for softmax
         """
-        super().__init__(model=model, threshold=threshold)
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
         self.temperature = temperature

     # ==================== HELPER METHODS ====================
@@ -214,18 +215,23 @@ Verdicts:
             "verdict_weights": {v["verdict"]: VERDICT_WEIGHTS.get(v["verdict"], 0.0) for v in verdicts},
             "comment_verdict_weights": "Numeric weights assigned to each verdict for score calculation.",
             "final_score": final_score,
-            "comment_final_score": f"
+            "comment_final_score": f"Weighted average of verdict scores using softmax aggregation (temperature={self.temperature}).",
             "threshold": self.threshold,
+            "temperature": self.temperature,
             "success": success,
             "comment_success": "Whether the retention score meets the required threshold.",
             "final_reason": summary,
             "comment_reasoning": "Concise explanation of the assistant's knowledge retention performance."
         }

-        return {
+        result = {
+            "name": self.name,
             "score": final_score,
             "success": success,
             "reason": summary,
             "evaluation_cost": round(total_cost, 6),
             "evaluation_log": evaluation_log
         }
+        self.print_result(result)
+
+        return result
eval_lib/agent_metrics/role_adherence_metric/role_adherence.py
CHANGED
@@ -36,6 +36,8 @@ class RoleAdherenceMetric(ConversationalMetricPattern):
         model: str,
         threshold: float = 0.7,
         temperature: float = 0.5,
+        verbose: bool = False,
+        chatbot_role: str = ""
     ):
         """
         Initialize Role Adherence metric.
@@ -45,8 +47,9 @@ class RoleAdherenceMetric(ConversationalMetricPattern):
             threshold: Success threshold (0.0-1.0)
             temperature: Score aggregation temperature for softmax
         """
-        super().__init__(model=model, threshold=threshold)
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
         self.temperature = temperature
+        self.role_description = chatbot_role

     # ==================== HELPER METHODS ====================

@@ -201,7 +204,7 @@ Return JSON array:
         total_cost = 0.0

         # Step 1: Extract role
-        role_description = test_case.chatbot_role or "No role specified"
+        role_description = test_case.chatbot_role or self.chatbot_role or "No role specified"

         # Step 2: Format dialogue
         dialogue_text = self._render_dialogue(test_case.turns)
@@ -234,18 +237,24 @@ Return JSON array:
             "verdict_weights": {v["verdict"]: VERDICT_WEIGHTS.get(v["verdict"], 0.0) for v in verdicts},
             "comment_verdict_weights": "Numeric weights assigned to each verdict for score calculation.",
             "final_score": final_score,
-            "comment_final_score": f"
+            "comment_final_score": f"Weighted average of verdict scores using softmax aggregation (temperature={self.temperature}).",
             "threshold": self.threshold,
+            "temperature": self.temperature,
             "success": success,
             "comment_success": "Whether the role adherence score meets the required threshold.",
             "final_reason": summary,
             "comment_reasoning": "Concise explanation of how well the assistant maintained its assigned role."
         }

-        return {
+        result = {
+            "name": self.name,
             "score": final_score,
             "success": success,
             "reason": summary,
             "evaluation_cost": round(total_cost, 6),
             "evaluation_log": evaluation_log
         }
+
+        self.print_result(result)
+
+        return result
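Both agent metrics above now take a verbose flag, which is forwarded to the shared ConversationalMetricPattern and makes the metric print its result dict via print_result; RoleAdherenceMetric additionally accepts a chatbot_role fallback used when the test case carries no role. A construction-only sketch: the model string is illustrative, RoleAdherenceMetric is assumed to be re-exported from the package root like KnowledgeRetentionMetric, and running the metrics afterwards is unchanged from 0.2.x:

from eval_lib import KnowledgeRetentionMetric, RoleAdherenceMetric

retention = KnowledgeRetentionMetric(
    model="openai:gpt-4o-mini",  # illustrative model identifier
    threshold=0.7,
    temperature=0.5,
    verbose=True,                # new in 0.3.1: result is also pretty-printed
)

adherence = RoleAdherenceMetric(
    model="openai:gpt-4o-mini",
    threshold=0.7,
    verbose=True,
    chatbot_role="Polite banking support assistant",  # fallback role, new in 0.3.1
)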
eval_lib/agent_metrics/task_success_metric/task_success_rate.py
CHANGED
@@ -11,6 +11,13 @@ from eval_lib.testcases_schema import ConversationalEvalTestCase
 from eval_lib.metric_pattern import ConversationalMetricPattern
 from eval_lib.llm_client import chat_complete
 from eval_lib.utils import score_agg, extract_json_block
+import re
+
+
+def _contains_links(dialogue: str) -> bool:
+    """Check if dialogue contains any URLs/links"""
+    url_pattern = r'https?://[^\s]+|www\.[^\s]+|\[.*?\]\(.*?\)'
+    return bool(re.search(url_pattern, dialogue))


 # Verdict weights for task completion levels
@@ -23,7 +30,7 @@ VERDICT_WEIGHTS = {
 }

 # Configuration constants
-MAX_CRITERIA =
+MAX_CRITERIA = 2
 LINK_CRITERION = "The user got the link to the requested resource."


@@ -34,13 +41,13 @@ class TaskSuccessRateMetric(ConversationalMetricPattern):
     """

     name = "taskSuccessRateMetric"
-    template_cls = None

     def __init__(
         self,
         model: str,
         threshold: float = 0.7,
-        temperature: float =
+        temperature: float = 0.5,
+        verbose: bool = False
     ):
         """
         Initialize Task Success Rate metric.
@@ -50,7 +57,7 @@ class TaskSuccessRateMetric(ConversationalMetricPattern):
             threshold: Success threshold (0.0-1.0)
             temperature: Score aggregation temperature for softmax
         """
-        super().__init__(model=model, threshold=threshold)
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
         self.temperature = temperature

     # ==================== HELPER METHODS ====================
@@ -118,24 +125,20 @@ Criteria: [

         return text.strip(), cost or 0.0

-    async def _generate_success_criteria(self, goal: str) -> Tuple[List[str], float]:
+    async def _generate_success_criteria(self, goal: str, dialogue: str) -> Tuple[List[str], float]:
         """
         Generate concrete success criteria for the user's goal.

         Args:
             goal: The inferred user goal
-
-        Returns:
-            Tuple of (criteria_list, llm_cost)
+            dialogue: Full conversation text (needed to check for links)
         """
         prompt = (
             f"{self._prompt_criteria_few_shot()}\n\n"
             f"Now do the same for the next case.\n\n"
             f"User goal: {goal}\n\n"
             f"List up to {MAX_CRITERIA} concrete SUCCESS CRITERIA that could realistically be satisfied "
-            f"within a brief chat of 2–5 turns
-            "Then **add** this exact sentence: "
-            f"\"{LINK_CRITERION}\"\n\n"
+            f"within a brief chat of 2–5 turns.\n\n"
             "Each criterion must be a short, observable statement.\n"
             "Return only a JSON array of strings."
         )
@@ -153,17 +156,10 @@ Criteria: [
         if not isinstance(criteria, list):
             raise ValueError("Expected JSON array of criteria")

-        #
-        if LINK_CRITERION not in criteria:
+        # Add LINK_CRITERION only if dialogue contains links
+        if _contains_links(dialogue) and LINK_CRITERION not in criteria:
             criteria.append(LINK_CRITERION)

-        # Keep LINK_CRITERION first and limit to MAX_CRITERIA
-        if len(criteria) > MAX_CRITERIA:
-            criteria = (
-                [LINK_CRITERION] +
-                [c for c in criteria if c != LINK_CRITERION][:MAX_CRITERIA - 1]
-            )
-
         # Truncate to MAX_CRITERIA
         criteria = criteria[:MAX_CRITERIA]

@@ -296,7 +292,7 @@ Criteria: [
         total_cost += cost

         # Step 3: Generate success criteria
-        success_criteria, cost = await self._generate_success_criteria(user_goal)
+        success_criteria, cost = await self._generate_success_criteria(user_goal, dialogue_text)
         total_cost += cost

         # Step 4: Generate verdicts for each criterion
@@ -330,18 +326,23 @@
             "verdict_weights": {i: VERDICT_WEIGHTS.get(v["verdict"], 0.0) for i, v in enumerate(verdicts)},
             "comment_verdict_weights": "Numeric weights assigned to each verdict for score calculation.",
             "final_score": final_score,
-            "comment_final_score": f"
+            "comment_final_score": f"Weighted average of verdict scores using softmax aggregation (temperature={self.temperature}).",
             "threshold": self.threshold,
+            "temperature": self.temperature,
             "success": success,
             "comment_success": "Whether the task success score meets the required threshold.",
             "final_reason": summary,
             "comment_reasoning": "Concise explanation of the overall task completion assessment."
         }

-        return {
+        result = {
+            "name": self.name,
             "score": final_score,
             "success": success,
             "reason": summary,
             "evaluation_cost": round(total_cost, 6),
             "evaluation_log": evaluation_log
         }
+        self.print_result(result)
+
+        return result
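LINK_CRITERION is now appended only when _contains_links actually finds a link in the conversation. A standalone probe of that check; the pattern is copied from the hunk above, the sample strings are invented:

import re

# Pattern from _contains_links: http(s) URLs, bare www. hosts, markdown [text](target) links.
URL_PATTERN = r'https?://[^\s]+|www\.[^\s]+|\[.*?\]\(.*?\)'

samples = [
    "Sure, here you go: https://example.com/reset-password",  # http(s) URL
    "Try www.example.org for the docs",                       # bare www host
    "See [the guide](docs/setup.md) for details",             # markdown-style link
    "I cannot share that information.",                       # no link
]

for text in samples:
    print(bool(re.search(URL_PATTERN, text)), "-", text)
# Expected: True, True, True, False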
eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py
CHANGED
@@ -15,11 +15,12 @@ class ToolCorrectnessMetric(MetricPattern):
     def __init__(
         self,
         threshold: float = 0.5,
+        verbose: bool = False,
         evaluation_params: List[str] = [],
         should_exact_match: bool = False,
         should_consider_ordering: bool = False
     ):
-        super().__init__(model=None, threshold=threshold)
+        super().__init__(model=None, threshold=threshold, verbose=verbose)
         self.evaluation_params = evaluation_params
         self.should_exact_match = should_exact_match
         self.should_consider_ordering = should_consider_ordering
@@ -31,13 +32,18 @@ class ToolCorrectnessMetric(MetricPattern):
         score = self.calculate_score()
         reason = self.generate_reason()

-        return {
+        result = {
+            "name": self.name,
             "score": score,
             "success": score >= self.threshold,
             "reason": reason,
             "evaluation_cost": 0.0  # No LLM cost for this metric
         }

+        self.print_result(result)
+
+        return result
+
     def generate_reason(self) -> str:
         called_names = self.tools_called
         expected_names = self.expected_tools
eval_lib/datagenerator/datagenerator.py
CHANGED
@@ -9,16 +9,27 @@ from eval_lib.utils import extract_json_block
 import asyncio
 import random
 import json
+import time
+
+# Colors for beautiful console output
+
+
+class Colors:
+    HEADER = '\033[95m'
+    BLUE = '\033[94m'
+    CYAN = '\033[96m'
+    GREEN = '\033[92m'
+    YELLOW = '\033[93m'
+    RED = '\033[91m'
+    ENDC = '\033[0m'
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+    DIM = '\033[2m'


 async def retry_async(fn, *args, retries=4, base_delay=0.6, max_delay=6.0,
                       retriable_statuses=(429, 500, 502, 503, 504),
                       **kwargs):
-    """
-    fn is a coroutine that may raise exceptions such as:
-        - an HTTPException-like error carrying .status_code
-        - an Exception whose message mentions 'Service Unavailable', etc.
-    """
     attempt = 0
     while True:
         try:
@@ -34,7 +45,6 @@ async def retry_async(fn, *args, retries=4, base_delay=0.6, max_delay=6.0,
             if attempt > retries or not retriable:
                 raise

-            # exponential backoff + jitter
             delay = min(max_delay, base_delay * (2 ** (attempt - 1)))
             delay += random.uniform(0, 0.4)
             await asyncio.sleep(delay)
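retry_async keeps its exponential backoff with jitter; only the Russian docstring and comment were dropped. With the defaults above, the wait before retry n is min(6.0, 0.6 * 2**(n - 1)) seconds plus up to 0.4 s of jitter, which works out as follows:

import random

retries, base_delay, max_delay = 4, 0.6, 6.0  # defaults from retry_async above

for attempt in range(1, retries + 1):
    delay = min(max_delay, base_delay * (2 ** (attempt - 1)))  # exponential, capped at max_delay
    delay += random.uniform(0, 0.4)                            # same jitter range as the code
    print(f"retry {attempt}: sleep ~{delay:.2f}s")
# Base waits before jitter: 0.6s, 1.2s, 2.4s, 4.8s.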
@@ -61,6 +71,7 @@ class DatasetGenerator:
         max_chunks: int = 30,
         relevance_margin: float = 1.5,
         embedding_model: str = "openai:text-embedding-3-small",
+        verbose: bool = False,
     ):
         self.model = model
         self.input_format = input_format
@@ -78,8 +89,132 @@ class DatasetGenerator:
         self.max_chunks = max_chunks
         self.relevance_margin = relevance_margin
         self.embedding_model = embedding_model
+        self.verbose = verbose
+
+    def _log(self, message: str, color: str = Colors.CYAN):
+        """Log message with color if verbose mode is enabled"""
+        if self.verbose:
+            print(f"{color}{message}{Colors.ENDC}")
+
+    def _log_step(self, step_name: str, step_num: int = None):
+        """Log generation step"""
+        if self.verbose:
+            prefix = f"[{step_num}] " if step_num else ""
+            print(f"{Colors.DIM}  {prefix}{step_name}...{Colors.ENDC}")
+
+    def _log_progress(self, current: int, total: int, label: str = "Progress"):
+        """Log progress bar"""
+        if self.verbose:
+            percentage = (current / total) * 100 if total > 0 else 0
+            bar_length = 30
+            filled = int(bar_length * current / total) if total > 0 else 0
+            bar = '█' * filled + '░' * (bar_length - filled)
+            print(
+                f"{Colors.CYAN}  {label}: [{bar}] {current}/{total} ({percentage:.0f}%){Colors.ENDC}")
+
+    def _print_header(self, title: str):
+        """Print beautiful header"""
+        if self.verbose:
+            import shutil
+            terminal_width = shutil.get_terminal_size().columns
+            WIDTH = terminal_width // 2
+            WIDTH = max(WIDTH, 60)
+
+            border = "═" * WIDTH
+            title_text = f"🎯 {title}"
+            padding = WIDTH - len(title_text)
+            left_pad = padding // 2
+            right_pad = padding - left_pad
+            centered_title = " " * left_pad + title_text + " " * right_pad
+
+            print(f"\n{Colors.BOLD}{Colors.CYAN}╔{border}╗{Colors.ENDC}")
+            print(
+                f"{Colors.BOLD}{Colors.CYAN}║{Colors.ENDC}{centered_title}{Colors.BOLD}{Colors.CYAN}║{Colors.ENDC}")
+            print(f"{Colors.BOLD}{Colors.CYAN}╚{border}╝{Colors.ENDC}\n")
+
+    def _print_summary(self, dataset: List[dict], elapsed_time: float, total_cost: float = 0.0):
+        """Print generation summary with full dataset in readable format"""
+        if not self.verbose:
+            return
+
+        import shutil
+        import textwrap
+        terminal_width = shutil.get_terminal_size().columns
+        WIDTH = terminal_width - 10
+        WIDTH = max(WIDTH, 80)
+
+        print(
+            f"\n{Colors.BOLD}{Colors.GREEN}✅ Dataset Generation Complete{Colors.ENDC}\n")
+        print(f"{Colors.BOLD}Summary:{Colors.ENDC}")
+        print(
+            f"  📊 Total rows generated: {Colors.YELLOW}{len(dataset)}{Colors.ENDC}")
+        print(
+            f"  ⏱️  Time elapsed: {Colors.YELLOW}{elapsed_time:.2f}s{Colors.ENDC}")
+        if total_cost > 0:
+            print(
+                f"  💰 Total cost: {Colors.BLUE}${total_cost:.6f}{Colors.ENDC}")
+
+        # Show full dataset
+        if dataset:
+            print(f"\n{Colors.BOLD}Generated Dataset:{Colors.ENDC}\n")
+
+            for idx, row in enumerate(dataset, 1):
+                # Header
+                print(f"{Colors.CYAN}{'─' * WIDTH}{Colors.ENDC}")
+                print(
+                    f"{Colors.CYAN}{Colors.BOLD}Row {idx}/{len(dataset)}:{Colors.ENDC}")
+                print(f"{Colors.CYAN}{'─' * WIDTH}{Colors.ENDC}")
+
+                # Fields
+                for key, value in row.items():
+                    value_str = str(value)
+
+                    # Key with proper formatting
+                    print(f"{Colors.BOLD}{key}:{Colors.ENDC}", end=" ")
+
+                    # Wrap long text to fit terminal width
+                    # Calculate available width (WIDTH - key length - 2 for ": ")
+                    available_width = WIDTH - len(key) - 2
+
+                    if len(value_str) <= available_width:
+                        # Short value - print on same line
+                        print(value_str)
+                    else:
+                        # Long value - wrap to multiple lines with proper indentation
+                        print()  # New line after key
+                        wrapped = textwrap.fill(
+                            value_str,
+                            width=WIDTH - 2,
+                            initial_indent="  ",
+                            subsequent_indent="  ",
+                            break_long_words=False,
+                            break_on_hyphens=False
+                        )
+                        print(f"{Colors.DIM}{wrapped}{Colors.ENDC}")

+                print()  # Spacing after row
+
+                print(f"{Colors.CYAN}{'─' * WIDTH}{Colors.ENDC}\n")
+
+                # Add spacing between rows
+                if idx < len(dataset):
+                    print()

     async def generate_from_scratch(self) -> List[dict]:
+
+        start_time = time.time()
+
+        if self.verbose:
+            self._print_header("Dataset Generation from Scratch")
+            self._log(f"Configuration:", Colors.BOLD)
+            self._log(f"  Model: {self.model}")
+            self._log(f"  Max rows: {self.max_rows}")
+            self._log(f"  Test types: {', '.join(self.test_types)}")
+            self._log(f"  Language: {self.language}")
+            self._log("")
+
+        self._log_step("Generating prompt", 1)
+
         prompt = dataset_generation_from_scratch_prompt(
             max_rows=self.max_rows,
             agent_description=self.agent_description,
@@ -92,23 +227,52 @@ class DatasetGenerator:
             language=self.language
         )

-
+        self._log_step("Calling LLM to generate dataset", 2)
+
+        raw, cost = await chat_complete(
             llm=self.model,
             messages=[{"role": "user", "content": prompt}],
             temperature=self.temperature,
         )

+        self._log_step("Parsing response", 3)
+
         try:
             raw_json = extract_json_block(raw)
             data = json.loads(raw_json)
             assert isinstance(data, list), "not a JSON array"
+            elapsed_time = time.time() - start_time
+            self._print_summary(data, elapsed_time, cost or 0.0)
+
             return data
         except Exception as exc:
+            if self.verbose:
+                self._log(f"❌ Failed to parse dataset", Colors.RED)
             raise RuntimeError(f"Failed to parse dataset:\n{exc}\n\n{raw}")

     async def generate_from_documents(self, file_paths: List[str]) -> List[dict]:
-
+        """Generate dataset from documents"""
+        start_time = time.time()
+        total_cost = 0.0
+
+        if self.verbose:
+            self._print_header("Dataset Generation from Documents")
+            self._log(f"Configuration:", Colors.BOLD)
+            self._log(f"  Model: {self.model}")
+            self._log(f"  Max rows: {self.max_rows}")
+            self._log(f"  Documents: {len(file_paths)}")
+            self._log(f"  Chunk size: {self.chunk_size}")
+            self._log(f"  Test types: {', '.join(self.test_types)}")
+            self._log("")
+
+        self._log_step("Loading documents", 1)
         docs = load_documents(file_paths)
+
+        if self.verbose:
+            self._log(
+                f"  ✅ Loaded {len(file_paths)} file(s) → {len(docs)} page(s)/document(s)", Colors.GREEN)
+
+        self._log_step("Chunking documents", 2)
         doc_chunks = chunk_documents(docs,
                                      chunk_size=self.chunk_size,
                                      chunk_overlap=self.chunk_overlap)
@@ -117,8 +281,15 @@ class DatasetGenerator:
         if not chunks_text:
             raise ValueError("No text extracted from documents.")

+        if self.verbose:
+            self._log(f"  ✅ Created {len(chunks_text)} chunks", Colors.GREEN)
+
+        self._log_step("Ranking chunks by relevance", 3)
         ranked_chunks = await self._rank_chunks_by_relevance(chunks_text)

+        if self.verbose:
+            self._log(f"  ✅ Ranked {len(ranked_chunks)} chunks", Colors.GREEN)
+
         total_chunks = len(ranked_chunks)
         rows_per_chunk = max(1, math.ceil(self.max_rows / total_chunks))

@@ -127,11 +298,21 @@ class DatasetGenerator:
                     self.max_chunks)
         selected_chunks = ranked_chunks[:top_k]

+        if self.verbose:
+            self._log(
+                f"  📌 Selected top {len(selected_chunks)} chunks for generation", Colors.YELLOW)
+            self._log("")
+
         dataset: list[dict] = []

         MAX_PROMPT_CHARS = 24_000

-
+        self._log_step(f"Generating dataset from chunks", 4)
+
+        for i, chunk in enumerate(selected_chunks):
+            if self.verbose:
+                self._log_progress(
+                    i + 1, len(selected_chunks), "Processing chunks")

             safe_chunk = chunk if len(
                 chunk) <= MAX_PROMPT_CHARS else chunk[:MAX_PROMPT_CHARS]
@@ -149,24 +330,39 @@ class DatasetGenerator:
                 language=self.language
             )

-            raw,
+            raw, cost = await retry_async(
                 chat_complete,
                 llm=self.model,
                 messages=[{"role": "user", "content": prompt}],
                 temperature=self.temperature,
             )

+            total_cost += cost or 0.0
+
             try:
                 chunk_data = json.loads(extract_json_block(raw))
                 assert isinstance(chunk_data, list)
                 dataset.extend(chunk_data)
+
+                if self.verbose:
+                    self._log(
+                        f"  ✅ Generated {len(chunk_data)} rows from chunk {i+1}", Colors.GREEN)
+
             except Exception as exc:
-
+                if self.verbose:
+                    self._log(
+                        f"  ⚠️  Chunk {i+1} parsing failed, skipping", Colors.YELLOW)
+                continue

             if len(dataset) >= self.max_rows:
                 break

-
+        final_dataset = dataset[: self.max_rows]
+        elapsed_time = time.time() - start_time
+
+        self._print_summary(final_dataset, elapsed_time, total_cost)
+
+        return final_dataset

     async def _rank_chunks_by_relevance(self, chunks: list[str]) -> list[str]:
         """