eval-ai-library 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of eval-ai-library might be problematic.

Files changed (29)
  1. eval_ai_library-0.3.0.dist-info/METADATA +1042 -0
  2. eval_ai_library-0.3.0.dist-info/RECORD +34 -0
  3. eval_lib/__init__.py +19 -6
  4. eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py +8 -3
  5. eval_lib/agent_metrics/role_adherence_metric/role_adherence.py +12 -4
  6. eval_lib/agent_metrics/task_success_metric/task_success_rate.py +23 -23
  7. eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py +8 -2
  8. eval_lib/datagenerator/datagenerator.py +208 -12
  9. eval_lib/datagenerator/document_loader.py +29 -29
  10. eval_lib/evaluate.py +0 -22
  11. eval_lib/llm_client.py +223 -78
  12. eval_lib/metric_pattern.py +208 -152
  13. eval_lib/metrics/answer_precision_metric/answer_precision.py +8 -3
  14. eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py +7 -2
  15. eval_lib/metrics/bias_metric/bias.py +12 -2
  16. eval_lib/metrics/contextual_precision_metric/contextual_precision.py +9 -4
  17. eval_lib/metrics/contextual_recall_metric/contextual_recall.py +7 -3
  18. eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py +8 -2
  19. eval_lib/metrics/custom_metric/custom_eval.py +237 -204
  20. eval_lib/metrics/faithfulness_metric/faithfulness.py +7 -2
  21. eval_lib/metrics/geval/geval.py +8 -2
  22. eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py +7 -3
  23. eval_lib/metrics/toxicity_metric/toxicity.py +8 -2
  24. eval_lib/utils.py +44 -29
  25. eval_ai_library-0.2.1.dist-info/METADATA +0 -753
  26. eval_ai_library-0.2.1.dist-info/RECORD +0 -34
  27. {eval_ai_library-0.2.1.dist-info → eval_ai_library-0.3.0.dist-info}/WHEEL +0 -0
  28. {eval_ai_library-0.2.1.dist-info → eval_ai_library-0.3.0.dist-info}/licenses/LICENSE +0 -0
  29. {eval_ai_library-0.2.1.dist-info → eval_ai_library-0.3.0.dist-info}/top_level.txt +0 -0
@@ -10,8 +10,8 @@ from eval_lib.testcases_schema import EvalTestCase, ConversationalEvalTestCase
 from eval_lib.llm_client import chat_complete
 
 
-# ANSI color codes for beautiful console output
 class Colors:
+    """ANSI color codes for beautiful console output"""
     HEADER = '\033[95m'
     BLUE = '\033[94m'
     CYAN = '\033[96m'
@@ -31,7 +31,7 @@ class MetricPattern:
     """
     name: str  # name of the metric
 
-    def __init__(self, model: str, threshold: float, verbose: bool = True):
+    def __init__(self, model: str, threshold: float, verbose: bool = False):
         self.model = model
         self.threshold = threshold
         self.verbose = verbose
@@ -47,74 +47,111 @@ class MetricPattern:
         prefix = f"[{step_num}] " if step_num else ""
         print(f"{Colors.DIM} {prefix}{step_name}...{Colors.ENDC}")
 
-    async def evaluate(self, test_case: Union[EvalTestCase]) -> Dict[str, Any]:
+    def print_result(self, result: Dict[str, Any]):
         """
-        Base evaluation method - override in subclasses for custom behavior.
+        Print evaluation result based on verbose setting.
+        If verbose=False: simple dict print
+        If verbose=True: beautiful formatted output with colors
         """
-        start_time = time.time()
-
-        if self.verbose:
-            print(f"\n{Colors.BOLD}{Colors.BLUE}{'='*60}{Colors.ENDC}")
-            print(f"{Colors.BOLD}{Colors.BLUE}🔍 Evaluating: {self.name}{Colors.ENDC}")
-            print(f"{Colors.BOLD}{Colors.BLUE}{'='*60}{Colors.ENDC}")
-            print(f"{Colors.DIM}Model: {self.model}{Colors.ENDC}")
-            print(f"{Colors.DIM}Threshold: {self.threshold}{Colors.ENDC}")
-
-        self._log_step("Generating evaluation prompt", 1)
-
-        # 1) Generate prompt
-        prompt = self.template.generate_prompt(
-            test_case=test_case,
-            threshold=self.threshold
-        )
-
-        self._log_step("Calling LLM", 2)
-
-        # 2) Make API call
-        text, cost = await chat_complete(
-            self.model,
-            messages=[{"role": "user", "content": prompt}],
-            temperature=0.0
-        )
-
-        self._log_step("Parsing response", 3)
-
-        # 3) Parse the response
-        try:
-            data = json.loads(text)
-        except Exception as e:
-            self._log(f" Failed to parse JSON: {e}", Colors.RED)
-            raise RuntimeError(
-                f"Cannot parse JSON from model response: {e}\n{text}")
-
-        score = float(data.get("score", 0.0))
-        reason = data.get("reason")
-        success = score >= self.threshold
-
-        # Calculate elapsed time
-        elapsed_time = time.time() - start_time
-
-        # Log results
-        if self.verbose:
-            print(f"\n{Colors.BOLD}📊 Results:{Colors.ENDC}")
-            score_color = Colors.GREEN if success else Colors.RED
-            success_icon = "✅" if success else "❌"
-            print(
-                f" {success_icon} Status: {score_color}{Colors.BOLD}{'PASSED' if success else 'FAILED'}{Colors.ENDC}")
-            print(
-                f" 📈 Score: {score_color}{score:.2f}{Colors.ENDC} (threshold: {self.threshold})")
-            print(f" 💰 Cost: {Colors.YELLOW}${cost:.6f}{Colors.ENDC}")
-            print(f" ⏱️ Time: {Colors.DIM}{elapsed_time:.2f}s{Colors.ENDC}")
-            if reason:
-                print(
-                    f" 💬 Reason: {Colors.DIM}{reason[:100]}{'...' if len(reason) > 100 else ''}{Colors.ENDC}")
-
-        return {
-            "score": score,
-            "success": success,
-            "reason": reason,
-            "evaluation_cost": cost,
-        }
+        if not self.verbose:
+            print(result)
+            return
+
+        import shutil
+        import textwrap
+        import re
+        import json
+
+        # Get the terminal width and take half of it
+        terminal_width = shutil.get_terminal_size().columns
+        WIDTH = terminal_width // 2
+        WIDTH = max(WIDTH, 60)  # At least 60 characters
+
+        # Helper for wrapping long text
+        def wrap_text(text, width, indent=0):
+            """Wraps text onto multiple lines with an indent"""
+            wrapper = textwrap.TextWrapper(
+                width=width - indent,
+                initial_indent=' ' * indent,
+                subsequent_indent=' ' * indent,
+                break_long_words=True,
+                break_on_hyphens=False
+            )
+            return wrapper.fill(text)
+
+        success = result.get('success', False)
+        score = result.get('score', 0.0)
+        reason = result.get('reason', 'N/A')
+        cost = result.get('evaluation_cost', 0.0)
+        evaluation_log = result.get('evaluation_log', None)
+
+        status_icon = "✅" if success else "❌"
+        status_color = Colors.GREEN if success else Colors.RED
+        status_text = "PASSED" if success else "FAILED"
+
+        bar_length = min(30, WIDTH - 30)  # Adaptive progress-bar length
+        filled = int(bar_length * score)
+        bar = '█' * filled + '░' * (bar_length - filled)
+
+        metric_name = result.get('name', self.name)
+        formatted_name = f"📊 {metric_name}"
+
+        # Center the header
+        name_len = len(formatted_name)
+        if name_len > WIDTH:
+            formatted_name = formatted_name[:WIDTH-3] + "..."
+            centered_name = formatted_name
+        else:
+            padding = WIDTH - name_len
+            left_pad = padding // 2
+            right_pad = padding - left_pad
+            centered_name = " " * left_pad + formatted_name + " " * right_pad
+
+        # Header border
+        border = "═" * WIDTH
+
+        print(f"\n{Colors.BOLD}{Colors.CYAN}╔{border}╗{Colors.ENDC}")
+        print(f"{Colors.BOLD}{Colors.CYAN}║{Colors.ENDC}{centered_name}{Colors.BOLD}{Colors.CYAN}║{Colors.ENDC}")
+        print(f"{Colors.BOLD}{Colors.CYAN}╚{border}╝{Colors.ENDC}\n")
+
+        print(f"{Colors.BOLD}Status:{Colors.ENDC} {status_icon} {status_color}{Colors.BOLD}{status_text}{Colors.ENDC}")
+        print(
+            f"{Colors.BOLD}Score:{Colors.ENDC} {Colors.YELLOW}{score:.2f}{Colors.ENDC} [{bar}] {score*100:.0f}%")
+        print(
+            f"{Colors.BOLD}Cost:{Colors.ENDC} {Colors.BLUE}💰 ${cost:.6f}{Colors.ENDC}")
+
+        # Wrap the Reason onto multiple lines if necessary
+        print(f"{Colors.BOLD}Reason:{Colors.ENDC}")
+        wrapped_reason = wrap_text(reason, WIDTH, indent=2)
+        print(f"{Colors.DIM}{wrapped_reason}{Colors.ENDC}\n")
+
+        if evaluation_log:
+            log_json = json.dumps(evaluation_log, indent=2, ensure_ascii=False)
+            log_lines = log_json.split('\n')
+
+            print(f"{Colors.BOLD}Evaluation Log:{Colors.ENDC}")
+            log_border = "─" * WIDTH
+            print(f"{Colors.DIM}╭{log_border}╮{Colors.ENDC}")
+
+            for line in log_lines:
+                # If the line is longer than WIDTH, wrap it
+                if len(line) > WIDTH - 4:
+                    # Split the long line
+                    wrapped_lines = textwrap.wrap(line, width=WIDTH - 4,
+                                                  break_long_words=True,
+                                                  break_on_hyphens=False)
+                    for wrapped_line in wrapped_lines:
+                        spaces_needed = WIDTH - len(wrapped_line) - 2
+                        print(
+                            f"{Colors.DIM}│{Colors.ENDC} {wrapped_line}{' ' * spaces_needed}{Colors.DIM}│{Colors.ENDC}")
+                else:
+                    spaces_needed = WIDTH - len(line) - 2
+                    print(
+                        f"{Colors.DIM}│{Colors.ENDC} {line}{' ' * spaces_needed}{Colors.DIM}│{Colors.ENDC}")
+
+            print(f"{Colors.DIM}╰{log_border}╯{Colors.ENDC}")
+
+        print(f"{Colors.DIM}{'─' * WIDTH}{Colors.ENDC}\n")
 
 
 class ConversationalMetricPattern:
@@ -123,16 +160,11 @@ class ConversationalMetricPattern:
     Used for metrics like RoleAdherence, DialogueCoherence, etc.
     """
     name: str
-    template_cls: Type
 
-    def __init__(self, model: str, threshold: float, verbose: bool = True):
+    def __init__(self, model: str, threshold: float, verbose: bool = False):
         self.model = model
         self.threshold = threshold
         self.verbose = verbose
-        if self.template_cls:
-            self.template = self.template_cls()
-        else:
-            self.template = None
         self.chatbot_role: Optional[str] = None
 
     def _log(self, message: str, color: str = Colors.CYAN):
@@ -146,84 +178,108 @@ class ConversationalMetricPattern:
         prefix = f"[{step_num}] " if step_num else ""
         print(f"{Colors.DIM} {prefix}{step_name}...{Colors.ENDC}")
 
-    async def evaluate(self, test_case: ConversationalEvalTestCase) -> Dict[str, Any]:
+    def print_result(self, result: Dict[str, Any]):
         """
-        Evaluate conversational test case with logging.
+        Print evaluation result based on verbose setting.
+        If verbose=False: simple dict print
+        If verbose=True: beautiful formatted output with colors
         """
-        start_time = time.time()
-
-        if self.verbose:
-            print(f"\n{Colors.BOLD}{Colors.BLUE}{'='*60}{Colors.ENDC}")
-            print(
-                f"{Colors.BOLD}{Colors.BLUE}💬 Evaluating Conversation: {self.name}{Colors.ENDC}")
-            print(f"{Colors.BOLD}{Colors.BLUE}{'='*60}{Colors.ENDC}")
-            print(f"{Colors.DIM}Model: {self.model}{Colors.ENDC}")
-            print(f"{Colors.DIM}Threshold: {self.threshold}{Colors.ENDC}")
-            print(f"{Colors.DIM}Turns: {len(test_case.turns)}{Colors.ENDC}")
-
-        self._log_step("Generating evaluation prompt", 1)
-
-        # 1. Generate prompt
-        if hasattr(self.template, "generate_prompt"):
-            try:
-                prompt = self.template.generate_prompt(
-                    test_case=test_case,
-                    threshold=self.threshold,
-                    chatbot_role=self.chatbot_role
-                )
-            except TypeError:
-                prompt = self.template.generate_prompt(
-                    test_case=test_case,
-                    threshold=self.threshold,
-                    temperature=0.0
-                )
+        if not self.verbose:
+            print(result)
+            return
+
+        import shutil
+        import textwrap
+        import re
+        import json
+
+        # Get the terminal width and take half of it
+        terminal_width = shutil.get_terminal_size().columns
+        WIDTH = terminal_width // 2
+        WIDTH = max(WIDTH, 60)  # At least 60 characters
+
+        # Helper for wrapping long text
+        def wrap_text(text, width, indent=0):
+            """Wraps text onto multiple lines with an indent"""
+            wrapper = textwrap.TextWrapper(
+                width=width - indent,
+                initial_indent=' ' * indent,
+                subsequent_indent=' ' * indent,
+                break_long_words=True,
+                break_on_hyphens=False
+            )
+            return wrapper.fill(text)
+
+        success = result.get('success', False)
+        score = result.get('score', 0.0)
+        reason = result.get('reason', 'N/A')
+        cost = result.get('evaluation_cost', 0.0)
+        evaluation_log = result.get('evaluation_log', None)
+
+        status_icon = "✅" if success else "❌"
+        status_color = Colors.GREEN if success else Colors.RED
+        status_text = "PASSED" if success else "FAILED"
+
+        bar_length = min(30, WIDTH - 30)  # Adaptive progress-bar length
+        filled = int(bar_length * score)
+        bar = '█' * filled + '░' * (bar_length - filled)
+
+        metric_name = result.get('name', self.name)
+        formatted_name = f"📊 {metric_name}"
+
+        # Center the header
+        name_len = len(formatted_name)
+        if name_len > WIDTH:
+            formatted_name = formatted_name[:WIDTH-3] + "..."
+            centered_name = formatted_name
         else:
-            raise RuntimeError("Template is missing method generate_prompt")
-
-        self._log_step("Calling LLM", 2)
-
-        # 2. Call API
-        text, cost = await chat_complete(
-            self.model,
-            messages=[{"role": "user", "content": prompt}],
-            temperature=0.0
-        )
-
-        self._log_step("Parsing response", 3)
-
-        # 3. Parse response
-        try:
-            data = json.loads(text)
-        except Exception as e:
-            self._log(f"❌ Failed to parse JSON: {e}", Colors.RED)
-            raise RuntimeError(
-                f"Cannot parse JSON from model response: {e}\n{text}")
-
-        score = float(data.get("score", 0.0))
-        reason = data.get("reason")
-        success = score >= self.threshold
-
-        # Calculate elapsed time
-        elapsed_time = time.time() - start_time
-
-        # Log results
-        if self.verbose:
-            print(f"\n{Colors.BOLD}📊 Results:{Colors.ENDC}")
-            score_color = Colors.GREEN if success else Colors.RED
-            success_icon = "✅" if success else "❌"
-            print(
-                f" {success_icon} Status: {score_color}{Colors.BOLD}{'PASSED' if success else 'FAILED'}{Colors.ENDC}")
-            print(
-                f" 📈 Score: {score_color}{score:.2f}{Colors.ENDC} (threshold: {self.threshold})")
-            print(f" 💰 Cost: {Colors.YELLOW}${cost:.6f}{Colors.ENDC}")
-            print(f" ⏱️ Time: {Colors.DIM}{elapsed_time:.2f}s{Colors.ENDC}")
-            if reason:
-                print(
-                    f" 💬 Reason: {Colors.DIM}{reason[:100]}{'...' if len(reason) > 100 else ''}{Colors.ENDC}")
-
-        return {
-            "score": score,
-            "success": success,
-            "reason": reason,
-            "evaluation_cost": cost,
-        }
+            padding = WIDTH - name_len
+            left_pad = padding // 2
+            right_pad = padding - left_pad
+            centered_name = " " * left_pad + formatted_name + " " * right_pad
+
+        # Header border
+        border = "═" * WIDTH
+
+        print(f"\n{Colors.BOLD}{Colors.CYAN}╔{border}╗{Colors.ENDC}")
+        print(f"{Colors.BOLD}{Colors.CYAN}║{Colors.ENDC}{centered_name}{Colors.BOLD}{Colors.CYAN}║{Colors.ENDC}")
+        print(f"{Colors.BOLD}{Colors.CYAN}╚{border}╝{Colors.ENDC}\n")
+
+        print(f"{Colors.BOLD}Status:{Colors.ENDC} {status_icon} {status_color}{Colors.BOLD}{status_text}{Colors.ENDC}")
+        print(
+            f"{Colors.BOLD}Score:{Colors.ENDC} {Colors.YELLOW}{score:.2f}{Colors.ENDC} [{bar}] {score*100:.0f}%")
+        print(
+            f"{Colors.BOLD}Cost:{Colors.ENDC} {Colors.BLUE}💰 ${cost:.6f}{Colors.ENDC}")
+
+        # Wrap the Reason onto multiple lines if necessary
+        print(f"{Colors.BOLD}Reason:{Colors.ENDC}")
+        wrapped_reason = wrap_text(reason, WIDTH, indent=2)
+        print(f"{Colors.DIM}{wrapped_reason}{Colors.ENDC}\n")
+
+        if evaluation_log:
+            log_json = json.dumps(evaluation_log, indent=2, ensure_ascii=False)
+            log_lines = log_json.split('\n')
+
+            print(f"{Colors.BOLD}Evaluation Log:{Colors.ENDC}")
+            log_border = "─" * WIDTH
+            print(f"{Colors.DIM}╭{log_border}╮{Colors.ENDC}")
+
+            for line in log_lines:
+                # If the line is longer than WIDTH, wrap it
+                if len(line) > WIDTH - 4:
+                    # Split the long line
+                    wrapped_lines = textwrap.wrap(line, width=WIDTH - 4,
+                                                  break_long_words=True,
+                                                  break_on_hyphens=False)
+                    for wrapped_line in wrapped_lines:
+                        spaces_needed = WIDTH - len(wrapped_line) - 2
+                        print(
+                            f"{Colors.DIM}│{Colors.ENDC} {wrapped_line}{' ' * spaces_needed}{Colors.DIM}│{Colors.ENDC}")
+                else:
+                    spaces_needed = WIDTH - len(line) - 2
+                    print(
+                        f"{Colors.DIM}│{Colors.ENDC} {line}{' ' * spaces_needed}{Colors.DIM}│{Colors.ENDC}")
+
+            print(f"{Colors.DIM}╰{log_border}╯{Colors.ENDC}")
+
+        print(f"{Colors.DIM}{'─' * WIDTH}{Colors.ENDC}\n")
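Taken together, the metric_pattern.py hunks above drop the base classes' built-in evaluate logging and add a shared print_result helper gated on the new verbose flag. A minimal sketch of how a subclass might hand its result dict to that helper; the DummyMetric class, the model string, and every literal value are illustrative placeholders, not part of the package:

# Illustrative only: a hypothetical subclass wired through the
# print_result helper that 0.3.0 adds to MetricPattern.
import asyncio
from eval_lib.metric_pattern import MetricPattern

class DummyMetric(MetricPattern):
    name = "dummyMetric"

    async def evaluate(self, test_case):
        score = 0.91                          # placeholder score, no LLM call here
        result = {
            "name": self.name,                # the "name" key is new in 0.3.0
            "score": score,
            "success": score >= self.threshold,
            "reason": "Example reason text.",
            "evaluation_cost": 0.0,
            "evaluation_log": {"note": "illustrative"},
        }
        self.print_result(result)             # plain dict, or boxed output when verbose=True
        return result

metric = DummyMetric(model="gpt-4o-mini", threshold=0.8, verbose=False)
asyncio.run(metric.evaluate(test_case=None))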
@@ -178,8 +178,8 @@ class PrecisionConfig:
 class AnswerPrecisionMetric(MetricPattern):
     name = "answerPrecisionMetric"
 
-    def __init__(self, model: str, threshold: float = 0.8, config: Optional[PrecisionConfig] = None):
-        super().__init__(model=model, threshold=threshold)
+    def __init__(self, model: str, threshold: float = 0.8, verbose: bool = False, config: Optional[PrecisionConfig] = None):
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
         self.config = config or PrecisionConfig()
 
     # --- core similarity components ---
@@ -395,7 +395,8 @@ class AnswerPrecisionMetric(MetricPattern):
            },
        }
 
-        return {
+        result = {
+            "name": self.name,
             "score": round(final_score, 4),
             "success": success,
             "reason": reason,
@@ -403,3 +404,7 @@ class AnswerPrecisionMetric(MetricPattern):
             "evaluation_cost": 0.0,
             "evaluation_log": evaluation_log,
         }
+
+        self.print_result(result)
+
+        return result
@@ -34,8 +34,9 @@ class AnswerRelevancyMetric(MetricPattern):
         model: str,
         threshold: float = 0.6,
         temperature: float = 0.5,
+        verbose: bool = False
     ):
-        super().__init__(model=model, threshold=threshold)
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
         self.temperature = temperature
 
     async def _infer_user_intent(self, question: str) -> str:
@@ -186,10 +187,14 @@ class AnswerRelevancyMetric(MetricPattern):
             "comment_reasoning": "Compressed explanation of the key verdict rationales."
         }
 
-        return {
+        result = {
+            "name": self.name,
             "score": final_score,
             "success": success,
             "reason": summary_reason,
             "evaluation_cost": round(llm_cost, 6),
             "evaluation_log": evaluation_log
         }
+        self.print_result(result)
+
+        return result
@@ -12,7 +12,14 @@ from eval_lib.llm_client import chat_complete
 
 class BiasMetric(MetricPattern):
     name = "biasMetric"
-    template_cls = None  # all prompts inside the class
+
+    def __init__(
+        self,
+        model: str,
+        threshold: float = 0.8,
+        verbose: bool = False,
+    ):
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
 
     # ==================== PROMPTS ====================
 
@@ -105,10 +112,13 @@ JSON:"""
             "comment_reasoning": "Explanation of the bias assessment, including specific biased elements if found."
         }
 
-        return {
+        result = {
+            "name": self.name,
             "score": score,
             "success": success,
             "reason": reason,
             "evaluation_cost": round(total_cost, 6),
             "evaluation_log": evaluation_log
         }
+        self.print_result(result)
+        return result
@@ -18,9 +18,9 @@ from eval_lib.utils import extract_json_block
 class ContextualPrecisionMetric(MetricPattern):
     name = "contextPrecisionMetric"
 
-    def __init__(self, model: str, threshold: float = 0.7, top_k: int | None = None, ):
-        super().__init__(model=model, threshold=threshold)
-        self.top_k = top_k  # limit of chunks inspected (None = all)
+    def __init__(self, model: str, threshold: float = 0.7, top_k: int | None = None, verbose: bool = False):
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
+        self.top_k = top_k
 
     # ------------------------------------------------------------------ #
     async def _is_chunk_relevant(  # judgement = 0 / 1
@@ -93,10 +93,15 @@ class ContextualPrecisionMetric(MetricPattern):
             "comment_success": "Whether precision meets threshold."
         }
 
-        return {
+        result = {
+            "name": self.name,
             "score": ctx_precision,
             "success": success,
             "reason": f"Average precision across top-{len(chunks)} context chunks.",
             "evaluation_cost": round(llm_cost, 6),
             "evaluation_log": evaluation_log,
         }
+
+        self.print_result(result)
+
+        return result
@@ -18,8 +18,8 @@ from eval_lib.utils import extract_json_block
 class ContextualRecallMetric(MetricPattern):
     name = "contextualRecallMetric"
 
-    def __init__(self, model: str, threshold: float = 0.7):
-        super().__init__(model=model, threshold=threshold)
+    def __init__(self, model: str, threshold: float = 0.7, verbose: bool = False):
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
 
     async def _extract_claims(self, reference: str) -> Tuple[List[str], float]:
         prompt = (
@@ -82,10 +82,14 @@ class ContextualRecallMetric(MetricPattern):
             "comment_success": "Whether the score exceeds the threshold.",
         }
 
-        return {
+        result = {
+            "name": self.name,
             "score": recall_score,
             "success": success,
             "reason": f"{supported_count} out of {total_claims} reference claims supported by context.",
             "evaluation_cost": round(llm_cost, 6),
             "evaluation_log": evaluation_log
         }
+        self.print_result(result)
+
+        return result
@@ -34,8 +34,9 @@ class ContextualRelevancyMetric(MetricPattern):
         model: str,
         threshold: float = 0.6,
         temperature: float = 0.5,
+        verbose: bool = False
     ):
-        super().__init__(model=model, threshold=threshold)
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
         self.temperature = temperature
 
     async def _infer_user_intent(self, question: str) -> Tuple[str, float]:
@@ -160,10 +161,15 @@ class ContextualRelevancyMetric(MetricPattern):
             "comment_reasoning": "LLM-generated explanation based on verdict rationales."
         }
 
-        return {
+        result = {
+            "name": self.name,
             "score": score,
             "success": success,
             "reason": summary,
             "evaluation_cost": round(llm_cost, 6),
             "evaluation_log": evaluation_log
         }
+
+        self.print_result(result)
+
+        return result
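Across the metric files above the pattern is the same: each constructor gains a verbose parameter forwarded to the base class, and each result dict gains a "name" key and is passed through self.print_result() before being returned. A hedged usage sketch of the new surface; the import path is inferred from the file layout shown in this diff, and the model string and hand-built result dict are placeholders, not library output:

# Sketch only: the 0.3.0 constructor signature and result printing.
from eval_lib.metrics.contextual_recall_metric.contextual_recall import ContextualRecallMetric

metric = ContextualRecallMetric(model="gpt-4o-mini", threshold=0.7, verbose=True)

fake_result = {
    "name": metric.name,                  # "contextualRecallMetric"
    "score": 0.75,
    "success": 0.75 >= metric.threshold,
    "reason": "3 out of 4 reference claims supported by context.",
    "evaluation_cost": 0.000123,
    "evaluation_log": {"claims_total": 4, "claims_supported": 3},  # hypothetical log keys
}
metric.print_result(fake_result)          # boxed, colorized output because verbose=True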