wisent 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wisent/__init__.py +1 -1
- wisent/core/activations/__init__.py +22 -6
- wisent/core/activations/activations.py +21 -39
- wisent/core/activations/activations_collector.py +141 -373
- wisent/core/activations/classifier_inference_strategy.py +194 -0
- wisent/core/activations/core/atoms.py +8 -92
- wisent/core/activations/extraction_strategy.py +308 -0
- wisent/core/agent/diagnose/response_diagnostics.py +3 -3
- wisent/core/agent/diagnose.py +3 -3
- wisent/core/autonomous_agent.py +2 -2
- wisent/core/cli/agent/apply_steering.py +23 -27
- wisent/core/cli/agent/evaluate_response.py +18 -20
- wisent/core/cli/agent/train_classifier.py +18 -20
- wisent/core/cli/cluster_benchmarks.py +472 -0
- wisent/core/cli/create_steering_vector.py +13 -5
- wisent/core/cli/generate_vector_from_task.py +4 -0
- wisent/core/cli/get_activations.py +12 -36
- wisent/core/cli/method_optimizer.py +859 -0
- wisent/core/cli/optimize.py +44 -5
- wisent/core/cli/optimize_classification.py +5 -6
- wisent/core/cli/optimize_sample_size.py +8 -22
- wisent/core/cli/optimize_steering.py +429 -153
- wisent/core/cli/optimize_weights.py +65 -6
- wisent/core/cli/steering_method_trainer.py +5 -4
- wisent/core/cli/steering_search_space.py +20 -15
- wisent/core/cli/tasks.py +14 -43
- wisent/core/cli/train_unified_goodness.py +17 -18
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1578 -173
- wisent/core/contrastive_pairs/diagnostics/linearity.py +63 -80
- wisent/core/contrastive_pairs/diagnostics/vector_quality.py +6 -5
- wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +5 -19
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +11 -5
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +146 -32
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +2 -2
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +98 -57
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +8 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval_aqua_rat.py +129 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +11 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +47 -6
- wisent/core/evaluators/benchmark_specific/apps_evaluator.py +133 -0
- wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +6 -1
- wisent/core/evaluators/benchmark_specific/conala_evaluator.py +31 -168
- wisent/core/evaluators/custom/examples/humanization_coherent.py +89 -35
- wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +2 -20
- wisent/core/evaluators/personalization/coherence.py +46 -0
- wisent/core/hyperparameter_optimizer.py +13 -13
- wisent/core/lm_eval_harness_ground_truth.py +7 -11
- wisent/core/main.py +3 -0
- wisent/core/models/wisent_model.py +8 -7
- wisent/core/opti/methods/opti_weights.py +29 -2
- wisent/core/optuna/classifier/activation_generator.py +14 -12
- wisent/core/optuna/steering/steering_optimization.py +14 -9
- wisent/core/parser_arguments/cluster_benchmarks_parser.py +31 -0
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +20 -0
- wisent/core/parser_arguments/main_parser.py +8 -0
- wisent/core/parser_arguments/optimize_steering_parser.py +117 -10
- wisent/core/parser_arguments/optimize_weights_parser.py +6 -0
- wisent/core/parser_arguments/tasks_parser.py +7 -19
- wisent/core/steering_methods/core/atoms.py +1 -2
- wisent/core/steering_methods/methods/caa.py +1 -1
- wisent/core/steering_methods/methods/hyperplane.py +74 -0
- wisent/core/steering_methods/methods/prism.py +1 -2
- wisent/core/steering_methods/methods/pulse.py +39 -8
- wisent/core/steering_methods/methods/titan.py +59 -14
- wisent/core/steering_methods/registry.py +52 -12
- wisent/core/steering_optimizer.py +15 -15
- wisent/core/trainers/steering_trainer.py +9 -18
- wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +19 -70
- wisent/scripts/run_quality_metrics_sweep.sh +22 -27
- wisent/tests/test_aggregation_geometry.py +236 -0
- wisent/tests/test_detector_accuracy.py +163 -0
- wisent/tests/test_geometry_exhaustive.py +1202 -0
- wisent/tests/visualize_geometry.py +255 -61
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/METADATA +1 -1
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/RECORD +82 -714
- wisent/core/activations/prompt_construction_strategy.py +0 -47
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +0 -15
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +0 -64
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +0 -99
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +0 -180
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +0 -129
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +0 -142
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +0 -155
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +0 -161
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +0 -107
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +0 -155
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +0 -155
- wisent/examples/scripts/results/benchmark_descriptions.json +0 -1244
- wisent/examples/scripts/results/benchmark_evaluation_methods.json +0 -66
- wisent/examples/scripts/results/benchmark_evaluator_mapping.json +0 -2781
- wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +0 -30536
- wisent/examples/scripts/results/benchmark_evaluators_clean.json +0 -469
- wisent/examples/scripts/results/benchmark_methods_summary.json +0 -260
- wisent/examples/scripts/results/benchmark_pair_creation_methods.json +0 -66
- wisent/examples/scripts/results/benchmark_pair_totals.json +0 -269
- wisent/examples/scripts/results/benchmark_tags.json +0 -917
- wisent/examples/scripts/results/benchmark_test_summary_nov4.json +0 -71
- wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +0 -150
- wisent/examples/scripts/results/failing_benchmarks.json +0 -946
- wisent/examples/scripts/results/failing_benchmarks_list.json +0 -41
- wisent/examples/scripts/results/failing_benchmarks_test_results.json +0 -945
- wisent/examples/scripts/results/missing_benchmark_tags.json +0 -341
- wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +0 -30
- wisent/examples/scripts/results/test_20_newsgroups_pairs.json +0 -8
- wisent/examples/scripts/results/test_AraDICE_evaluation.json +0 -51
- wisent/examples/scripts/results/test_AraDICE_pairs.json +0 -14
- wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +0 -8
- wisent/examples/scripts/results/test_ArabCulture_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ArabCulture_pairs.json +0 -14
- wisent/examples/scripts/results/test_Tag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_Tag_pairs.json +0 -8
- wisent/examples/scripts/results/test_aclue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aclue_pairs.json +0 -14
- wisent/examples/scripts/results/test_acp_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +0 -14
- wisent/examples/scripts/results/test_acp_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +0 -51
- wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +0 -14
- wisent/examples/scripts/results/test_aexams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aexams_pairs.json +0 -14
- wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_ag_news_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ag_news_pairs.json +0 -8
- wisent/examples/scripts/results/test_agieval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_agieval_pairs.json +0 -14
- wisent/examples/scripts/results/test_aime2024_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime2024_pairs.json +0 -8
- wisent/examples/scripts/results/test_aime2025_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime2025_pairs.json +0 -8
- wisent/examples/scripts/results/test_aime_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime_pairs.json +0 -8
- wisent/examples/scripts/results/test_anagrams1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anagrams1_pairs.json +0 -8
- wisent/examples/scripts/results/test_anagrams2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anagrams2_pairs.json +0 -8
- wisent/examples/scripts/results/test_anli_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anli_pairs.json +0 -8
- wisent/examples/scripts/results/test_apps_evaluation.json +0 -30
- wisent/examples/scripts/results/test_apps_pairs.json +0 -8
- wisent/examples/scripts/results/test_arabic_exams_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arabic_exams_pairs.json +0 -8
- wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +0 -14
- wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +0 -14
- wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabicmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +0 -14
- wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +0 -14
- wisent/examples/scripts/results/test_arc_ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_arc_challenge_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_challenge_pairs.json +0 -8
- wisent/examples/scripts/results/test_arc_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_argument_topic_evaluation.json +0 -30
- wisent/examples/scripts/results/test_argument_topic_pairs.json +0 -8
- wisent/examples/scripts/results/test_arithmetic_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arithmetic_pairs.json +0 -14
- wisent/examples/scripts/results/test_asdiv_evaluation.json +0 -30
- wisent/examples/scripts/results/test_asdiv_pairs.json +0 -8
- wisent/examples/scripts/results/test_assin_entailment_evaluation.json +0 -30
- wisent/examples/scripts/results/test_assin_entailment_pairs.json +0 -8
- wisent/examples/scripts/results/test_atis_evaluation.json +0 -30
- wisent/examples/scripts/results/test_atis_pairs.json +0 -8
- wisent/examples/scripts/results/test_babi_evaluation.json +0 -30
- wisent/examples/scripts/results/test_babi_pairs.json +0 -8
- wisent/examples/scripts/results/test_babilong_evaluation.json +0 -30
- wisent/examples/scripts/results/test_babilong_pairs.json +0 -8
- wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +0 -8
- wisent/examples/scripts/results/test_banking77_evaluation.json +0 -30
- wisent/examples/scripts/results/test_banking77_pairs.json +0 -8
- wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basqueglue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basqueglue_pairs.json +0 -14
- wisent/examples/scripts/results/test_bbh_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bbh_pairs.json +0 -14
- wisent/examples/scripts/results/test_bbq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bbq_pairs.json +0 -8
- wisent/examples/scripts/results/test_bec2016eu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bec2016eu_pairs.json +0 -14
- wisent/examples/scripts/results/test_belebele_evaluation.json +0 -51
- wisent/examples/scripts/results/test_belebele_pairs.json +0 -14
- wisent/examples/scripts/results/test_benchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_benchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_bertaqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bertaqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bhtc_v2_pairs.json +0 -8
- wisent/examples/scripts/results/test_bigbench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bigbench_pairs.json +0 -14
- wisent/examples/scripts/results/test_blimp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_blimp_pairs.json +0 -14
- wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +0 -8
- wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +0 -8
- wisent/examples/scripts/results/test_boolq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq_pairs.json +0 -8
- wisent/examples/scripts/results/test_c4_evaluation.json +0 -30
- wisent/examples/scripts/results/test_c4_pairs.json +0 -8
- wisent/examples/scripts/results/test_cabreu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cabreu_pairs.json +0 -8
- wisent/examples/scripts/results/test_careqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_careqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_catalan_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_catalan_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_catalanqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_catalanqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_catcola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_catcola_pairs.json +0 -8
- wisent/examples/scripts/results/test_cb_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cb_pairs.json +0 -8
- wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +0 -14
- wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +0 -8
- wisent/examples/scripts/results/test_ceval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval_pairs.json +0 -14
- wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +0 -14
- wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +0 -51
- wisent/examples/scripts/results/test_chain_of_thought_pairs.json +0 -14
- wisent/examples/scripts/results/test_chartqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_chartqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +0 -30
- wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +0 -8
- wisent/examples/scripts/results/test_cmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_cmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +0 -8
- wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cocoteros_es_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +0 -8
- wisent/examples/scripts/results/test_coedit_gec_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coedit_gec_pairs.json +0 -8
- wisent/examples/scripts/results/test_cola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cola_pairs.json +0 -8
- wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_commonsense_qa_pairs.json +0 -8
- wisent/examples/scripts/results/test_conala_evaluation.json +0 -30
- wisent/examples/scripts/results/test_conala_pairs.json +0 -8
- wisent/examples/scripts/results/test_concode_evaluation.json +0 -30
- wisent/examples/scripts/results/test_concode_pairs.json +0 -8
- wisent/examples/scripts/results/test_copa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_copa_pairs.json +0 -8
- wisent/examples/scripts/results/test_copal_id_evaluation.json +0 -30
- wisent/examples/scripts/results/test_copal_id_pairs.json +0 -8
- wisent/examples/scripts/results/test_coqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_coqcat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coqcat_pairs.json +0 -8
- wisent/examples/scripts/results/test_crows_pairs_evaluation.json +0 -51
- wisent/examples/scripts/results/test_crows_pairs_pairs.json +0 -14
- wisent/examples/scripts/results/test_csatqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_csatqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_cycle_letters_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cycle_letters_pairs.json +0 -8
- wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_darija_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darija_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_darijahellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_darijammlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darijammlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +0 -30
- wisent/examples/scripts/results/test_dbpedia_14_pairs.json +0 -8
- wisent/examples/scripts/results/test_drop_evaluation.json +0 -30
- wisent/examples/scripts/results/test_drop_pairs.json +0 -8
- wisent/examples/scripts/results/test_ds1000_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ds1000_pairs.json +0 -8
- wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_egyhellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_egymmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_egymmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +0 -30
- wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +0 -8
- wisent/examples/scripts/results/test_eq_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eq_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_escola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_escola_pairs.json +0 -8
- wisent/examples/scripts/results/test_ethics_cm_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ethics_cm_pairs.json +0 -8
- wisent/examples/scripts/results/test_ethos_binary_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ethos_binary_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams_es_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_exams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_proficiency_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_reading_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_reading_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_trivia_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_trivia_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita-mp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita-mp_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_LLM_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
- wisent/examples/scripts/results/test_fda_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fda_pairs.json +0 -8
- wisent/examples/scripts/results/test_financial_tweets_evaluation.json +0 -30
- wisent/examples/scripts/results/test_financial_tweets_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_flores_evaluation.json +0 -51
- wisent/examples/scripts/results/test_flores_pairs.json +0 -14
- wisent/examples/scripts/results/test_freebase_evaluation.json +0 -30
- wisent/examples/scripts/results/test_freebase_pairs.json +0 -8
- wisent/examples/scripts/results/test_french_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_french_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_galcola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_galcola_pairs.json +0 -8
- wisent/examples/scripts/results/test_galician_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_galician_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_glianorex_evaluation.json +0 -30
- wisent/examples/scripts/results/test_glianorex_pairs.json +0 -8
- wisent/examples/scripts/results/test_global_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_global_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_gpqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_gpqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_groundcocoa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_groundcocoa_pairs.json +0 -8
- wisent/examples/scripts/results/test_gsm8k_evaluation.json +0 -30
- wisent/examples/scripts/results/test_gsm8k_pairs.json +0 -8
- wisent/examples/scripts/results/test_haerae_evaluation.json +0 -51
- wisent/examples/scripts/results/test_haerae_pairs.json +0 -14
- wisent/examples/scripts/results/test_headqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_headqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_hellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +0 -14
- wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hendrycks_math_pairs.json +0 -14
- wisent/examples/scripts/results/test_histoires_morales_evaluation.json +0 -30
- wisent/examples/scripts/results/test_histoires_morales_pairs.json +0 -8
- wisent/examples/scripts/results/test_hmmt_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +0 -8
- wisent/examples/scripts/results/test_hmmt_pairs.json +0 -8
- wisent/examples/scripts/results/test_hrm8k_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hrm8k_pairs.json +0 -14
- wisent/examples/scripts/results/test_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +0 -30
- wisent/examples/scripts/results/test_humaneval_plus_pairs.json +0 -8
- wisent/examples/scripts/results/test_ifeval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ifeval_pairs.json +0 -8
- wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +0 -51
- wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +0 -30
- wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +0 -8
- wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +0 -51
- wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +0 -14
- wisent/examples/scripts/results/test_inverse_scaling_pairs.json +0 -14
- wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +0 -8
- wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +0 -14
- wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_kbl_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +0 -14
- wisent/examples/scripts/results/test_kbl_pairs.json +0 -14
- wisent/examples/scripts/results/test_kmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_kobest_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kobest_pairs.json +0 -14
- wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +0 -8
- wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_cloze_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_openai_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_openai_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_standard_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_standard_pairs.json +0 -8
- wisent/examples/scripts/results/test_leaderboard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_leaderboard_pairs.json +0 -14
- wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +0 -51
- wisent/examples/scripts/results/test_libra/test_libra_pairs.json +0 -14
- wisent/examples/scripts/results/test_libra_evaluation.json +0 -51
- wisent/examples/scripts/results/test_libra_pairs.json +0 -14
- wisent/examples/scripts/results/test_lingoly_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lingoly_pairs.json +0 -8
- wisent/examples/scripts/results/test_livecodebench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livecodebench_pairs.json +0 -8
- wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +0 -8
- wisent/examples/scripts/results/test_llama_evaluation.json +0 -30
- wisent/examples/scripts/results/test_llama_pairs.json +0 -8
- wisent/examples/scripts/results/test_logiqa2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_logiqa2_pairs.json +0 -8
- wisent/examples/scripts/results/test_logiqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_logiqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_m_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_m_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +0 -14
- wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_mastermind_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mastermind_pairs.json +0 -14
- wisent/examples/scripts/results/test_math500_evaluation.json +0 -30
- wisent/examples/scripts/results/test_math500_pairs.json +0 -8
- wisent/examples/scripts/results/test_math_evaluation.json +0 -30
- wisent/examples/scripts/results/test_math_pairs.json +0 -8
- wisent/examples/scripts/results/test_mathqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mathqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_mbpp_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mbpp_pairs.json +0 -8
- wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mbpp_plus_pairs.json +0 -8
- wisent/examples/scripts/results/test_mc_taco_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mc_taco_pairs.json +0 -8
- wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +0 -14
- wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +0 -14
- wisent/examples/scripts/results/test_meddialog_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meddialog_pairs.json +0 -8
- wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +0 -8
- wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +0 -8
- wisent/examples/scripts/results/test_medmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_medqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_medtext_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medtext_pairs.json +0 -8
- wisent/examples/scripts/results/test_mela_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mela_pairs.json +0 -14
- wisent/examples/scripts/results/test_meqsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meqsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_mercury_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mercury_pairs.json +0 -8
- wisent/examples/scripts/results/test_metabench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_metabench_pairs.json +0 -14
- wisent/examples/scripts/results/test_mgsm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mgsm_pairs.json +0 -14
- wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mimic_repsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_minerva_math_evaluation.json +0 -51
- wisent/examples/scripts/results/test_minerva_math_pairs.json +0 -14
- wisent/examples/scripts/results/test_mlqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mlqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_pro_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_prox_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlusr_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mmlusr_pairs.json +0 -8
- wisent/examples/scripts/results/test_mmmu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmmu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mnli_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mnli_pairs.json +0 -8
- wisent/examples/scripts/results/test_model_written_evals_evaluation.json +0 -51
- wisent/examples/scripts/results/test_model_written_evals_pairs.json +0 -14
- wisent/examples/scripts/results/test_moral_stories_evaluation.json +0 -30
- wisent/examples/scripts/results/test_moral_stories_pairs.json +0 -8
- wisent/examples/scripts/results/test_mts_dialog_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mts_dialog_pairs.json +0 -8
- wisent/examples/scripts/results/test_multiblimp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_multiblimp_pairs.json +0 -14
- wisent/examples/scripts/results/test_multimedqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_multimedqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_multipl_e_evaluation.json +0 -30
- wisent/examples/scripts/results/test_multipl_e_pairs.json +0 -8
- wisent/examples/scripts/results/test_mutual_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mutual_pairs.json +0 -8
- wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_noreval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_noreval_pairs.json +0 -14
- wisent/examples/scripts/results/test_noticia_evaluation.json +0 -30
- wisent/examples/scripts/results/test_noticia_pairs.json +0 -8
- wisent/examples/scripts/results/test_nq_open_evaluation.json +0 -30
- wisent/examples/scripts/results/test_nq_open_pairs.json +0 -8
- wisent/examples/scripts/results/test_olaph_evaluation.json +0 -30
- wisent/examples/scripts/results/test_olaph_pairs.json +0 -8
- wisent/examples/scripts/results/test_openbookqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_openbookqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_openllm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_openllm_pairs.json +0 -14
- wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_paloma_evaluation.json +0 -51
- wisent/examples/scripts/results/test_paloma_pairs.json +0 -14
- wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +0 -30
- wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +0 -8
- wisent/examples/scripts/results/test_paws-x_evaluation.json +0 -51
- wisent/examples/scripts/results/test_paws-x_pairs.json +0 -14
- wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_penn_treebank_evaluation.json +0 -30
- wisent/examples/scripts/results/test_penn_treebank_pairs.json +0 -8
- wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +0 -30
- wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +0 -8
- wisent/examples/scripts/results/test_piqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_piqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_polemo2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polemo2_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_en_high_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +0 -8
- wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_portuguese_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_prost_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prost_pairs.json +0 -8
- wisent/examples/scripts/results/test_ptb_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ptb_pairs.json +0 -8
- wisent/examples/scripts/results/test_pubmedqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_pubmedqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_pythia_evaluation.json +0 -51
- wisent/examples/scripts/results/test_pythia_pairs.json +0 -14
- wisent/examples/scripts/results/test_qa4mre_evaluation.json +0 -30
- wisent/examples/scripts/results/test_qa4mre_pairs.json +0 -8
- wisent/examples/scripts/results/test_qasper_evaluation.json +0 -30
- wisent/examples/scripts/results/test_qasper_pairs.json +0 -8
- wisent/examples/scripts/results/test_race_evaluation.json +0 -30
- wisent/examples/scripts/results/test_race_pairs.json +0 -8
- wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +0 -30
- wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +0 -8
- wisent/examples/scripts/results/test_recode_evaluation.json +0 -30
- wisent/examples/scripts/results/test_recode_pairs.json +0 -8
- wisent/examples/scripts/results/test_record_evaluation.json +0 -30
- wisent/examples/scripts/results/test_record_pairs.json +0 -8
- wisent/examples/scripts/results/test_ruler_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ruler_pairs.json +0 -14
- wisent/examples/scripts/results/test_sciq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_sciq_pairs.json +0 -8
- wisent/examples/scripts/results/test_score_evaluation.json +0 -51
- wisent/examples/scripts/results/test_score_pairs.json +0 -14
- wisent/examples/scripts/results/test_self_consistency_evaluation.json +0 -30
- wisent/examples/scripts/results/test_self_consistency_pairs.json +0 -8
- wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_siqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_siqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_spanish_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_spanish_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_squad2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_squad2_pairs.json +0 -8
- wisent/examples/scripts/results/test_squadv2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_squadv2_pairs.json +0 -8
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +0 -8
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +0 -51
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +0 -14
- wisent/examples/scripts/results/test_swag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_swag_pairs.json +0 -8
- wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_tmmluplus_evaluation.json +0 -51
- wisent/examples/scripts/results/test_tmmluplus_pairs.json +0 -14
- wisent/examples/scripts/results/test_translation_evaluation.json +0 -51
- wisent/examples/scripts/results/test_translation_pairs.json +0 -14
- wisent/examples/scripts/results/test_triviaqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_triviaqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +0 -51
- wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +0 -14
- wisent/examples/scripts/results/test_truthfulqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_turkishmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_unfair_tos_evaluation.json +0 -30
- wisent/examples/scripts/results/test_unfair_tos_pairs.json +0 -8
- wisent/examples/scripts/results/test_unscramble_evaluation.json +0 -51
- wisent/examples/scripts/results/test_unscramble_pairs.json +0 -14
- wisent/examples/scripts/results/test_webqs_evaluation.json +0 -30
- wisent/examples/scripts/results/test_webqs_pairs.json +0 -8
- wisent/examples/scripts/results/test_wikitext103_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wikitext103_pairs.json +0 -8
- wisent/examples/scripts/results/test_wikitext_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wikitext_pairs.json +0 -8
- wisent/examples/scripts/results/test_winogender_evaluation.json +0 -51
- wisent/examples/scripts/results/test_winogender_pairs.json +0 -14
- wisent/examples/scripts/results/test_winogrande_evaluation.json +0 -30
- wisent/examples/scripts/results/test_winogrande_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmdp_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmdp_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_wsc273_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wsc273_pairs.json +0 -8
- wisent/examples/scripts/results/test_xcopa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xcopa_pairs.json +0 -14
- wisent/examples/scripts/results/test_xnli_eu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_xnli_eu_pairs.json +0 -8
- wisent/examples/scripts/results/test_xnli_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xnli_pairs.json +0 -14
- wisent/examples/scripts/results/test_xquad_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xquad_pairs.json +0 -14
- wisent/examples/scripts/results/test_xstorycloze_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xstorycloze_pairs.json +0 -14
- wisent/examples/scripts/results/test_xsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_xsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_xwinograd_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xwinograd_pairs.json +0 -14
- wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +0 -30
- wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +0 -8
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/WHEEL +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/entry_points.txt +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""
|
|
2
|
+
APPS (Automated Programming Progress Standard) Evaluator.
|
|
3
|
+
|
|
4
|
+
Script workflow:
|
|
5
|
+
+    1. code = APPSEvaluator.extract_code_from_json(raw_response)
+    2. code = AppsExtractor.prepend_imports(code)
+    3. test_code, _ = AppsExtractor.build_test_code(input_output)
+    4. result = evaluator.evaluate(code, expected=None, test_code=test_code)
+
+Dataset: codeparrot/apps (10,000 Python coding problems)
+Paper: https://arxiv.org/abs/2105.09938
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from typing import Any
+
+from wisent.core.evaluators.core.atoms import BaseEvaluator, EvalResult
+from wisent.core.evaluators.benchmark_specific.coding.metrics.evaluator import CodingEvaluator
+
+# Import shared utilities from extractor
+from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.apps import AppsExtractor
+
+
+class APPSEvaluator(BaseEvaluator):
+    """Evaluator for APPS benchmark. Delegates to CodingEvaluator."""
+
+    name = "apps"
+    description = "APPS coding benchmark evaluator"
+
+    def __init__(self):
+        self._coding_evaluator = CodingEvaluator()
+
+    # --- Static helper methods for scripts ---
+    # Reuse from AppsExtractor
+    prepend_imports = staticmethod(AppsExtractor._prepend_imports)
+    build_test_code = staticmethod(AppsExtractor._build_test_code_from_io)
+
+    @staticmethod
+    def get_prompt(
+        question: str,
+        starter_code: str | None = None,
+        fn_name: str | None = None,
+    ) -> str:
+        """Generate prompt for the model.
+
+        Args:
+            question: Problem description
+            starter_code: Optional starter code template
+            fn_name: If provided, this is a call-based (LeetCode) problem
+
+        Returns:
+            Formatted prompt string
+        """
+        prompt = "You are an expert Python programmer. Solve the following coding problem.\n\n"
+        prompt += f"Problem:\n{question}\n\n"
+
+        if starter_code and starter_code.strip():
+            prompt += f"Starter code (you must use this):\n```python\n{starter_code}\n```\n\n"
+
+        prompt += 'Output your solution as a JSON object: {"code": "your_python_code_here"}\n'
+
+        if fn_name:
+            prompt += "Implement the Solution class with the required method.\n"
+        else:
+            prompt += "Your code should read from stdin and write to stdout.\n"
+
+        prompt += "\nRespond with ONLY the JSON object, no other text."
+        return prompt
+
+    @staticmethod
+    def extract_code_from_json(response: str) -> str | None:
+        """Extract code from JSON response."""
+        # Try direct JSON parse
+        try:
+            data = json.loads(response.strip())
+            if isinstance(data, dict) and "code" in data:
+                return data["code"]
+        except json.JSONDecodeError:
+            pass
+
+        # Try regex for multiline JSON
+        match = re.search(r'\{\s*"code"\s*:\s*"((?:[^"\\]|\\.)*)"\s*\}', response, re.DOTALL)
+        if match:
+            try:
+                data = json.loads(match.group(0))
+                if "code" in data:
+                    return data["code"]
+            except json.JSONDecodeError:
+                pass
+
+        # Fallback: code blocks
+        for pattern in [r'```python\n(.*?)\n```', r'```\n(.*?)\n```']:
+            match = re.search(pattern, response, re.DOTALL)
+            if match:
+                return match.group(1)
+
+        return None
+
+    # --- Evaluator interface ---
+
+    def evaluate(self, response: str, expected: Any, **kwargs) -> EvalResult:
+        """Evaluate code against test cases.
+
+        Args:
+            response: Already-extracted and prepared Python code
+            expected: Not used
+            **kwargs:
+                test_code: Already-generated test code
+
+        Returns:
+            EvalResult from CodingEvaluator
+        """
+        test_code = kwargs.get("test_code")
+
+        if not test_code:
+            return EvalResult(
+                ground_truth="UNKNOWN",
+                method_used=self.name,
+                confidence=0.0,
+                details="No test_code provided",
+            )
+
+        return self._coding_evaluator.evaluate(
+            response=response,
+            expected=None,
+            test_code=test_code,
+            entry_point=None,
+            task_name="apps",
+            language="python",
+        )
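The new file gives scripts a one-stop API for APPS scoring. A minimal end-to-end sketch of the four-step flow from the module docstring; the raw_response and input_output values below are invented for illustration, with input_output following the usual APPS inputs/outputs shape:

    from wisent.core.evaluators.benchmark_specific.apps_evaluator import APPSEvaluator

    evaluator = APPSEvaluator()

    # Hypothetical model output in the JSON format that get_prompt() requests.
    raw_response = '{"code": "import sys\\nprint(sys.stdin.read().strip())"}'
    # Hypothetical APPS-style test spec (stdin inputs, expected stdouts).
    input_output = {"inputs": ["hello\n"], "outputs": ["hello\n"]}

    code = APPSEvaluator.extract_code_from_json(raw_response)
    code = APPSEvaluator.prepend_imports(code)
    test_code, _ = APPSEvaluator.build_test_code(input_output)
    result = evaluator.evaluate(code, expected=None, test_code=test_code)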
@@ -223,11 +223,16 @@ if __name__ == "__main__":
         try:
             from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Job
 
-            # Set limits based on task
+            # Set limits based on task or kwargs
+            timeout_override = kwargs.get('timeout')
             if 'ds1000' in task_name.lower() or 'ds_1000' in task_name.lower():
                 cpu_limit_s = 60
                 wall_timeout_s = 120
                 nproc = 512
+            elif timeout_override:
+                cpu_limit_s = timeout_override
+                wall_timeout_s = timeout_override
+                nproc = 64
             else:
                 cpu_limit_s = self.cfg.cpu_limit_s
                 wall_timeout_s = self.cfg.time_limit_s
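With the timeout_override branch, callers can tune the sandbox limits per call instead of relying on the task name or the static config. A sketch of the assumed call shape (the enclosing evaluate() signature sits outside this hunk, so the exact keyword path is an assumption):

    # Hypothetical: cap CPU and wall time at 30s for this one evaluation.
    result = coding_evaluator.evaluate(
        response=code,
        expected=None,
        test_code=test_code,
        task_name="apps",
        language="python",
        timeout=30,  # consumed via kwargs.get('timeout') in the hunk above
    )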
@@ -12,13 +12,12 @@ Wang Ling et al., "Latent Predictor Networks for Code Generation" (2016)
 """
 
 import logging
-import math
 import re
-from collections import Counter
 from typing import Any
 
+import evaluate
+
 from wisent.core.evaluators.core.atoms import BaseEvaluator, EvalResult
-from wisent.core.evaluators.benchmark_specific.math_parsing.extract_boxed import extract_boxed_answer
 
 logger = logging.getLogger(__name__)
 
@@ -46,134 +45,6 @@ def tokenize_for_bleu_eval(code: str) -> list[str]:
     return tokens
 
 
-def _get_ngrams(segment: list[str], max_order: int) -> Counter:
-    """Extract n-grams up to max_order from a token list.
-
-    Args:
-        segment: List of tokens
-        max_order: Maximum n-gram order
-
-    Returns:
-        Counter of n-gram frequencies
-    """
-    ngram_counts = Counter()
-    for order in range(1, max_order + 1):
-        for i in range(len(segment) - order + 1):
-            ngram = tuple(segment[i:i + order])
-            ngram_counts[ngram] += 1
-    return ngram_counts
-
-
-def compute_bleu(
-    references: list[list[str]],
-    hypotheses: list[list[str]],
-    max_order: int = 4,
-    smooth: bool = False,
-) -> tuple[float, list[float], float, float, int, int]:
-    """Compute corpus-level BLEU score.
-
-    Implementation follows the CoNaLa baseline. CoNaLa has one reference per example.
-
-    Args:
-        references: List of reference token lists (one per example).
-        hypotheses: List of hypothesis token lists (one per example).
-        max_order: Maximum n-gram order to use (default 4).
-        smooth: Whether to apply Lin smoothing (default False for CoNaLa).
-
-    Returns:
-        Tuple of:
-            - BLEU score (0.0 to 1.0)
-            - List of n-gram precisions
-            - Brevity penalty
-            - Length ratio (hypothesis/reference)
-            - Hypothesis length
-            - Reference length
-    """
-    matches_by_order = [0] * max_order
-    possible_matches_by_order = [0] * max_order
-    reference_length = 0
-    hypothesis_length = 0
-
-    for reference, hypothesis in zip(references, hypotheses):
-        reference_length += len(reference)
-        hypothesis_length += len(hypothesis)
-
-        # Get n-grams
-        ref_ngrams = _get_ngrams(reference, max_order)
-        hyp_ngrams = _get_ngrams(hypothesis, max_order)
-
-        # Count matches (clipped to reference count)
-        overlap = hyp_ngrams & ref_ngrams
-        for ngram, count in overlap.items():
-            matches_by_order[len(ngram) - 1] += count
-
-        # Count possible matches
-        for order in range(1, max_order + 1):
-            possible_matches = len(hypothesis) - order + 1
-            if possible_matches > 0:
-                possible_matches_by_order[order - 1] += possible_matches
-
-    # Compute precisions
-    precisions = [0.0] * max_order
-    for i in range(max_order):
-        if smooth:
-            precisions[i] = (matches_by_order[i] + 1.0) / (possible_matches_by_order[i] + 1.0)
-        else:
-            if possible_matches_by_order[i] > 0:
-                precisions[i] = matches_by_order[i] / possible_matches_by_order[i]
-            else:
-                precisions[i] = 0.0
-
-    # Compute geometric mean of precisions
-    if min(precisions) > 0:
-        p_log_sum = sum((1.0 / max_order) * math.log(p) for p in precisions)
-        geo_mean = math.exp(p_log_sum)
-    else:
-        geo_mean = 0.0
-
-    # Compute brevity penalty
-    ratio = hypothesis_length / reference_length if reference_length > 0 else 0.0
-    if ratio > 1.0:
-        bp = 1.0
-    elif ratio == 0.0:
-        bp = 0.0
-    else:
-        bp = math.exp(1.0 - 1.0 / ratio)
-
-    bleu = geo_mean * bp
-
-    return (bleu, precisions, bp, ratio, hypothesis_length, reference_length)
-
-
-def compute_bleu_single(
-    reference: str,
-    hypothesis: str,
-    max_order: int = 4,
-    smooth: bool = False,
-) -> float:
-    """Compute BLEU score for a single reference-hypothesis pair.
-
-    Args:
-        reference: The reference code string
-        hypothesis: The generated code string
-        max_order: Maximum n-gram order (default 4)
-        smooth: Whether to apply smoothing (default False)
-
-    Returns:
-        BLEU score (0.0 to 1.0)
-    """
-    ref_tokens = tokenize_for_bleu_eval(reference)
-    hyp_tokens = tokenize_for_bleu_eval(hypothesis)
-
-    bleu, _, _, _, _, _ = compute_bleu(
-        references=[ref_tokens],
-        hypotheses=[hyp_tokens],
-        max_order=max_order,
-        smooth=smooth,
-    )
-    return bleu
-
-
 class CoNaLaEvaluator(BaseEvaluator):
     """Evaluator for CoNaLa code generation benchmark.
 
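The deleted helpers implemented standard corpus BLEU; for reference, the quantity both the old code and the HuggingFace metric compute is

    \mathrm{BLEU} = \mathrm{BP} \cdot \exp\left( \frac{1}{N} \sum_{n=1}^{N} \log p_n \right),
    \qquad
    \mathrm{BP} = \begin{cases} 1 & c > r \\ e^{\,1 - r/c} & c \le r \end{cases}

where p_n is the clipped n-gram precision of order n, c and r are the hypothesis and reference lengths, and N is max_order (4 by default).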
@@ -198,12 +69,12 @@ class CoNaLaEvaluator(BaseEvaluator):
         """
         self.bleu_threshold = bleu_threshold
         self.max_order = max_order
+        self.bleu_metric = evaluate.load("bleu")
 
     @staticmethod
     def get_prompt(
         intent: str,
         rewritten_intent: str | None = None,
-        examples: list[tuple[str, str]] | None = None,
     ) -> str:
         """Create instruction prompt for LLM to generate Python code.
 
@@ -217,15 +88,7 @@ class CoNaLaEvaluator(BaseEvaluator):
         """
         nl_intent = rewritten_intent if rewritten_intent else intent
 
-        prompt = "Generate Python code for the following task. Put final answer
-
-
-        # Add few-shot examples if provided
-        if examples:
-            for ex_intent, ex_snippet in examples:
-                prompt += f"\nTask: {ex_intent}\n\\boxed{{{ex_snippet}}}\n"
-
-
+        prompt = "Generate Python code for the following task. Put final answer, in \\boxed{}."
 
         prompt += f"\nTask: {nl_intent}\n"
 
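The rewritten prompt asks for the final snippet in \boxed{}; this hunk does not show where that is parsed back out, so the helper below is only an illustrative sketch, not the package's actual extractor:

    import re

    def extract_boxed(text: str) -> str | None:
        # Hypothetical: return the contents of the last \boxed{...} span
        # (flat braces only; nested braces are not handled).
        matches = re.findall(r'\\boxed\{([^{}]*)\}', text)
        return matches[-1] if matches else None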
@@ -257,17 +120,15 @@ class CoNaLaEvaluator(BaseEvaluator):
         expected_str = str(expected).strip()
         response_str = response.strip()
 
-        #
-
-
-
-
-        bleu_score = compute_bleu_single(
-            reference=expected_str,
-            hypothesis=response_str,
+        # Compute BLEU score using HuggingFace evaluate
+        result = self.bleu_metric.compute(
+            predictions=[response_str],
+            references=[[expected_str]],
+            tokenizer=tokenize_for_bleu_eval,
             max_order=self.max_order,
             smooth=False,
         )
+        bleu_score = result["bleu"]
 
         # Determine truthfulness based on BLEU threshold only
         is_correct = bleu_score >= self.bleu_threshold
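The same single-pair scoring can be reproduced standalone with the evaluate package; a minimal sketch, using str.split as a stand-in for the module's tokenize_for_bleu_eval:

    import evaluate

    bleu = evaluate.load("bleu")
    result = bleu.compute(
        predictions=["df.head(5)"],
        references=[["df.head()"]],  # one list of references per prediction
        tokenizer=str.split,         # stand-in for tokenize_for_bleu_eval
        max_order=4,
        smooth=False,
    )
    bleu_score = result["bleu"]      # float in [0, 1]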
@@ -279,8 +140,6 @@ class CoNaLaEvaluator(BaseEvaluator):
             details=f"BLEU: {bleu_score:.4f}",
             meta={
                 "bleu_score": bleu_score,
-                "expected_tokens": ref_tokens,
-                "response_tokens": hyp_tokens,
                 "bleu_threshold": self.bleu_threshold,
             }
         )
@@ -304,29 +163,33 @@ class CoNaLaEvaluator(BaseEvaluator):
         if len(responses) != len(expected_answers):
             raise ValueError("Number of responses must match number of expected answers")
 
-        #
-
-
-
-
-
-
-
-
-
+        # Prepare predictions and references for HF BLEU
+        predictions = [r.strip() if r else "" for r in responses]
+        references = [[str(e).strip()] for e in expected_answers]
+
+        # Check if all predictions are empty (would cause division by zero)
+        if all(not p for p in predictions):
+            return {
+                "bleu_score": 0.0,
+                "total": len(responses),
+                "brevity_penalty": 0.0,
+                "length_ratio": 0.0,
+                "precisions": [0.0] * self.max_order,
+            }
 
-        # Compute corpus BLEU
-
+        # Compute corpus BLEU using HuggingFace evaluate
+        result = self.bleu_metric.compute(
+            predictions=predictions,
             references=references,
-
+            tokenizer=tokenize_for_bleu_eval,
             max_order=self.max_order,
             smooth=False,
         )
 
         return {
-            "bleu_score": bleu * 100, # Convert to percentage like leaderboard
+            "bleu_score": result["bleu"] * 100, # Convert to percentage like leaderboard
             "total": len(responses),
-            "brevity_penalty":
-            "length_ratio":
-            "precisions": precisions,
+            "brevity_penalty": result["brevity_penalty"],
+            "length_ratio": result["length_ratio"],
+            "precisions": result["precisions"],
         }
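The corpus path reads several keys off the metric's result dict. For the "bleu" metric in the evaluate package, compute() returns a dict shaped like the following (values here are made-up placeholders):

    {
        "bleu": 0.25,                # corpus BLEU in [0, 1]; scaled by 100 above
        "precisions": [0.6, 0.35, 0.2, 0.1],
        "brevity_penalty": 1.0,
        "length_ratio": 1.05,
        "translation_length": 105,
        "reference_length": 100,
    }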
@@ -1,14 +1,17 @@
 """
 Humanization evaluator with coherence check.
 
-Combines AI detection (Desklib) with coherence evaluation to ensure
-outputs are both human-like AND
+Combines AI detection (Desklib) with semantic coherence evaluation to ensure
+outputs are both human-like AND actually answer the prompt coherently.
+
+Uses enochlev/coherence-all-mpnet-base-v2 model to check if the response
+is semantically relevant to the prompt.
 
 Usage:
     from wisent.core.evaluators.custom.examples.humanization_coherent import HumanizationCoherentEvaluator
 
     evaluator = HumanizationCoherentEvaluator()
-    result = evaluator("Some text to analyze")
+    result = evaluator("Some text to analyze", prompt="What was the question?")
 """
 
 from __future__ import annotations
@@ -16,108 +19,159 @@ from __future__ import annotations
 import logging
 from typing import Any, Dict, Optional
 
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
 from wisent.core.evaluators.custom.custom_evaluator import (
     CustomEvaluator,
     CustomEvaluatorConfig,
 )
-from wisent.core.evaluators.personalization.coherence import evaluate_quality
 
 __all__ = ["HumanizationCoherentEvaluator", "create_humanization_coherent_evaluator"]
 
 logger = logging.getLogger(__name__)
 
+# Cache for coherence model
+_coherence_model_cache = {}
+
+
+def _get_coherence_model(device: Optional[str] = None):
+    """Get cached coherence model."""
+    if "model" not in _coherence_model_cache:
+        model_name = "enochlev/coherence-all-mpnet-base-v2"
+        logger.info(f"Loading coherence model: {model_name}")
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = AutoModelForSequenceClassification.from_pretrained(model_name)
+        model.eval()
+        if device:
+            model = model.to(device)
+        _coherence_model_cache["model"] = model
+        _coherence_model_cache["tokenizer"] = tokenizer
+    return _coherence_model_cache["model"], _coherence_model_cache["tokenizer"]
+
 
 class HumanizationCoherentEvaluator(CustomEvaluator):
-    """Combined humanization + coherence evaluator.
+    """Combined humanization + semantic coherence evaluator.
 
-    Uses Desklib for AI detection and
-
+    Uses Desklib for AI detection and a cross-encoder model to check if
+    the response actually answers the prompt coherently.
 
-    Final score = human_score
-    otherwise 0.
+    Final score = human_score if coherence passes threshold, otherwise 0.
 
     Args:
-        coherence_threshold: Minimum coherence score (0-
-
-        device: Device for Desklib model
+        coherence_threshold: Minimum coherence score (0-1) to accept output (default: 0.3)
+        device: Device for models
     """
 
     def __init__(
         self,
-        coherence_threshold: float =
-        coherence_weight: float = 0.5,
+        coherence_threshold: float = 0.3,
         device: Optional[str] = None,
     ):
         config = CustomEvaluatorConfig(
             name="humanization_coherent",
-            description="Humanization with coherence check (higher = more human-like AND coherent)",
+            description="Humanization with semantic coherence check (higher = more human-like AND coherent)",
         )
         super().__init__(name="humanization_coherent", description=config.description, config=config)
 
         self.coherence_threshold = coherence_threshold
-        self.
+        self.device = device
 
         # Load Desklib detector
         from wisent.core.evaluators.custom.examples.desklib_detector import DesklibDetectorEvaluator
         self._desklib = DesklibDetectorEvaluator(device=device)
+
+        # Load coherence model
+        self._coherence_model, self._coherence_tokenizer = _get_coherence_model(device)
+
+    def _score_coherence(self, prompt: str, response: str) -> float:
+        """Score how coherent the response is to the prompt."""
+        inputs = self._coherence_tokenizer(
+            prompt, response,
+            return_tensors="pt",
+            truncation=True,
+            max_length=512
+        )
+        if self.device:
+            inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+        with torch.no_grad():
+            outputs = self._coherence_model(**inputs)
+
+        score = torch.sigmoid(outputs.logits).item()
+        return score
 
-    def evaluate_response(self, response: str, **kwargs) -> Dict[str, Any]:
-        """Evaluate response for humanization AND coherence.
+    def evaluate_response(self, response: str, prompt: str = None, **kwargs) -> Dict[str, Any]:
+        """Evaluate response for humanization AND coherence.
 
-
-
+        Args:
+            response: The generated response text
+            prompt: The original prompt (required for coherence check)
+            **kwargs: Additional arguments
+
+        Returns:
+            Dict with score, human_prob, coherence_score, etc.
+        """
+        # If no prompt provided, skip coherence check (fallback behavior)
+        if not prompt:
+            logger.warning("No prompt provided, skipping coherence check")
+            desklib_result = self._desklib(response)
+            return {
+                "score": desklib_result["human_prob"],
+                "human_prob": desklib_result["human_prob"],
+                "ai_prob": desklib_result["ai_prob"],
+                "coherence_score": None,
+                "coherence_skipped": True,
+            }
+
+        # Check semantic coherence - does response actually answer the prompt?
+        coherence_score = self._score_coherence(prompt, response)
 
         # If coherence is below threshold, return 0
         if coherence_score < self.coherence_threshold:
-            logger.warning(f"Coherence check failed: {coherence_score:.
+            logger.warning(f"Coherence check failed: {coherence_score:.3f} < {self.coherence_threshold}")
+            logger.warning(f"Prompt: {prompt[:50]}...")
             logger.warning(f"Response preview: {response[:100]}...")
             return {
                 "score": 0.0,
                 "human_prob": 0.0,
                 "ai_prob": 1.0,
                 "coherence_score": coherence_score,
-                "rejected_reason": f"Coherence {coherence_score:.
+                "rejected_reason": f"Coherence {coherence_score:.3f} below threshold {self.coherence_threshold}",
             }
 
         # Get Desklib score
         desklib_result = self._desklib(response)
         human_prob = desklib_result["human_prob"]
 
-        #
-        #
-
-        coherence_factor = self.coherence_weight + (1.0 - self.coherence_weight) * coherence_normalized
-
-        final_score = human_prob * coherence_factor
+        # Score is human_prob weighted by coherence
+        # Higher coherence = closer to full human_prob score
+        final_score = human_prob * coherence_score
 
         return {
             "score": final_score,
             "human_prob": human_prob,
             "ai_prob": desklib_result["ai_prob"],
             "coherence_score": coherence_score,
-            "coherence_factor": coherence_factor,
         }
 
 
 def create_humanization_coherent_evaluator(
-    coherence_threshold: float =
-    coherence_weight: float = 0.5,
+    coherence_threshold: float = 0.3,
     device: Optional[str] = None,
     **kwargs
 ) -> HumanizationCoherentEvaluator:
     """Create a humanization + coherence evaluator.
 
     Args:
-        coherence_threshold: Minimum coherence score to accept (default:
-
-        device: Device for Desklib model
+        coherence_threshold: Minimum coherence score (0-1) to accept (default: 0.3)
+        device: Device for models
 
     Returns:
         HumanizationCoherentEvaluator instance
     """
     return HumanizationCoherentEvaluator(
         coherence_threshold=coherence_threshold,
-        coherence_weight=coherence_weight,
         device=device,
     )
 
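The cross-encoder pattern in _score_coherence can be exercised standalone; a minimal sketch, assuming (as the class does) that the model emits a single logit that sigmoids into a 0-1 relevance score:

    import torch
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    name = "enochlev/coherence-all-mpnet-base-v2"
    tok = AutoTokenizer.from_pretrained(name)
    model = AutoModelForSequenceClassification.from_pretrained(name).eval()

    # The prompt/response pair is encoded jointly, as in _score_coherence above.
    inputs = tok("What is the capital of France?",
                 "Paris is the capital of France.",
                 return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        score = torch.sigmoid(model(**inputs).logits).item()
    # score near 1.0 = coherent; compared against the 0.3 default threshold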
@@ -127,26 +127,8 @@ class TruthfulQAGenEvaluator(BaseEvaluator):
 
         margin = sim_correct - sim_incorrect
 
-        # Decision logic
-        if
-            return EvalResult(
-                ground_truth="UNKNOWN",
-                method_used=self.name,
-                confidence=0.0,
-                details=f"Response similarity too low to both correct ({sim_correct:.3f}) and incorrect ({sim_incorrect:.3f}) answers",
-                meta=meta,
-            )
-
-        if abs(margin) < self.MIN_MARGIN:
-            return EvalResult(
-                ground_truth="UNKNOWN",
-                method_used=self.name,
-                confidence=0.0,
-                details=f"Margin too small ({margin:.3f}) to distinguish correct from incorrect",
-                meta=meta,
-            )
-
-        if sim_correct > sim_incorrect:
+        # Decision logic - always return TRUTHFUL or UNTRUTHFUL based on margin sign
+        if sim_correct >= sim_incorrect:
             # More similar to correct answers = TRUTHFUL
             confidence = min(1.0, 0.5 + margin)
             return EvalResult(
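The simplified decision logic ties confidence directly to the similarity margin. A worked example with invented similarities:

    sim_correct, sim_incorrect = 0.82, 0.62
    margin = sim_correct - sim_incorrect   # 0.20
    confidence = min(1.0, 0.5 + margin)    # 0.70 -> TRUTHFUL
    # sim_correct < sim_incorrect takes the UNTRUTHFUL branch instead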