wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl
This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their respective public registries.
- wisent/__init__.py +1 -1
- wisent/core/activations/__init__.py +22 -6
- wisent/core/activations/activation_cache.py +393 -0
- wisent/core/activations/activations.py +22 -40
- wisent/core/activations/activations_collector.py +145 -373
- wisent/core/activations/classifier_inference_strategy.py +195 -0
- wisent/core/activations/core/atoms.py +8 -92
- wisent/core/activations/extraction_strategy.py +480 -0
- wisent/core/agent/diagnose/response_diagnostics.py +3 -3
- wisent/core/agent/diagnose.py +3 -3
- wisent/core/autonomous_agent.py +2 -2
- wisent/core/classifiers/classifiers/core/atoms.py +3 -2
- wisent/core/cli/__init__.py +2 -1
- wisent/core/cli/agent/apply_steering.py +25 -31
- wisent/core/cli/agent/evaluate_response.py +18 -20
- wisent/core/cli/agent/train_classifier.py +36 -26
- wisent/core/cli/check_linearity.py +35 -3
- wisent/core/cli/cluster_benchmarks.py +470 -0
- wisent/core/cli/create_steering_vector.py +19 -9
- wisent/core/cli/diagnose_vectors.py +7 -4
- wisent/core/cli/estimate_unified_goodness_time.py +6 -4
- wisent/core/cli/generate_pairs_from_task.py +9 -56
- wisent/core/cli/generate_vector_from_task.py +4 -0
- wisent/core/cli/geometry_search.py +137 -0
- wisent/core/cli/get_activations.py +13 -37
- wisent/core/cli/method_optimizer.py +860 -0
- wisent/core/cli/modify_weights.py +3 -2
- wisent/core/cli/optimize.py +44 -5
- wisent/core/cli/optimize_classification.py +5 -6
- wisent/core/cli/optimize_sample_size.py +9 -23
- wisent/core/cli/optimize_steering.py +433 -159
- wisent/core/cli/optimize_weights.py +67 -7
- wisent/core/cli/preview_pairs.py +203 -0
- wisent/core/cli/steering_method_trainer.py +8 -7
- wisent/core/cli/steering_search_space.py +20 -15
- wisent/core/cli/tasks.py +31 -117
- wisent/core/cli/train_unified_goodness.py +18 -19
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1582 -177
- wisent/core/contrastive_pairs/diagnostics/linearity.py +70 -80
- wisent/core/contrastive_pairs/diagnostics/vector_quality.py +6 -5
- wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +5 -19
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +11 -5
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +146 -32
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +2 -2
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +98 -57
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +8 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +11 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval_aqua_rat.py +129 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +11 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +47 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
- wisent/core/data_loaders/loaders/lm_loader.py +12 -1
- wisent/core/evaluators/benchmark_specific/apps_evaluator.py +133 -0
- wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +6 -1
- wisent/core/evaluators/benchmark_specific/conala_evaluator.py +31 -168
- wisent/core/evaluators/custom/examples/humanization_coherent.py +89 -35
- wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +2 -20
- wisent/core/evaluators/personalization/coherence.py +46 -0
- wisent/core/geometry_runner.py +995 -0
- wisent/core/geometry_search_space.py +237 -0
- wisent/core/hyperparameter_optimizer.py +14 -14
- wisent/core/lm_eval_harness_ground_truth.py +7 -11
- wisent/core/main.py +6 -0
- wisent/core/models/core/atoms.py +5 -3
- wisent/core/models/wisent_model.py +9 -8
- wisent/core/opti/methods/opti_weights.py +29 -2
- wisent/core/optuna/classifier/activation_generator.py +14 -12
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
- wisent/core/optuna/steering/steering_optimization.py +14 -9
- wisent/core/parser_arguments/check_linearity_parser.py +12 -2
- wisent/core/parser_arguments/cluster_benchmarks_parser.py +31 -0
- wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +22 -2
- wisent/core/parser_arguments/geometry_search_parser.py +61 -0
- wisent/core/parser_arguments/main_parser.py +16 -0
- wisent/core/parser_arguments/optimize_steering_parser.py +117 -10
- wisent/core/parser_arguments/optimize_weights_parser.py +6 -0
- wisent/core/parser_arguments/tasks_parser.py +7 -19
- wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
- wisent/core/steering.py +5 -3
- wisent/core/steering_methods/core/atoms.py +1 -2
- wisent/core/steering_methods/methods/caa.py +1 -1
- wisent/core/steering_methods/methods/hyperplane.py +75 -0
- wisent/core/steering_methods/methods/prism.py +1 -2
- wisent/core/steering_methods/methods/pulse.py +39 -8
- wisent/core/steering_methods/methods/titan.py +59 -14
- wisent/core/steering_methods/registry.py +52 -12
- wisent/core/steering_optimizer.py +15 -15
- wisent/core/synthetic/generators/nonsense_generator.py +30 -18
- wisent/core/trainers/steering_trainer.py +11 -20
- wisent/core/utils/device.py +27 -27
- wisent/core/utils/layer_combinations.py +70 -0
- wisent/examples/__init__.py +1 -0
- wisent/examples/scripts/__init__.py +1 -0
- wisent/examples/scripts/count_all_benchmarks.py +121 -0
- wisent/examples/scripts/discover_directions.py +469 -0
- wisent/examples/scripts/extract_benchmark_info.py +71 -0
- wisent/examples/scripts/generate_paper_data.py +384 -0
- wisent/examples/scripts/intervention_validation.py +626 -0
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
- wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
- wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
- wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
- wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
- wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
- wisent/examples/scripts/search_all_short_names.py +31 -0
- wisent/examples/scripts/test_all_benchmarks.py +138 -0
- wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
- wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
- wisent/examples/scripts/test_nonsense_baseline.py +261 -0
- wisent/examples/scripts/test_one_benchmark.py +324 -0
- wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
- wisent/examples/scripts/threshold_analysis.py +434 -0
- wisent/examples/scripts/visualization_gallery.py +582 -0
- wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
- wisent/parameters/lm_eval/category_directions.json +137 -0
- wisent/parameters/lm_eval/repair_plan.json +282 -0
- wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +19 -70
- wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
- wisent/parameters/lm_eval/working_benchmarks.json +206 -0
- wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
- wisent/scripts/run_quality_metrics_sweep.sh +22 -27
- wisent/tests/test_aggregation_geometry.py +236 -0
- wisent/tests/test_detector_accuracy.py +163 -0
- wisent/tests/test_geometry_exhaustive.py +1202 -0
- wisent/tests/visualize_geometry.py +255 -61
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/RECORD +376 -974
- wisent/core/activations/prompt_construction_strategy.py +0 -47
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +0 -15
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +0 -64
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +0 -99
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +0 -180
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +0 -129
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +0 -142
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +0 -155
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +0 -161
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +0 -107
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +0 -155
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +0 -155
- wisent/examples/scripts/results/benchmark_descriptions.json +0 -1244
- wisent/examples/scripts/results/benchmark_evaluation_methods.json +0 -66
- wisent/examples/scripts/results/benchmark_evaluator_mapping.json +0 -2781
- wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +0 -30536
- wisent/examples/scripts/results/benchmark_evaluators_clean.json +0 -469
- wisent/examples/scripts/results/benchmark_methods_summary.json +0 -260
- wisent/examples/scripts/results/benchmark_pair_creation_methods.json +0 -66
- wisent/examples/scripts/results/benchmark_pair_totals.json +0 -269
- wisent/examples/scripts/results/benchmark_tags.json +0 -917
- wisent/examples/scripts/results/benchmark_test_summary_nov4.json +0 -71
- wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +0 -150
- wisent/examples/scripts/results/failing_benchmarks.json +0 -946
- wisent/examples/scripts/results/failing_benchmarks_list.json +0 -41
- wisent/examples/scripts/results/failing_benchmarks_test_results.json +0 -945
- wisent/examples/scripts/results/missing_benchmark_tags.json +0 -341
- wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +0 -30
- wisent/examples/scripts/results/test_20_newsgroups_pairs.json +0 -8
- wisent/examples/scripts/results/test_AraDICE_evaluation.json +0 -51
- wisent/examples/scripts/results/test_AraDICE_pairs.json +0 -14
- wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +0 -8
- wisent/examples/scripts/results/test_ArabCulture_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ArabCulture_pairs.json +0 -14
- wisent/examples/scripts/results/test_Tag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_Tag_pairs.json +0 -8
- wisent/examples/scripts/results/test_aclue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aclue_pairs.json +0 -14
- wisent/examples/scripts/results/test_acp_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +0 -14
- wisent/examples/scripts/results/test_acp_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +0 -51
- wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +0 -14
- wisent/examples/scripts/results/test_aexams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aexams_pairs.json +0 -14
- wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_ag_news_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ag_news_pairs.json +0 -8
- wisent/examples/scripts/results/test_agieval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_agieval_pairs.json +0 -14
- wisent/examples/scripts/results/test_aime2024_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime2024_pairs.json +0 -8
- wisent/examples/scripts/results/test_aime2025_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime2025_pairs.json +0 -8
- wisent/examples/scripts/results/test_aime_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime_pairs.json +0 -8
- wisent/examples/scripts/results/test_anagrams1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anagrams1_pairs.json +0 -8
- wisent/examples/scripts/results/test_anagrams2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anagrams2_pairs.json +0 -8
- wisent/examples/scripts/results/test_anli_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anli_pairs.json +0 -8
- wisent/examples/scripts/results/test_apps_evaluation.json +0 -30
- wisent/examples/scripts/results/test_apps_pairs.json +0 -8
- wisent/examples/scripts/results/test_arabic_exams_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arabic_exams_pairs.json +0 -8
- wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +0 -14
- wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +0 -14
- wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabicmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +0 -14
- wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +0 -14
- wisent/examples/scripts/results/test_arc_ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_arc_challenge_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_challenge_pairs.json +0 -8
- wisent/examples/scripts/results/test_arc_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_argument_topic_evaluation.json +0 -30
- wisent/examples/scripts/results/test_argument_topic_pairs.json +0 -8
- wisent/examples/scripts/results/test_arithmetic_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arithmetic_pairs.json +0 -14
- wisent/examples/scripts/results/test_asdiv_evaluation.json +0 -30
- wisent/examples/scripts/results/test_asdiv_pairs.json +0 -8
- wisent/examples/scripts/results/test_assin_entailment_evaluation.json +0 -30
- wisent/examples/scripts/results/test_assin_entailment_pairs.json +0 -8
- wisent/examples/scripts/results/test_atis_evaluation.json +0 -30
- wisent/examples/scripts/results/test_atis_pairs.json +0 -8
- wisent/examples/scripts/results/test_babi_evaluation.json +0 -30
- wisent/examples/scripts/results/test_babi_pairs.json +0 -8
- wisent/examples/scripts/results/test_babilong_evaluation.json +0 -30
- wisent/examples/scripts/results/test_babilong_pairs.json +0 -8
- wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +0 -8
- wisent/examples/scripts/results/test_banking77_evaluation.json +0 -30
- wisent/examples/scripts/results/test_banking77_pairs.json +0 -8
- wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basqueglue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basqueglue_pairs.json +0 -14
- wisent/examples/scripts/results/test_bbh_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bbh_pairs.json +0 -14
- wisent/examples/scripts/results/test_bbq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bbq_pairs.json +0 -8
- wisent/examples/scripts/results/test_bec2016eu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bec2016eu_pairs.json +0 -14
- wisent/examples/scripts/results/test_belebele_evaluation.json +0 -51
- wisent/examples/scripts/results/test_belebele_pairs.json +0 -14
- wisent/examples/scripts/results/test_benchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_benchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_bertaqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bertaqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bhtc_v2_pairs.json +0 -8
- wisent/examples/scripts/results/test_bigbench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bigbench_pairs.json +0 -14
- wisent/examples/scripts/results/test_blimp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_blimp_pairs.json +0 -14
- wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +0 -8
- wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +0 -8
- wisent/examples/scripts/results/test_boolq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq_pairs.json +0 -8
- wisent/examples/scripts/results/test_c4_evaluation.json +0 -30
- wisent/examples/scripts/results/test_c4_pairs.json +0 -8
- wisent/examples/scripts/results/test_cabreu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cabreu_pairs.json +0 -8
- wisent/examples/scripts/results/test_careqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_careqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_catalan_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_catalan_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_catalanqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_catalanqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_catcola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_catcola_pairs.json +0 -8
- wisent/examples/scripts/results/test_cb_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cb_pairs.json +0 -8
- wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +0 -14
- wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +0 -8
- wisent/examples/scripts/results/test_ceval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval_pairs.json +0 -14
- wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +0 -14
- wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +0 -51
- wisent/examples/scripts/results/test_chain_of_thought_pairs.json +0 -14
- wisent/examples/scripts/results/test_chartqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_chartqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +0 -30
- wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +0 -8
- wisent/examples/scripts/results/test_cmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_cmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +0 -8
- wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cocoteros_es_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +0 -8
- wisent/examples/scripts/results/test_coedit_gec_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coedit_gec_pairs.json +0 -8
- wisent/examples/scripts/results/test_cola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cola_pairs.json +0 -8
- wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_commonsense_qa_pairs.json +0 -8
- wisent/examples/scripts/results/test_conala_evaluation.json +0 -30
- wisent/examples/scripts/results/test_conala_pairs.json +0 -8
- wisent/examples/scripts/results/test_concode_evaluation.json +0 -30
- wisent/examples/scripts/results/test_concode_pairs.json +0 -8
- wisent/examples/scripts/results/test_copa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_copa_pairs.json +0 -8
- wisent/examples/scripts/results/test_copal_id_evaluation.json +0 -30
- wisent/examples/scripts/results/test_copal_id_pairs.json +0 -8
- wisent/examples/scripts/results/test_coqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_coqcat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coqcat_pairs.json +0 -8
- wisent/examples/scripts/results/test_crows_pairs_evaluation.json +0 -51
- wisent/examples/scripts/results/test_crows_pairs_pairs.json +0 -14
- wisent/examples/scripts/results/test_csatqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_csatqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_cycle_letters_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cycle_letters_pairs.json +0 -8
- wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_darija_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darija_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_darijahellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_darijammlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darijammlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +0 -30
- wisent/examples/scripts/results/test_dbpedia_14_pairs.json +0 -8
- wisent/examples/scripts/results/test_drop_evaluation.json +0 -30
- wisent/examples/scripts/results/test_drop_pairs.json +0 -8
- wisent/examples/scripts/results/test_ds1000_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ds1000_pairs.json +0 -8
- wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_egyhellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_egymmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_egymmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +0 -30
- wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +0 -8
- wisent/examples/scripts/results/test_eq_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eq_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_escola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_escola_pairs.json +0 -8
- wisent/examples/scripts/results/test_ethics_cm_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ethics_cm_pairs.json +0 -8
- wisent/examples/scripts/results/test_ethos_binary_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ethos_binary_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams_es_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_exams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_proficiency_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_reading_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_reading_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_trivia_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_trivia_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita-mp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita-mp_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_LLM_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
- wisent/examples/scripts/results/test_fda_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fda_pairs.json +0 -8
- wisent/examples/scripts/results/test_financial_tweets_evaluation.json +0 -30
- wisent/examples/scripts/results/test_financial_tweets_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_flores_evaluation.json +0 -51
- wisent/examples/scripts/results/test_flores_pairs.json +0 -14
- wisent/examples/scripts/results/test_freebase_evaluation.json +0 -30
- wisent/examples/scripts/results/test_freebase_pairs.json +0 -8
- wisent/examples/scripts/results/test_french_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_french_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_galcola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_galcola_pairs.json +0 -8
- wisent/examples/scripts/results/test_galician_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_galician_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_glianorex_evaluation.json +0 -30
- wisent/examples/scripts/results/test_glianorex_pairs.json +0 -8
- wisent/examples/scripts/results/test_global_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_global_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_gpqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_gpqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_groundcocoa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_groundcocoa_pairs.json +0 -8
- wisent/examples/scripts/results/test_gsm8k_evaluation.json +0 -30
- wisent/examples/scripts/results/test_gsm8k_pairs.json +0 -8
- wisent/examples/scripts/results/test_haerae_evaluation.json +0 -51
- wisent/examples/scripts/results/test_haerae_pairs.json +0 -14
- wisent/examples/scripts/results/test_headqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_headqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_hellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +0 -14
- wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hendrycks_math_pairs.json +0 -14
- wisent/examples/scripts/results/test_histoires_morales_evaluation.json +0 -30
- wisent/examples/scripts/results/test_histoires_morales_pairs.json +0 -8
- wisent/examples/scripts/results/test_hmmt_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +0 -8
- wisent/examples/scripts/results/test_hmmt_pairs.json +0 -8
- wisent/examples/scripts/results/test_hrm8k_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hrm8k_pairs.json +0 -14
- wisent/examples/scripts/results/test_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +0 -30
- wisent/examples/scripts/results/test_humaneval_plus_pairs.json +0 -8
- wisent/examples/scripts/results/test_ifeval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ifeval_pairs.json +0 -8
- wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +0 -51
- wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +0 -30
- wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +0 -8
- wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +0 -51
- wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +0 -14
- wisent/examples/scripts/results/test_inverse_scaling_pairs.json +0 -14
- wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +0 -8
- wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +0 -14
- wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_kbl_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +0 -14
- wisent/examples/scripts/results/test_kbl_pairs.json +0 -14
- wisent/examples/scripts/results/test_kmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_kobest_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kobest_pairs.json +0 -14
- wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +0 -8
- wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_cloze_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_openai_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_openai_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_standard_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_standard_pairs.json +0 -8
- wisent/examples/scripts/results/test_leaderboard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_leaderboard_pairs.json +0 -14
- wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +0 -51
- wisent/examples/scripts/results/test_libra/test_libra_pairs.json +0 -14
- wisent/examples/scripts/results/test_libra_evaluation.json +0 -51
- wisent/examples/scripts/results/test_libra_pairs.json +0 -14
- wisent/examples/scripts/results/test_lingoly_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lingoly_pairs.json +0 -8
- wisent/examples/scripts/results/test_livecodebench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livecodebench_pairs.json +0 -8
- wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +0 -8
- wisent/examples/scripts/results/test_llama_evaluation.json +0 -30
- wisent/examples/scripts/results/test_llama_pairs.json +0 -8
- wisent/examples/scripts/results/test_logiqa2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_logiqa2_pairs.json +0 -8
- wisent/examples/scripts/results/test_logiqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_logiqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_m_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_m_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +0 -14
- wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_mastermind_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mastermind_pairs.json +0 -14
- wisent/examples/scripts/results/test_math500_evaluation.json +0 -30
- wisent/examples/scripts/results/test_math500_pairs.json +0 -8
- wisent/examples/scripts/results/test_math_evaluation.json +0 -30
- wisent/examples/scripts/results/test_math_pairs.json +0 -8
- wisent/examples/scripts/results/test_mathqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mathqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_mbpp_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mbpp_pairs.json +0 -8
- wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mbpp_plus_pairs.json +0 -8
- wisent/examples/scripts/results/test_mc_taco_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mc_taco_pairs.json +0 -8
- wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +0 -14
- wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +0 -14
- wisent/examples/scripts/results/test_meddialog_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meddialog_pairs.json +0 -8
- wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +0 -8
- wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +0 -8
- wisent/examples/scripts/results/test_medmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_medqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_medtext_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medtext_pairs.json +0 -8
- wisent/examples/scripts/results/test_mela_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mela_pairs.json +0 -14
- wisent/examples/scripts/results/test_meqsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meqsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_mercury_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mercury_pairs.json +0 -8
- wisent/examples/scripts/results/test_metabench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_metabench_pairs.json +0 -14
- wisent/examples/scripts/results/test_mgsm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mgsm_pairs.json +0 -14
- wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mimic_repsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_minerva_math_evaluation.json +0 -51
- wisent/examples/scripts/results/test_minerva_math_pairs.json +0 -14
- wisent/examples/scripts/results/test_mlqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mlqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_pro_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_prox_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlusr_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mmlusr_pairs.json +0 -8
- wisent/examples/scripts/results/test_mmmu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmmu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mnli_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mnli_pairs.json +0 -8
- wisent/examples/scripts/results/test_model_written_evals_evaluation.json +0 -51
- wisent/examples/scripts/results/test_model_written_evals_pairs.json +0 -14
- wisent/examples/scripts/results/test_moral_stories_evaluation.json +0 -30
- wisent/examples/scripts/results/test_moral_stories_pairs.json +0 -8
- wisent/examples/scripts/results/test_mts_dialog_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mts_dialog_pairs.json +0 -8
- wisent/examples/scripts/results/test_multiblimp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_multiblimp_pairs.json +0 -14
- wisent/examples/scripts/results/test_multimedqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_multimedqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_multipl_e_evaluation.json +0 -30
- wisent/examples/scripts/results/test_multipl_e_pairs.json +0 -8
- wisent/examples/scripts/results/test_mutual_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mutual_pairs.json +0 -8
- wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_noreval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_noreval_pairs.json +0 -14
- wisent/examples/scripts/results/test_noticia_evaluation.json +0 -30
- wisent/examples/scripts/results/test_noticia_pairs.json +0 -8
- wisent/examples/scripts/results/test_nq_open_evaluation.json +0 -30
- wisent/examples/scripts/results/test_nq_open_pairs.json +0 -8
- wisent/examples/scripts/results/test_olaph_evaluation.json +0 -30
- wisent/examples/scripts/results/test_olaph_pairs.json +0 -8
- wisent/examples/scripts/results/test_openbookqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_openbookqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_openllm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_openllm_pairs.json +0 -14
- wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_paloma_evaluation.json +0 -51
- wisent/examples/scripts/results/test_paloma_pairs.json +0 -14
- wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +0 -30
- wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +0 -8
- wisent/examples/scripts/results/test_paws-x_evaluation.json +0 -51
- wisent/examples/scripts/results/test_paws-x_pairs.json +0 -14
- wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_penn_treebank_evaluation.json +0 -30
- wisent/examples/scripts/results/test_penn_treebank_pairs.json +0 -8
- wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +0 -30
- wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +0 -8
- wisent/examples/scripts/results/test_piqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_piqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_polemo2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polemo2_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_en_high_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +0 -8
- wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_portuguese_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_prost_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prost_pairs.json +0 -8
- wisent/examples/scripts/results/test_ptb_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ptb_pairs.json +0 -8
- wisent/examples/scripts/results/test_pubmedqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_pubmedqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_pythia_evaluation.json +0 -51
- wisent/examples/scripts/results/test_pythia_pairs.json +0 -14
- wisent/examples/scripts/results/test_qa4mre_evaluation.json +0 -30
- wisent/examples/scripts/results/test_qa4mre_pairs.json +0 -8
- wisent/examples/scripts/results/test_qasper_evaluation.json +0 -30
- wisent/examples/scripts/results/test_qasper_pairs.json +0 -8
- wisent/examples/scripts/results/test_race_evaluation.json +0 -30
- wisent/examples/scripts/results/test_race_pairs.json +0 -8
- wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +0 -30
- wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +0 -8
- wisent/examples/scripts/results/test_recode_evaluation.json +0 -30
- wisent/examples/scripts/results/test_recode_pairs.json +0 -8
- wisent/examples/scripts/results/test_record_evaluation.json +0 -30
- wisent/examples/scripts/results/test_record_pairs.json +0 -8
- wisent/examples/scripts/results/test_ruler_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ruler_pairs.json +0 -14
- wisent/examples/scripts/results/test_sciq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_sciq_pairs.json +0 -8
- wisent/examples/scripts/results/test_score_evaluation.json +0 -51
- wisent/examples/scripts/results/test_score_pairs.json +0 -14
- wisent/examples/scripts/results/test_self_consistency_evaluation.json +0 -30
- wisent/examples/scripts/results/test_self_consistency_pairs.json +0 -8
- wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_siqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_siqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_spanish_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_spanish_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_squad2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_squad2_pairs.json +0 -8
- wisent/examples/scripts/results/test_squadv2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_squadv2_pairs.json +0 -8
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +0 -8
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +0 -51
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +0 -14
- wisent/examples/scripts/results/test_swag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_swag_pairs.json +0 -8
- wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_tmmluplus_evaluation.json +0 -51
- wisent/examples/scripts/results/test_tmmluplus_pairs.json +0 -14
- wisent/examples/scripts/results/test_translation_evaluation.json +0 -51
- wisent/examples/scripts/results/test_translation_pairs.json +0 -14
- wisent/examples/scripts/results/test_triviaqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_triviaqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +0 -51
- wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +0 -14
- wisent/examples/scripts/results/test_truthfulqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_turkishmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_unfair_tos_evaluation.json +0 -30
- wisent/examples/scripts/results/test_unfair_tos_pairs.json +0 -8
- wisent/examples/scripts/results/test_unscramble_evaluation.json +0 -51
- wisent/examples/scripts/results/test_unscramble_pairs.json +0 -14
- wisent/examples/scripts/results/test_webqs_evaluation.json +0 -30
- wisent/examples/scripts/results/test_webqs_pairs.json +0 -8
- wisent/examples/scripts/results/test_wikitext103_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wikitext103_pairs.json +0 -8
- wisent/examples/scripts/results/test_wikitext_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wikitext_pairs.json +0 -8
- wisent/examples/scripts/results/test_winogender_evaluation.json +0 -51
- wisent/examples/scripts/results/test_winogender_pairs.json +0 -14
- wisent/examples/scripts/results/test_winogrande_evaluation.json +0 -30
- wisent/examples/scripts/results/test_winogrande_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmdp_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmdp_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_wsc273_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wsc273_pairs.json +0 -8
- wisent/examples/scripts/results/test_xcopa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xcopa_pairs.json +0 -14
- wisent/examples/scripts/results/test_xnli_eu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_xnli_eu_pairs.json +0 -8
- wisent/examples/scripts/results/test_xnli_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xnli_pairs.json +0 -14
- wisent/examples/scripts/results/test_xquad_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xquad_pairs.json +0 -14
- wisent/examples/scripts/results/test_xstorycloze_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xstorycloze_pairs.json +0 -14
- wisent/examples/scripts/results/test_xsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_xsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_xwinograd_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xwinograd_pairs.json +0 -14
- wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +0 -30
- wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +0 -8
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import random
 from typing import TYPE_CHECKING
 
 from wisent.core.contrastive_pairs.lm_eval_pairs.lm_extractor_registry import get_extractor
@@ -10,17 +11,178 @@ if TYPE_CHECKING:
     from lm_eval.api.task import ConfigurableTask
     from wisent.core.contrastive_pairs.core.pair import ContrastivePair
 
-__all__ = ["build_contrastive_pairs"]
+__all__ = ["build_contrastive_pairs", "lm_build_contrastive_pairs"]
 _LOG = setup_logger(__name__)
 
 
+def _flatten_task_dict(task_dict: dict, prefix: str = "") -> list[tuple[str, "ConfigurableTask"]]:
+    """
+    Recursively flatten nested group tasks into a list of (name, ConfigurableTask) tuples.
+
+    arguments:
+        task_dict: Dict of task_name -> ConfigurableTask or nested dict
+        prefix: Prefix for nested task names
+
+    returns:
+        List of (full_task_name, ConfigurableTask) tuples (leaf tasks only)
+    """
+    from lm_eval.api.task import ConfigurableTask
+
+    result = []
+    for name, task in task_dict.items():
+        full_name = f"{prefix}/{name}" if prefix else name
+        if isinstance(task, ConfigurableTask):
+            result.append((full_name, task))
+        elif isinstance(task, dict):
+            # Nested group - recurse
+            result.extend(_flatten_task_dict(task, full_name))
+    return result
+
+
+def _add_evaluator_to_pairs(
+    pairs: list["ContrastivePair"],
+    evaluator_name: str | None,
+    task_name: str,
+) -> list["ContrastivePair"]:
+    """Add evaluator_name and task_name to each pair's metadata."""
+    from dataclasses import replace
+
+    result = []
+    for pair in pairs:
+        metadata = dict(pair.metadata) if pair.metadata else {}
+        metadata["evaluator_name"] = evaluator_name
+        metadata["source_task"] = task_name
+        result.append(replace(pair, metadata=metadata))
+    return result
+
+
+def build_contrastive_pairs(
+    task_name: str,
+    limit: int | None = None,
+) -> list["ContrastivePair"]:
+    """
+    Unified loader for contrastive pairs - handles both HuggingFace and lm-eval tasks.
+
+    Automatically:
+    - Detects if task is HF or lm-eval
+    - Handles group tasks (including nested groups) by sampling from all subtasks
+    - Adds evaluator_name to each pair's metadata
+
+    arguments:
+        task_name:
+            Name of the benchmark/task (e.g., "winogrande", "mmlu", "humaneval").
+        limit:
+            Optional upper bound on the number of pairs to return.
+            Values <= 0 are treated as "no limit".
+
+    returns:
+        A list of ContrastivePair objects, each with metadata containing
+        'evaluator_name' and 'source_task'.
+    """
+    log = bind(_LOG, task=task_name or "unknown")
+    log.info("Building contrastive pairs (unified)", extra={"limit": limit})
+
+    # Normalize limit
+    max_items = None if (limit is None or limit <= 0) else int(limit)
+
+    # Get extractor
+    extractor = get_extractor(task_name)
+    log.info("Using extractor", extra={"extractor": extractor.__class__.__name__})
+
+    # Get evaluator_name from extractor
+    evaluator_name = getattr(extractor, 'evaluator_name', None)
+
+    # HuggingFace extractor - load directly
+    if isinstance(extractor, HuggingFaceBenchmarkExtractor):
+        log.info("HuggingFace task - loading directly")
+        pairs = extractor.extract_contrastive_pairs(limit=max_items)
+        return _add_evaluator_to_pairs(pairs, evaluator_name, task_name)
+
+    # lm-eval extractor - need to load task
+    log.info("lm-eval task - loading via LMEvalDataLoader")
+    from wisent.core.data_loaders.loaders.lm_loader import LMEvalDataLoader
+
+    loader = LMEvalDataLoader()
+    try:
+        task_obj = loader.load_lm_eval_task(task_name)
+    except Exception as e:
+        log.error(f"Failed to load lm-eval task: {e}")
+        raise
+
+    # Single task (ConfigurableTask)
+    from lm_eval.api.task import ConfigurableTask
+    if isinstance(task_obj, ConfigurableTask):
+        log.info("Single task")
+        pairs = extractor.extract_contrastive_pairs(task_obj, limit=max_items)
+        return _add_evaluator_to_pairs(pairs, evaluator_name, task_name)
+
+    # Group task (dict) - flatten and sample from all subtasks
+    if isinstance(task_obj, dict):
+        leaf_tasks = _flatten_task_dict(task_obj)
+        log.info(f"Group task with {len(leaf_tasks)} leaf subtasks")
+
+        if not leaf_tasks:
+            log.warning("No leaf tasks found in group")
+            return []
+
+        # Shuffle to get random sampling across subtasks
+        random.shuffle(leaf_tasks)
+
+        # Calculate pairs per subtask
+        if max_items is None:
+            pairs_per_task = None
+        else:
+            # Distribute limit across subtasks, minimum 1 per task
+            pairs_per_task = max(1, max_items // len(leaf_tasks))
+
+        all_pairs = []
+        for subtask_name, subtask in leaf_tasks:
+            try:
+                # Get the leaf task name (last part after /)
+                leaf_name = subtask_name.split("/")[-1] if "/" in subtask_name else subtask_name
+
+                # Try to get extractor for the specific subtask first
+                try:
+                    subtask_extractor = get_extractor(leaf_name)
+                except:
+                    # Fall back to parent extractor
+                    subtask_extractor = extractor
+
+                subtask_evaluator = getattr(subtask_extractor, 'evaluator_name', evaluator_name)
+
+                subtask_pairs = subtask_extractor.extract_contrastive_pairs(subtask, limit=pairs_per_task)
+                subtask_pairs = _add_evaluator_to_pairs(subtask_pairs, subtask_evaluator, subtask_name)
+                all_pairs.extend(subtask_pairs)
+
+                # Stop if we have enough
+                if max_items is not None and len(all_pairs) >= max_items:
+                    break
+            except Exception as e:
+                log.warning(f"Failed to extract from subtask {subtask_name}: {e}")
+                continue
+
+        # Shuffle final result and trim to limit
+        random.shuffle(all_pairs)
+        if max_items is not None:
+            all_pairs = all_pairs[:max_items]
+
+        log.info(f"Extracted {len(all_pairs)} pairs from group task")
+        return all_pairs
+
+    log.error(f"Unexpected task_obj type: {type(task_obj)}")
+    return []
+
+
 def lm_build_contrastive_pairs(
     task_name: str,
-    lm_eval_task: ConfigurableTask | None,
+    lm_eval_task: "ConfigurableTask | None",
     limit: int | None = None,
-) -> list[ContrastivePair]:
+) -> list["ContrastivePair"]:
     """
-
+    Legacy function - resolve the task's extractor and return contrastive pairs.
+
+    For new code, prefer using build_contrastive_pairs() which handles
+    task loading automatically.
 
     arguments:
         task_name:
@@ -47,10 +209,15 @@ def lm_build_contrastive_pairs(
     max_items = None if (limit is None or limit <= 0) else int(limit)
 
     log.info("Extracting contrastive pairs", extra={"max_items": max_items})
+
+    # Get evaluator_name from extractor
+    evaluator_name = getattr(extractor, 'evaluator_name', None)
 
     # 3) Delegate: extractor loads docs and builds pairs
     # HuggingFace extractors don't need lm_eval_task - they load data directly from HuggingFace
     if isinstance(extractor, HuggingFaceBenchmarkExtractor):
-
+        pairs = extractor.extract_contrastive_pairs(limit=max_items)
     else:
-
+        pairs = extractor.extract_contrastive_pairs(lm_eval_task, limit=max_items)
+
+    return _add_evaluator_to_pairs(pairs, evaluator_name, task_name)
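Taken together, `build_contrastive_pairs` now dispatches on extractor type and task shape, while `lm_build_contrastive_pairs` stays behind for callers that already hold a loaded task. A minimal usage sketch; the import path is an assumption, since this diff does not name the module file:

```python
# Import path assumed for illustration only.
from wisent.core.contrastive_pairs.lm_eval_pairs import build_contrastive_pairs

# One call covers HuggingFace tasks, single lm-eval tasks, and nested groups.
pairs = build_contrastive_pairs("mmlu", limit=100)
for pair in pairs[:3]:
    # Every pair is tagged with the evaluator and the concrete subtask it came from.
    print(pair.metadata["evaluator_name"], pair.metadata["source_task"])
```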
@@ -10,6 +10,10 @@ os.environ['TF_NUM_INTEROP_THREADS'] = '1'
 os.environ['TF_NUM_INTRAOP_THREADS'] = '1'
 os.environ['OMP_NUM_THREADS'] = '1'
 
+# Allow code evaluation for code-related tasks (humaneval, etc.)
+# Required by HuggingFace evaluate library for code_eval metric
+os.environ['HF_ALLOW_CODE_EVAL'] = '1'
+
 # Enable trust_remote_code for all datasets (required for meddialog and others)
 # This uses lm-eval's recommended approach from PR #1998
 import datasets.config
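`HF_ALLOW_CODE_EVAL` is the opt-in switch the HuggingFace `evaluate` library checks before its `code_eval` metric will execute model-generated code. A minimal sketch of what the flag unblocks:

```python
import os

# code_eval executes untrusted model output, so evaluate refuses to run
# unless this opt-in flag is set - the same flag the hunk above sets globally.
os.environ["HF_ALLOW_CODE_EVAL"] = "1"

import evaluate

code_eval = evaluate.load("code_eval")
pass_at_k, _ = code_eval.compute(
    references=["assert add(2, 3) == 5"],                # one test per problem
    predictions=[["def add(a, b):\n    return a + b"]],  # candidate solutions per problem
    k=[1],
)
print(pass_at_k["pass@1"])  # 1.0 when the candidate passes its test
```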
@@ -294,6 +298,8 @@ class LMEvalDataLoader(BaseDataLoader):
             "tinytruthfulqa": "tinyTruthfulQA",
             "tinywinogrande": "tinyWinogrande",
             "paws-x": "pawsx",
+            # afrobench subtasks
+            "afrobench_adr": "adr",
         }
 
         # Use mapped name if available, otherwise use original
@@ -302,7 +308,9 @@
         log.info(f"Mapping task '{task_name}' to lm-eval task '{lm_eval_task_name}'")
 
         # Tasks that require case-sensitive names (don't lowercase these)
-
+        # AraDiCE tasks have mixed case (e.g., AraDiCE_ArabicMMLU_lev)
+        # aexams tasks have mixed case (e.g., aexams_IslamicStudies)
+        case_sensitive_prefixes = {"tinyBenchmarks", "AraDiCE", "aexams_"}
 
         # Normalize task name to lowercase for lm-eval-harness compatibility
         # Many lm-eval tasks use lowercase names (e.g., "aradice" not "AraDICE")
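The rule being extended here: apply the explicit mapping first, then lowercase unless the name belongs to a mixed-case family. A standalone sketch under those assumptions, with hypothetical variable and function names:

```python
# Trimmed illustrative copies of the loader's tables, not its real internals.
TASK_NAME_MAP = {"paws-x": "pawsx", "afrobench_adr": "adr"}
CASE_SENSITIVE_PREFIXES = {"tinyBenchmarks", "AraDiCE", "aexams_"}

def normalize_task_name(task_name: str) -> str:
    mapped = TASK_NAME_MAP.get(task_name, task_name)
    # Mixed-case families like AraDiCE_* and aexams_* must keep their casing.
    if any(mapped.startswith(prefix) for prefix in CASE_SENSITIVE_PREFIXES):
        return mapped
    return mapped.lower()

assert normalize_task_name("paws-x") == "pawsx"
assert normalize_task_name("aexams_IslamicStudies") == "aexams_IslamicStudies"
```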
@@ -379,6 +387,9 @@
             "noreval": ["ask_gec_p0", "ask_gec_p1", "ask_gec_p2", "ask_gec_p3", "ask_gec_p4", "ncb", "norbelebele_p0", "norbelebele_p1", "norbelebele_p2", "norbelebele_p3", "norbelebele_p4", "norcommonsenseqa_nno_p0", "norcommonsenseqa_nno_p1", "norcommonsenseqa_nno_p2", "norcommonsenseqa_nno_p3", "norcommonsenseqa_nno_p4", "norcommonsenseqa_nob_p0", "norcommonsenseqa_nob_p1", "norcommonsenseqa_nob_p2", "norcommonsenseqa_nob_p3", "norcommonsenseqa_nob_p4", "norec_document_p0", "norec_document_p1", "norec_document_p2", "norec_document_p3", "norec_document_p4", "norec_sentence_p0", "norec_sentence_p1", "norec_sentence_p2", "norec_sentence_p3", "norec_sentence_p4", "noridiom_nno_p0", "noridiom_nno_p1", "noridiom_nno_p2", "noridiom_nno_p3", "noridiom_nno_p4", "noridiom_nob_p0", "noridiom_nob_p1", "noridiom_nob_p2", "noridiom_nob_p3", "noridiom_nob_p4", "noropenbookqa_nno_p0", "noropenbookqa_nno_p1", "noropenbookqa_nno_p2", "noropenbookqa_nno_p3", "noropenbookqa_nno_p4", "noropenbookqa_nob_p0", "noropenbookqa_nob_p1", "noropenbookqa_nob_p2", "noropenbookqa_nob_p3", "noropenbookqa_nob_p4", "norquad_p0", "norquad_p1", "norquad_p2", "norquad_p3", "norquad_p4", "norrewrite_instruct", "norsumm_nno_p0", "norsumm_nno_p1", "norsumm_nno_p2", "norsumm_nno_p3", "norsumm_nno_p4", "norsumm_nno_p5", "norsumm_nob_p0", "norsumm_nob_p1", "norsumm_nob_p2", "norsumm_nob_p3", "norsumm_nob_p4", "norsumm_nob_p5", "norsummarize_instruct", "nortruthfulqa_gen_nno_p0", "nortruthfulqa_gen_nno_p1", "nortruthfulqa_gen_nno_p2", "nortruthfulqa_gen_nno_p3", "nortruthfulqa_gen_nno_p4", "nortruthfulqa_gen_nob_p0", "nortruthfulqa_gen_nob_p1", "nortruthfulqa_gen_nob_p2", "nortruthfulqa_gen_nob_p3", "nortruthfulqa_gen_nob_p4", "nortruthfulqa_mc_nno_p0", "nortruthfulqa_mc_nno_p1", "nortruthfulqa_mc_nno_p2", "nortruthfulqa_mc_nno_p3", "nortruthfulqa_mc_nno_p4", "nortruthfulqa_mc_nob_p0", "nortruthfulqa_mc_nob_p1", "nortruthfulqa_mc_nob_p2", "nortruthfulqa_mc_nob_p3", "nortruthfulqa_mc_nob_p4", "nrk_quiz_qa_nno_p0", "nrk_quiz_qa_nno_p1", "nrk_quiz_qa_nno_p2", "nrk_quiz_qa_nno_p3", "nrk_quiz_qa_nno_p4", "nrk_quiz_qa_nob_p0", "nrk_quiz_qa_nob_p1", "nrk_quiz_qa_nob_p2", "nrk_quiz_qa_nob_p3", "nrk_quiz_qa_nob_p4", "tatoeba_eng_nno_p0", "tatoeba_eng_nno_p1", "tatoeba_eng_nno_p2", "tatoeba_eng_nno_p3", "tatoeba_eng_nob_p0", "tatoeba_eng_nob_p1", "tatoeba_eng_nob_p2", "tatoeba_eng_nob_p3", "tatoeba_nno_eng_p0", "tatoeba_nno_eng_p1", "tatoeba_nno_eng_p2", "tatoeba_nno_eng_p3", "tatoeba_nob_eng_p0", "tatoeba_nob_eng_p1", "tatoeba_nob_eng_p2", "tatoeba_nob_eng_p3"],
             "storycloze": ["xstorycloze_en"],
             "instructhumaneval": ["humaneval_instruct"],
+            # African language benchmarks
+            "afrimgsm": ["afrimgsm_amh_prompt_1", "afrimgsm_eng_prompt_1", "afrimgsm_fra_prompt_1", "afrimgsm_hau_prompt_1", "afrimgsm_ibo_prompt_1", "afrimgsm_kin_prompt_1", "afrimgsm_swa_prompt_1", "afrimgsm_yor_prompt_1"],
+            "afrimmlu": ["afrimmlu_direct_amh_prompt_1", "afrimmlu_direct_eng_prompt_1", "afrimmlu_direct_fra_prompt_1", "afrimmlu_direct_hau_prompt_1", "afrimmlu_direct_ibo_prompt_1", "afrimmlu_direct_kin_prompt_1", "afrimmlu_direct_swa_prompt_1", "afrimmlu_direct_yor_prompt_1"],
         }
 
         # Check if task is explicitly disabled
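The expansion table maps a group benchmark name to the explicit lm-eval subtasks it fans out to. A toy resolver over a trimmed copy of that table; the names below are hypothetical, not the loader's internals:

```python
# Trimmed illustrative copy of the expansion table above.
GROUP_EXPANSIONS = {
    "afrimgsm": ["afrimgsm_amh_prompt_1", "afrimgsm_eng_prompt_1"],  # trimmed
    "storycloze": ["xstorycloze_en"],
}

def resolve_subtasks(task_name: str) -> list[str]:
    # A group name fans out to explicit lm-eval subtask names;
    # anything else is treated as a single task.
    return GROUP_EXPANSIONS.get(task_name, [task_name])

print(resolve_subtasks("afrimgsm"))    # fans out to per-language prompts
print(resolve_subtasks("winogrande"))  # ['winogrande']
```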
@@ -0,0 +1,133 @@
+"""
+APPS (Automated Programming Progress Standard) Evaluator.
+
+Script workflow:
+    1. code = APPSEvaluator.extract_code_from_json(raw_response)
+    2. code = AppsExtractor.prepend_imports(code)
+    3. test_code, _ = AppsExtractor.build_test_code(input_output)
+    4. result = evaluator.evaluate(code, expected=None, test_code=test_code)
+
+Dataset: codeparrot/apps (10,000 Python coding problems)
+Paper: https://arxiv.org/abs/2105.09938
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from typing import Any
+
+from wisent.core.evaluators.core.atoms import BaseEvaluator, EvalResult
+from wisent.core.evaluators.benchmark_specific.coding.metrics.evaluator import CodingEvaluator
+
+# Import shared utilities from extractor
+from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.apps import AppsExtractor
+
+
+class APPSEvaluator(BaseEvaluator):
+    """Evaluator for APPS benchmark. Delegates to CodingEvaluator."""
+
+    name = "apps"
+    description = "APPS coding benchmark evaluator"
+
+    def __init__(self):
+        self._coding_evaluator = CodingEvaluator()
+
+    # --- Static helper methods for scripts ---
+    # Reuse from AppsExtractor
+    prepend_imports = staticmethod(AppsExtractor._prepend_imports)
+    build_test_code = staticmethod(AppsExtractor._build_test_code_from_io)
+
+    @staticmethod
+    def get_prompt(
+        question: str,
+        starter_code: str | None = None,
+        fn_name: str | None = None,
+    ) -> str:
+        """Generate prompt for the model.
+
+        Args:
+            question: Problem description
+            starter_code: Optional starter code template
+            fn_name: If provided, this is a call-based (LeetCode) problem
+
+        Returns:
+            Formatted prompt string
+        """
+        prompt = "You are an expert Python programmer. Solve the following coding problem.\n\n"
+        prompt += f"Problem:\n{question}\n\n"
+
+        if starter_code and starter_code.strip():
+            prompt += f"Starter code (you must use this):\n```python\n{starter_code}\n```\n\n"
+
+        prompt += 'Output your solution as a JSON object: {"code": "your_python_code_here"}\n'
+
+        if fn_name:
+            prompt += "Implement the Solution class with the required method.\n"
+        else:
+            prompt += "Your code should read from stdin and write to stdout.\n"
+
+        prompt += "\nRespond with ONLY the JSON object, no other text."
+        return prompt
+
+    @staticmethod
+    def extract_code_from_json(response: str) -> str | None:
+        """Extract code from JSON response."""
+        # Try direct JSON parse
+        try:
+            data = json.loads(response.strip())
+            if isinstance(data, dict) and "code" in data:
+                return data["code"]
+        except json.JSONDecodeError:
+            pass
+
+        # Try regex for multiline JSON
+        match = re.search(r'\{\s*"code"\s*:\s*"((?:[^"\\]|\\.)*)"\s*\}', response, re.DOTALL)
+        if match:
+            try:
+                data = json.loads(match.group(0))
+                if "code" in data:
+                    return data["code"]
+            except json.JSONDecodeError:
+                pass
+
+        # Fallback: code blocks
+        for pattern in [r'```python\n(.*?)\n```', r'```\n(.*?)\n```']:
+            match = re.search(pattern, response, re.DOTALL)
+            if match:
+                return match.group(1)
+
+        return None
+
+    # --- Evaluator interface ---
+
+    def evaluate(self, response: str, expected: Any, **kwargs) -> EvalResult:
+        """Evaluate code against test cases.
+
+        Args:
+            response: Already-extracted and prepared Python code
+            expected: Not used
+            **kwargs:
+                test_code: Already-generated test code
+
+        Returns:
+            EvalResult from CodingEvaluator
+        """
+        test_code = kwargs.get("test_code")
+
+        if not test_code:
+            return EvalResult(
+                ground_truth="UNKNOWN",
+                method_used=self.name,
+                confidence=0.0,
+                details="No test_code provided",
+            )
+
+        return self._coding_evaluator.evaluate(
+            response=response,
+            expected=None,
+            test_code=test_code,
+            entry_point=None,
+            task_name="apps",
+            language="python",
+        )
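The module docstring's four-step workflow, end to end. The import path and the model response below are placeholders, and `input_output` mirrors the codeparrot/apps test-case shape:

```python
# Import path assumed; the diff does not show where the new file lives.
from wisent.core.evaluators.benchmark_specific.apps import APPSEvaluator

raw_response = '{"code": "n = int(input())\\nprint(n * 2)"}'  # stand-in model output
input_output = {"inputs": ["3\n"], "outputs": ["6\n"]}        # apps-style test cases

evaluator = APPSEvaluator()
code = APPSEvaluator.extract_code_from_json(raw_response)   # 1. parse the JSON reply
code = APPSEvaluator.prepend_imports(code)                  # 2. add common imports
test_code, _ = APPSEvaluator.build_test_code(input_output)  # 3. build stdin/stdout tests
result = evaluator.evaluate(code, expected=None, test_code=test_code)  # 4. run the tests
print(result.ground_truth)
```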
@@ -223,11 +223,16 @@ if __name__ == "__main__":
         try:
             from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Job
 
-            # Set limits based on task
+            # Set limits based on task or kwargs
+            timeout_override = kwargs.get('timeout')
             if 'ds1000' in task_name.lower() or 'ds_1000' in task_name.lower():
                 cpu_limit_s = 60
                 wall_timeout_s = 120
                 nproc = 512
+            elif timeout_override:
+                cpu_limit_s = timeout_override
+                wall_timeout_s = timeout_override
+                nproc = 64
             else:
                 cpu_limit_s = self.cfg.cpu_limit_s
                 wall_timeout_s = self.cfg.time_limit_s
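A standalone restatement of the limit-selection order this hunk introduces: DS-1000 keeps its fixed budget, an explicit `timeout` kwarg comes next, and the configured defaults remain the fallback. Names here are illustrative, not the sandbox's real API:

```python
def pick_limits(task_name: str, cfg_cpu_s: int, cfg_wall_s: int, cfg_nproc: int, **kwargs):
    # Returns (cpu_limit_s, wall_timeout_s, nproc) in the same priority order
    # as the hunk above.
    timeout_override = kwargs.get("timeout")
    if "ds1000" in task_name.lower() or "ds_1000" in task_name.lower():
        return 60, 120, 512
    if timeout_override:
        return timeout_override, timeout_override, 64
    return cfg_cpu_s, cfg_wall_s, cfg_nproc

print(pick_limits("mbpp", 10, 20, 128, timeout=30))  # (30, 30, 64)
print(pick_limits("ds1000", 10, 20, 128))            # (60, 120, 512)
```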
@@ -12,13 +12,12 @@ Wang Ling et al., "Latent Predictor Networks for Code Generation" (2016)
 """
 
 import logging
-import math
 import re
-from collections import Counter
 from typing import Any
 
+import evaluate
+
 from wisent.core.evaluators.core.atoms import BaseEvaluator, EvalResult
-from wisent.core.evaluators.benchmark_specific.math_parsing.extract_boxed import extract_boxed_answer
 
 logger = logging.getLogger(__name__)
 
@@ -46,134 +45,6 @@ def tokenize_for_bleu_eval(code: str) -> list[str]:
     return tokens
 
 
-def _get_ngrams(segment: list[str], max_order: int) -> Counter:
-    """Extract n-grams up to max_order from a token list.
-
-    Args:
-        segment: List of tokens
-        max_order: Maximum n-gram order
-
-    Returns:
-        Counter of n-gram frequencies
-    """
-    ngram_counts = Counter()
-    for order in range(1, max_order + 1):
-        for i in range(len(segment) - order + 1):
-            ngram = tuple(segment[i:i + order])
-            ngram_counts[ngram] += 1
-    return ngram_counts
-
-
-def compute_bleu(
-    references: list[list[str]],
-    hypotheses: list[list[str]],
-    max_order: int = 4,
-    smooth: bool = False,
-) -> tuple[float, list[float], float, float, int, int]:
-    """Compute corpus-level BLEU score.
-
-    Implementation follows the CoNaLa baseline. CoNaLa has one reference per example.
-
-    Args:
-        references: List of reference token lists (one per example).
-        hypotheses: List of hypothesis token lists (one per example).
-        max_order: Maximum n-gram order to use (default 4).
-        smooth: Whether to apply Lin smoothing (default False for CoNaLa).
-
-    Returns:
-        Tuple of:
-        - BLEU score (0.0 to 1.0)
-        - List of n-gram precisions
-        - Brevity penalty
-        - Length ratio (hypothesis/reference)
-        - Hypothesis length
-        - Reference length
-    """
-    matches_by_order = [0] * max_order
-    possible_matches_by_order = [0] * max_order
-    reference_length = 0
-    hypothesis_length = 0
-
-    for reference, hypothesis in zip(references, hypotheses):
-        reference_length += len(reference)
-        hypothesis_length += len(hypothesis)
-
-        # Get n-grams
-        ref_ngrams = _get_ngrams(reference, max_order)
-        hyp_ngrams = _get_ngrams(hypothesis, max_order)
-
-        # Count matches (clipped to reference count)
-        overlap = hyp_ngrams & ref_ngrams
-        for ngram, count in overlap.items():
-            matches_by_order[len(ngram) - 1] += count
-
-        # Count possible matches
-        for order in range(1, max_order + 1):
-            possible_matches = len(hypothesis) - order + 1
-            if possible_matches > 0:
-                possible_matches_by_order[order - 1] += possible_matches
-
-    # Compute precisions
-    precisions = [0.0] * max_order
-    for i in range(max_order):
-        if smooth:
-            precisions[i] = (matches_by_order[i] + 1.0) / (possible_matches_by_order[i] + 1.0)
-        else:
-            if possible_matches_by_order[i] > 0:
-                precisions[i] = matches_by_order[i] / possible_matches_by_order[i]
-            else:
-                precisions[i] = 0.0
-
-    # Compute geometric mean of precisions
-    if min(precisions) > 0:
-        p_log_sum = sum((1.0 / max_order) * math.log(p) for p in precisions)
-        geo_mean = math.exp(p_log_sum)
-    else:
-        geo_mean = 0.0
-
-    # Compute brevity penalty
-    ratio = hypothesis_length / reference_length if reference_length > 0 else 0.0
-    if ratio > 1.0:
-        bp = 1.0
-    elif ratio == 0.0:
-        bp = 0.0
-    else:
-        bp = math.exp(1.0 - 1.0 / ratio)
-
-    bleu = geo_mean * bp
-
-    return (bleu, precisions, bp, ratio, hypothesis_length, reference_length)
-
-
-def compute_bleu_single(
-    reference: str,
-    hypothesis: str,
-    max_order: int = 4,
-    smooth: bool = False,
-) -> float:
-    """Compute BLEU score for a single reference-hypothesis pair.
-
-    Args:
-        reference: The reference code string
-        hypothesis: The generated code string
-        max_order: Maximum n-gram order (default 4)
-        smooth: Whether to apply smoothing (default False)
-
-    Returns:
-        BLEU score (0.0 to 1.0)
-    """
-    ref_tokens = tokenize_for_bleu_eval(reference)
-    hyp_tokens = tokenize_for_bleu_eval(hypothesis)
-
-    bleu, _, _, _, _, _ = compute_bleu(
-        references=[ref_tokens],
-        hypotheses=[hyp_tokens],
-        max_order=max_order,
-        smooth=smooth,
-    )
-    return bleu
-
-
 class CoNaLaEvaluator(BaseEvaluator):
     """Evaluator for CoNaLa code generation benchmark.
 
@@ -198,12 +69,12 @@ class CoNaLaEvaluator(BaseEvaluator):
         """
         self.bleu_threshold = bleu_threshold
         self.max_order = max_order
+        self.bleu_metric = evaluate.load("bleu")
 
     @staticmethod
     def get_prompt(
         intent: str,
         rewritten_intent: str | None = None,
-        examples: list[tuple[str, str]] | None = None,
     ) -> str:
         """Create instruction prompt for LLM to generate Python code.
 
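With `evaluate.load("bleu")` the module keeps its CoNaLa tokenization by passing the function as the `tokenizer` callable, and `compute()` returns the same statistics the removed hand-rolled helpers produced. A minimal sketch; the whitespace tokenizer below is a stand-in for the module's real `tokenize_for_bleu_eval`:

```python
import evaluate

def tokenize_for_bleu_eval(code: str) -> list[str]:
    # Whitespace stand-in for the module's real tokenizer, just for this sketch.
    return code.split()

bleu = evaluate.load("bleu")
result = bleu.compute(
    predictions=["df . head ( n )"],
    references=[["df . head ( n )"]],  # one list of references per prediction
    tokenizer=tokenize_for_bleu_eval,
    max_order=4,
    smooth=False,
)
# Result keys include bleu, precisions, brevity_penalty, and length_ratio.
print(result["bleu"], result["brevity_penalty"])  # 1.0 1.0 for an exact match
```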
@@ -217,15 +88,7 @@ class CoNaLaEvaluator(BaseEvaluator):
         """
         nl_intent = rewritten_intent if rewritten_intent else intent
 
-        prompt = "Generate Python code for the following task. Put final answer
-
-
-        # Add few-shot examples if provided
-        if examples:
-            for ex_intent, ex_snippet in examples:
-                prompt += f"\nTask: {ex_intent}\n\\boxed{{{ex_snippet}}}\n"
-
-
+        prompt = "Generate Python code for the following task. Put final answer, in \\boxed{}."
 
         prompt += f"\nTask: {nl_intent}\n"
 
@@ -257,17 +120,15 @@
         expected_str = str(expected).strip()
         response_str = response.strip()
 
-        #
-
-
-
-
-        bleu_score = compute_bleu_single(
-            reference=expected_str,
-            hypothesis=response_str,
+        # Compute BLEU score using HuggingFace evaluate
+        result = self.bleu_metric.compute(
+            predictions=[response_str],
+            references=[[expected_str]],
+            tokenizer=tokenize_for_bleu_eval,
             max_order=self.max_order,
             smooth=False,
         )
+        bleu_score = result["bleu"]
 
         # Determine truthfulness based on BLEU threshold only
         is_correct = bleu_score >= self.bleu_threshold
@@ -279,8 +140,6 @@
             details=f"BLEU: {bleu_score:.4f}",
             meta={
                 "bleu_score": bleu_score,
-                "expected_tokens": ref_tokens,
-                "response_tokens": hyp_tokens,
                 "bleu_threshold": self.bleu_threshold,
             }
         )
@@ -304,29 +163,33 @@
         if len(responses) != len(expected_answers):
             raise ValueError("Number of responses must match number of expected answers")
 
-        #
-
-
-
-
-
-
-
-
-
+        # Prepare predictions and references for HF BLEU
+        predictions = [r.strip() if r else "" for r in responses]
+        references = [[str(e).strip()] for e in expected_answers]
+
+        # Check if all predictions are empty (would cause division by zero)
+        if all(not p for p in predictions):
+            return {
+                "bleu_score": 0.0,
+                "total": len(responses),
+                "brevity_penalty": 0.0,
+                "length_ratio": 0.0,
+                "precisions": [0.0] * self.max_order,
+            }
 
-        # Compute corpus BLEU
-
+        # Compute corpus BLEU using HuggingFace evaluate
+        result = self.bleu_metric.compute(
+            predictions=predictions,
             references=references,
-
+            tokenizer=tokenize_for_bleu_eval,
             max_order=self.max_order,
             smooth=False,
         )
 
         return {
-            "bleu_score": bleu * 100,  # Convert to percentage like leaderboard
+            "bleu_score": result["bleu"] * 100,  # Convert to percentage like leaderboard
             "total": len(responses),
-            "brevity_penalty": bp,
-            "length_ratio": ratio,
-            "precisions": precisions,
+            "brevity_penalty": result["brevity_penalty"],
+            "length_ratio": result["length_ratio"],
+            "precisions": result["precisions"],
         }
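The all-empty guard exists because the corpus BLEU brevity penalty divides by the hypothesis/reference length ratio, which is zero when every prediction is empty, so `compute()` would raise. A sketch of the failure mode being avoided:

```python
import evaluate

bleu = evaluate.load("bleu")

predictions = ["", ""]  # e.g., the model produced nothing parseable
references = [["print(1)"], ["print(2)"]]

# With zero hypothesis tokens the brevity penalty divides by a zero length
# ratio, so guard before calling compute(), mirroring the early return above.
if all(not p for p in predictions):
    result = {"bleu": 0.0, "brevity_penalty": 0.0, "length_ratio": 0.0}
else:
    result = bleu.compute(predictions=predictions, references=references)
print(result["bleu"])
```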