PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl - Mend

wisent 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (725) hide show

wisent/core/contrastive_pairs/diagnostics/linearity.py CHANGED Viewed

@@ -8,8 +8,7 @@ from enum import Enum
 import torch
-from wisent.core.activations.core.atoms import ActivationAggregationStrategy
-from wisent.core.activations.prompt_construction_strategy import PromptConstructionStrategy
+from wisent.core.activations.extraction_strategy import ExtractionStrategy
 class LinearityVerdict(Enum):
@@ -35,11 +34,8 @@ class LinearityConfig:
     layers_to_test: Optional[List[int]] = None
     """Specific layers to test. If None, tests sample across depth."""
-    aggregation_strategies: Optional[List[ActivationAggregationStrategy]] = None
-    """Aggregation strategies to test. If None, tests all."""
-    prompt_strategies: Optional[List[PromptConstructionStrategy]] = None
-    """Prompt strategies to test. If None, tests all."""
+    extraction_strategies: Optional[List[ExtractionStrategy]] = None
+    """Extraction strategies to test. If None, tests default set."""
     normalize_options: List[bool] = field(default_factory=lambda: [False, True])
     """Normalization options to test."""
@@ -128,24 +124,15 @@ def check_linearity(
     else:
         layers_to_test = cfg.layers_to_test
-    # Determine aggregation strategies
-    if cfg.aggregation_strategies is None:
-        aggregation_strategies = [
-            ActivationAggregationStrategy.LAST_TOKEN,
-            ActivationAggregationStrategy.MEAN_POOLING,
-            ActivationAggregationStrategy.MAX_POOLING,
-        ]
-    else:
-        aggregation_strategies = cfg.aggregation_strategies
-    # Determine prompt strategies
-    if cfg.prompt_strategies is None:
-        prompt_strategies = [
-            PromptConstructionStrategy.CHAT_TEMPLATE,
-            PromptConstructionStrategy.DIRECT_COMPLETION,
+    # Determine extraction strategies
+    if cfg.extraction_strategies is None:
+        extraction_strategies = [
+            ExtractionStrategy.CHAT_LAST,
+            ExtractionStrategy.CHAT_MEAN,
+            ExtractionStrategy.CHAT_MAX_NORM,
         ]
     else:
-        prompt_strategies = cfg.prompt_strategies
+        extraction_strategies = cfg.extraction_strategies
     # Limit pairs
     test_pairs = pairs[:cfg.max_pairs]
@@ -157,62 +144,59 @@ def check_linearity(
     all_results = []
-    for prompt_strategy in prompt_strategies:
-        for agg_strategy in aggregation_strategies:
-            for normalize in cfg.normalize_options:
-                # Collect activations
-                pos_activations = {l: [] for l in layers_to_test}
-                neg_activations = {l: [] for l in layers_to_test}
-                for pair in test_pairs:
-                    try:
-                        pair_with_acts = collector.collect_for_pair(
-                            pair,
-                            layers=[str(l) for l in layers_to_test],
-                            aggregation=agg_strategy,
-                            normalize_layers=normalize,
-                            prompt_strategy=prompt_strategy,
-                        )
-                        pos_la = pair_with_acts.positive_response.layers_activations
-                        neg_la = pair_with_acts.negative_response.layers_activations
-                        if pos_la and neg_la:
-                            for layer in layers_to_test:
-                                pos_t = pos_la.get(str(layer))
-                                neg_t = neg_la.get(str(layer))
-                                if pos_t is not None and neg_t is not None:
-                                    pos_activations[layer].append(pos_t.flatten().cpu())
-                                    neg_activations[layer].append(neg_t.flatten().cpu())
-                    except Exception:
-                        continue
-                # Analyze each layer
-                for layer in layers_to_test:
-                    pos_list = pos_activations[layer]
-                    neg_list = neg_activations[layer]
-                    if len(pos_list) < 10 or len(neg_list) < 10:
-                        continue
+    for strategy in extraction_strategies:
+        for normalize in cfg.normalize_options:
+            # Collect activations
+            pos_activations = {l: [] for l in layers_to_test}
+            neg_activations = {l: [] for l in layers_to_test}
+            for pair in test_pairs:
+                try:
+                    pair_with_acts = collector.collect(
+                        pair,
+                        strategy=strategy,
+                        layers=[str(l) for l in layers_to_test],
+                        normalize=normalize,
+                    )
-                    pos_tensor = torch.stack(pos_list)
-                    neg_tensor = torch.stack(neg_list)
+                    pos_la = pair_with_acts.positive_response.layers_activations
+                    neg_la = pair_with_acts.negative_response.layers_activations
-                    result = detect_geometry_structure(pos_tensor, neg_tensor, geo_config)
-                    linear_score = result.all_scores["linear"].score
-                    linear_details = result.all_scores["linear"].details
-                    all_results.append({
-                        "prompt_strategy": prompt_strategy.name,
-                        "aggregation": agg_strategy.name,
-                        "normalize": normalize,
-                        "layer": layer,
-                        "linear_score": linear_score,
-                        "cohens_d": linear_details.get("cohens_d", 0),
-                        "variance_explained": linear_details.get("variance_explained", 0),
-                        "best_structure": result.best_structure.value,
-                    })
+                    if pos_la and neg_la:
+                        for layer in layers_to_test:
+                            pos_t = pos_la.get(str(layer))
+                            neg_t = neg_la.get(str(layer))
+                            if pos_t is not None and neg_t is not None:
+                                pos_activations[layer].append(pos_t.flatten().cpu())
+                                neg_activations[layer].append(neg_t.flatten().cpu())
+                except Exception:
+                    continue
+            # Analyze each layer
+            for layer in layers_to_test:
+                pos_list = pos_activations[layer]
+                neg_list = neg_activations[layer]
+                if len(pos_list) < 10 or len(neg_list) < 10:
+                    continue
+                pos_tensor = torch.stack(pos_list)
+                neg_tensor = torch.stack(neg_list)
+                result = detect_geometry_structure(pos_tensor, neg_tensor, geo_config)
+                linear_score = result.all_scores["linear"].score
+                linear_details = result.all_scores["linear"].details
+                all_results.append({
+                    "extraction_strategy": strategy.value,
+                    "normalize": normalize,
+                    "layer": layer,
+                    "linear_score": linear_score,
+                    "cohens_d": linear_details.get("cohens_d", 0),
+                    "variance_explained": linear_details.get("variance_explained", 0),
+                    "best_structure": result.best_structure.value,
+                })
     if not all_results:
         return LinearityResult(
@@ -234,7 +218,7 @@ def check_linearity(
         verdict = LinearityVerdict.LINEAR
         recommendation = (
             f"Use CAA (single-direction steering) on layer {best['layer']} "
-            f"with {best['prompt_strategy']} prompt and {best['aggregation']} aggregation."
+            f"with {best['extraction_strategy']} strategy."
         )
     elif best["linear_score"] >= cfg.weak_threshold and best["cohens_d"] >= cfg.min_cohens_d:
         verdict = LinearityVerdict.WEAKLY_LINEAR
@@ -254,8 +238,7 @@ def check_linearity(
         verdict=verdict,
         best_linear_score=best["linear_score"],
         best_config={
-            "prompt_strategy": best["prompt_strategy"],
-            "aggregation": best["aggregation"],
+            "extraction_strategy": best["extraction_strategy"],
             "normalize": best["normalize"],
         },
         best_layer=best["layer"],

wisent/core/contrastive_pairs/diagnostics/vector_quality.py CHANGED Viewed

@@ -281,7 +281,8 @@ def _compute_pca(
         n_components = min(5, n - 1)
         pca = PCA(n_components=n_components)
-        pca.fit(difference_vectors.numpy())
+        # Convert to float32 for sklearn compatibility (BFloat16 not supported)
+        pca.fit(difference_vectors.float().numpy())
         pc1_var = pca.explained_variance_ratio_[0]
         pc2_var = pca.explained_variance_ratio_[1] if n_components > 1 else 0.0
@@ -372,7 +373,7 @@ def _compute_clustering(
     try:
         from sklearn.metrics import silhouette_score
-        all_activations = torch.cat([positive_activations, negative_activations], dim=0).numpy()
+        all_activations = torch.cat([positive_activations, negative_activations], dim=0).float().numpy()
         labels = [0] * n_pos + [1] * n_neg
         silhouette = silhouette_score(all_activations, labels)
@@ -436,7 +437,7 @@ def _compute_cv_classification(
         from sklearn.linear_model import LogisticRegression
         from sklearn.model_selection import cross_val_score
-        X = torch.cat([positive_activations, negative_activations], dim=0).numpy()
+        X = torch.cat([positive_activations, negative_activations], dim=0).float().numpy()
         y = np.array([1] * n_pos + [0] * n_neg)
         n_folds = min(config.cv_folds, min(n_pos, n_neg))
@@ -473,8 +474,8 @@ def _compute_cohens_d(
     direction = direction / direction_norm
     # Project all activations onto this direction
-    pos_proj = (positive_activations @ direction).numpy()
-    neg_proj = (negative_activations @ direction).numpy()
+    pos_proj = (positive_activations @ direction).float().numpy()
+    neg_proj = (negative_activations @ direction).float().numpy()
     # Cohen's d = (mean1 - mean2) / pooled_std
     mean_diff = pos_proj.mean() - neg_proj.mean()

wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py CHANGED Viewed

@@ -133,14 +133,11 @@ EXTRACTORS: dict[str, str] = {
     # Coding benchmarks
     "humaneval": f"{base_import}humaneval:HumanEvalExtractor",
-    "humaneval_plus": f"{base_import}humaneval:HumanEvalExtractor",
-    "humaneval_64_instruct": f"{base_import}instructhumaneval:InstructHumanEvalExtractor",
-    "humaneval_instruct": f"{base_import}instructhumaneval:InstructHumanEvalExtractor",
-    "humanevalpack": f"{base_import}humaneval:HumanEvalExtractor",
-    "instructhumaneval": f"{base_import}instructhumaneval:InstructHumanEvalExtractor",
-    "mbpp": f"{base_import}mbpp:MBPPExtractor",
-    "mbpp_plus": f"{base_import}mbpp:MBPPExtractor",
-    "instruct_humaneval": f"{base_import}instructhumaneval:InstructHumanEvalExtractor",
+    "humaneval_64": f"{base_import}humaneval:HumanEval64Extractor",
+    "humaneval_plus": f"{base_import}humaneval:HumanEvalPlusExtractor",
+    "humaneval_instruct": f"{base_import}humaneval:HumanEvalInstructExtractor",
+    "humaneval_64_instruct": f"{base_import}humaneval:HumanEval64InstructExtractor",
+    "humanevalpack": f"{base_import}humanevalpack:HumanevalpackExtractor",
     "apps": f"{base_import}apps:AppsExtractor",
     "conala": f"{base_import}conala:ConalaExtractor",
     "concode": f"{base_import}concode:ConcodeExtractor",
@@ -156,13 +153,6 @@ EXTRACTORS: dict[str, str] = {
     "multiple_rs": f"{base_import}multipl_e:MultiplEExtractor",
     "multiple_go": f"{base_import}multipl_e:MultiplEExtractor",
     "codexglue": f"{base_import}codexglue:CodexglueExtractor",
-    "code_x_glue": f"{base_import}codexglue:CodexglueExtractor",
-    "codexglue_code_to_text_python": f"{base_import}codexglue:CodexglueExtractor",
-    "codexglue_code_to_text_go": f"{base_import}codexglue:CodexglueExtractor",
-    "codexglue_code_to_text_ruby": f"{base_import}codexglue:CodexglueExtractor",
-    "codexglue_code_to_text_java": f"{base_import}codexglue:CodexglueExtractor",
-    "codexglue_code_to_text_javascript": f"{base_import}codexglue:CodexglueExtractor",
-    "codexglue_code_to_text_php": f"{base_import}codexglue:CodexglueExtractor",
     "livecodebench": f"{base_import}livecodebench:LivecodebenchExtractor",
     # Reasoning benchmarks
@@ -203,7 +193,6 @@ EXTRACTORS: dict[str, str] = {
     "ds1000": f"{base_import}ds1000:Ds1000Extractor",
     "evalita_mp": f"{base_import}evalita_mp:EvalitaMpExtractor",
     "flores": f"{base_import}flores:FloresExtractor",
-    "freebase": f"{base_import}freebase:FreebaseExtractor",
     "humanevalpack": f"{base_import}humanevalpack:HumanevalpackExtractor",
     "iwslt2017_ar_en": f"{base_import}iwslt2017_ar_en:Iwslt2017ArEnExtractor",
     "iwslt2017_en_ar": f"{base_import}iwslt2017_en_ar:Iwslt2017EnArExtractor",
@@ -229,11 +218,8 @@ EXTRACTORS: dict[str, str] = {
     "flan_held_in": f"{base_import}flan_held_in:FlanHeldInExtractor",
     "gpt3_translation_benchmarks": f"{base_import}gpt3_translation_benchmarks:Gpt3TranslationBenchmarksExtractor",
     "multiple_choice": f"{base_import}multiple_choice:MultipleChoiceExtractor",
-    "non_greedy_robustness_agieval_aqua_rat": f"{base_import}non_greedy_robustness_agieval_aqua_rat:NonGreedyRobustnessAgievalAquaRatExtractor",
-    "option_order_robustness_agieval_aqua_rat": f"{base_import}option_order_robustness_agieval_aqua_rat:OptionOrderRobustnessAgievalAquaRatExtractor",
     "penn_treebank": f"{base_import}penn_treebank:PennTreebankExtractor",
     "ptb": f"{base_import}penn_treebank:PennTreebankExtractor",
-    "prompt_robustness_agieval_aqua_rat": f"{base_import}prompt_robustness_agieval_aqua_rat:PromptRobustnessAgievalAquaRatExtractor",
     "self_consistency": f"{base_import}self_consistency:SelfConsistencyExtractor",
     "t0_eval": f"{base_import}t0_eval:T0EvalExtractor",
     "vaxx_stance": f"{base_import}vaxx_stance:VaxxStanceExtractor",

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py CHANGED Viewed

@@ -8,12 +8,16 @@ from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.concode
 from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.ds_1000 import Ds1000Extractor
 from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.hle import HleExtractor
 from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.hmmt import HMMTExtractor
-from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.humaneval import HumanEvalExtractor
-from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.instructhumaneval import InstructHumanEvalExtractor
+from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.humaneval import (
+    HumanEvalExtractor,
+    HumanEval64Extractor,
+    HumanEvalPlusExtractor,
+    HumanEvalInstructExtractor,
+    HumanEval64InstructExtractor,
+)
 from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.livecodebench import LivecodebenchExtractor
 from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.livemathbench import LiveMathBenchExtractor
 from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.math500 import MATH500Extractor
-from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.mbpp import MBPPExtractor
 from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.mercury import MercuryExtractor
 from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.multipl_e import MultiplEExtractor
 from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.polymath import PolyMathExtractor
@@ -30,11 +34,13 @@ __all__ = [
     "HleExtractor",
     "HMMTExtractor",
     "HumanEvalExtractor",
-    "InstructHumanEvalExtractor",
+    "HumanEval64Extractor",
+    "HumanEvalPlusExtractor",
+    "HumanEvalInstructExtractor",
+    "HumanEval64InstructExtractor",
     "LivecodebenchExtractor",
     "LiveMathBenchExtractor",
     "MATH500Extractor",
-    "MBPPExtractor",
     "MercuryExtractor",
     "MultiplEExtractor",
     "PolyMathExtractor",

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py CHANGED Viewed

@@ -1,8 +1,11 @@
 from __future__ import annotations
+import json
+import random
+import re
 from typing import Any
 from wisent.core.cli_logger import setup_logger
-import json
 from wisent.core.contrastive_pairs.core.pair import ContrastivePair
 from wisent.core.contrastive_pairs.huggingface_pairs.atoms import HuggingFaceBenchmarkExtractor
@@ -88,6 +91,9 @@ class AppsExtractor(HuggingFaceBenchmarkExtractor):
                 log.debug(f"Could not parse solutions array: {e}")
                 return None
+            # Prepend common imports (APPS solutions assume LeetCode-style environment)
+            correct_answer = self._prepend_imports(correct_answer)
             # Create incorrect answer (modify or corrupt)
             incorrect_answer = self._create_incorrect_answer(correct_answer)
@@ -96,10 +102,11 @@ class AppsExtractor(HuggingFaceBenchmarkExtractor):
             # Parse input_output JSON to create test code
             test_code = None
+            entry_point = None
             if input_output:
                 try:
                     io_data = json.loads(input_output) if isinstance(input_output, str) else input_output
-                    test_code = self._build_test_code_from_io(io_data)
+                    test_code, entry_point = self._build_test_code_from_io(io_data)
                 except (json.JSONDecodeError, TypeError) as e:
                     log.debug(f"Could not parse input_output: {e}")
@@ -107,6 +114,8 @@ class AppsExtractor(HuggingFaceBenchmarkExtractor):
                 "label": "apps",
                 "source": "codeparrot/apps",
                 "test_code": test_code,
+                "entry_point": entry_point,
+                "language": "python",
             }
             return self._build_pair(
@@ -120,29 +129,82 @@ class AppsExtractor(HuggingFaceBenchmarkExtractor):
             log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
             return None
-    def _build_test_code_from_io(self, io_data: dict) -> str:
+    @staticmethod
+    def _build_test_code_from_io(io_data: dict) -> tuple[str, str | None]:
         """Build test code from input/output data.
-        APPS solutions are script-style (stdin/stdout), not functions.
-        We use subprocess to run solution.py with the input.
+        APPS has two types of problems:
+        1. stdin/stdout: No fn_name, run via subprocess
+        2. call-based: Has fn_name, import and call Solution().fn_name()
+        Returns:
+            Tuple of (test_code, entry_point)
         """
         inputs = io_data.get("inputs", [])
         outputs = io_data.get("outputs", [])
-        if not inputs or not outputs:
-            return None
+        fn_name = io_data.get("fn_name")
-        # Build test code that runs solution.py as a subprocess
-        # Include normalize function to handle whitespace differences in APPS dataset
-        test_code = '''import subprocess
+        if not inputs or not outputs:
+            return None, None
+        if fn_name:
+            return AppsExtractor._build_call_based_test_code(inputs, outputs, fn_name)
+        else:
+            return AppsExtractor._build_stdin_test_code(inputs, outputs)
+    @staticmethod
+    def _build_call_based_test_code(
+        inputs: list, outputs: list, fn_name: str
+    ) -> tuple[str, None]:
+        """Build test code for call-based (LeetCode-style) problems."""
+        total = len(inputs)
+        test_code = f'''import sys
+from solution import Solution
+from typing import List, Optional, Dict, Tuple, Set, Any
+def compare_outputs(actual, expected):
+    """Compare outputs, handling floating point and nested structures."""
+    if isinstance(expected, float) and isinstance(actual, float):
+        return abs(actual - expected) < 1e-6
+    if isinstance(expected, list) and isinstance(actual, list):
+        if len(expected) != len(actual):
+            return False
+        return all(compare_outputs(a, e) for a, e in zip(actual, expected))
+    return actual == expected
+if __name__ == '__main__':
+    sol = Solution()
+    passed = 0
+    total = {total}
+'''
+        for i, (inp, out) in enumerate(zip(inputs, outputs)):
+            # inp is typically a list of arguments
+            if isinstance(inp, list):
+                args_repr = ", ".join(repr(arg) for arg in inp)
+            else:
+                args_repr = repr(inp)
+            test_code += f"    # Test case {i+1}\n"
+            test_code += f"    try:\n"
+            test_code += f"        result = sol.{fn_name}({args_repr})\n"
+            test_code += f"        expected = {repr(out)}\n"
+            test_code += f"        if compare_outputs(result, expected):\n"
+            test_code += f"            passed += 1\n"
+            test_code += f"    except Exception:\n"
+            test_code += f"        pass\n\n"
+        test_code += "    print(f'PASSED:{passed}/{total}')\n"
+        test_code += "    sys.exit(0 if passed == total else 1)\n"
+        return test_code, None
+    @staticmethod
+    def _build_stdin_test_code(inputs: list, outputs: list) -> tuple[str, None]:
+        """Build test code for stdin/stdout style problems."""
+        total = len(inputs)
+        test_code = f'''import subprocess
 import sys
 def normalize_output(s):
-    """Normalize output by stripping trailing whitespace from each line.
-    APPS dataset has inconsistent trailing whitespace in expected outputs.
-    This normalizes both actual and expected to enable fair comparison.
-    """
+    """Normalize output by stripping trailing whitespace from each line."""
     lines = s.split('\\n')
     normalized = '\\n'.join(line.rstrip() for line in lines)
     return normalized.strip()
@@ -157,26 +219,78 @@ def run_solution(input_str):
         timeout=10
     )
     if result.returncode != 0:
-        raise RuntimeError(f"Solution failed: {result.stderr}")
+        raise RuntimeError(f"Solution failed: {{result.stderr}}")
     return result.stdout
+if __name__ == '__main__':
+    passed = 0
+    total = {total}
 '''
-        test_code += "if __name__ == '__main__':\n"
         for i, (inp, out) in enumerate(zip(inputs, outputs)):
             test_code += f"    # Test case {i+1}\n"
-            test_code += f"    result = run_solution({repr(inp)})\n"
-            test_code += f"    expected = {repr(out)}\n"
-            test_code += f"    assert normalize_output(result) == normalize_output(expected), f'Test {i+1} failed: expected {{repr(expected)}}, got {{repr(result)}}'\n\n"
-        test_code += "    print('All tests passed!')\n"
-        return test_code
+            test_code += f"    try:\n"
+            test_code += f"        result = run_solution({repr(inp)})\n"
+            test_code += f"        expected = {repr(out)}\n"
+            test_code += f"        if normalize_output(result) == normalize_output(expected):\n"
+            test_code += f"            passed += 1\n"
+            test_code += f"    except Exception:\n"
+            test_code += f"        pass\n\n"
+        test_code += "    print(f'PASSED:{passed}/{total}')\n"
+        test_code += "    sys.exit(0 if passed == total else 1)\n"
+        return test_code, None
+    # Common imports for LeetCode-style solutions
+    COMMON_IMPORTS = """\
+from typing import List, Optional, Dict, Tuple, Set, Any
+import collections
+import heapq
+import itertools
+import functools
+import math
+import bisect
+from collections import defaultdict, Counter, deque
+"""
+    @staticmethod
+    def _prepend_imports(code: str) -> str:
+        """Prepend common imports to solution code.
+        APPS solutions assume LeetCode-style environment where
+        List, collections, heapq, etc. are pre-imported.
+        """
+        # Skip if code already has typing imports
+        if "from typing import" in code or "import typing" in code:
+            return code
+        return AppsExtractor.COMMON_IMPORTS + code
     def _create_incorrect_answer(self, correct: str) -> str:
-        """Create an incorrect answer by modifying the correct one."""
-        # For code, corrupt it slightly
-        if len(correct) > 10:
-            return correct[:len(correct)//2] + "# CORRUPTED" + correct[len(correct)//2:]
-        return f"{correct} # INCORRECT"
+        """Create an incorrect answer by shuffling letters in words.
+        This reliably breaks code by corrupting variable/function names,
+        causing NameError or SyntaxError.
+        """
+        def shuffle_word(word: str) -> str:
+            """Shuffle all letters in a word."""
+            if len(word) <= 2:
+                return word
+            letters = list(word)
+            random.shuffle(letters)
+            shuffled = ''.join(letters)
+            if shuffled == word:
+                return word[::-1]  # Reverse if shuffle didn't change
+            return shuffled
+        def replace_word(match: re.Match) -> str:
+            word = match.group(0)
+            return shuffle_word(word)
+        # Shuffle words with 3+ characters
+        result = re.sub(r'[A-Za-z]{3,}', replace_word, correct)
+        # If nothing changed (all short words), append syntax error
+        if result == correct:
+            result = correct + "\n!!SYNTAX_ERROR!!"
+        return result

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py CHANGED Viewed

@@ -20,7 +20,7 @@ class CodexglueExtractor(HuggingFaceBenchmarkExtractor):
         - code: str (code answer/solution)
     """
-    evaluator_name = "generation"  # Text similarity for code-to-text tasks
+    evaluator_name = "generation"
     def extract_contrastive_pairs(
         self,
@@ -82,7 +82,7 @@ class CodexglueExtractor(HuggingFaceBenchmarkExtractor):
             incorrect_answer = self._create_incorrect_answer(correct_answer)
             # Format the question
-            formatted_question = f"Question: {question}\n\nWhat is the answer?"
+            formatted_question = f"{question}\n\nGenerate code based on description:"
             metadata = {
                 "label": "codexglue",

wisent 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl

wisent 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl