PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl - Mend

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1020) hide show

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import requests
 from typing import Any
 from wisent.core.cli_logger import setup_logger
@@ -10,45 +11,32 @@ __all__ = ["AiderPolyglotExtractor"]
 log = setup_logger(__name__)
+# GitHub API base URL for Aider Polyglot benchmark
+AIDER_GITHUB_API = "https://api.github.com/repos/Aider-AI/polyglot-benchmark/contents"
 # Languages supported by Aider Polyglot benchmark
-AIDER_POLYGLOT_LANGUAGES = [
-    "python",
-    "javascript",
-    "java",
-    "cpp",
-    "go",
-    "rust",
-]
+AIDER_POLYGLOT_LANGUAGES = ["python", "javascript", "java", "cpp", "go", "rust"]
 class AiderPolyglotExtractor(HuggingFaceBenchmarkExtractor):
     """
-    Extractor for Aider Polyglot-style code editing benchmarks.
+    Extractor for Aider Polyglot benchmark.
+    GitHub: https://github.com/Aider-AI/polyglot-benchmark
     Aider's polyglot benchmark tests LLMs on 225 challenging Exercism coding
-    exercises across C++, Go, Java, JavaScript, Python, and Rust. This extractor
-    uses the jinaai/code_exercises dataset which provides similar code exercise
-    problems in Python.
+    exercises across C++, Go, Java, JavaScript, Python, and Rust.
-    The benchmark evaluates:
-    - Code generation from docstrings
-    - Code editing and completion
-    - Multi-turn correction (fixing failed attempts)
+    Structure per exercise:
+    - .docs/instructions.md - problem description
+    - .meta/example.py - reference solution
+    - {name}_test.py - test cases
     For code editing:
-    - Positive (correct) = Working solution that passes tests
+    - Positive (correct) = Working solution from .meta/example.py
     - Negative (incorrect) = Buggy or incomplete solution
-    Schema (jinaai/code_exercises):
-        - problem: str (function signature with docstring)
-        - solution: str (complete solution implementation)
-    Note: The original Aider Polyglot benchmark is hosted on GitHub at
-    github.com/Aider-AI/polyglot-benchmark. This extractor uses HuggingFace
-    alternatives with similar structure.
     """
-    # Evaluator that should be used for this benchmark
     evaluator_name = "code_editing"
     def __init__(self, language: str = "python"):
@@ -56,9 +44,11 @@ class AiderPolyglotExtractor(HuggingFaceBenchmarkExtractor):
         Initialize Aider Polyglot extractor.
         Args:
-            language: Target programming language (currently python supported)
+            language: Target programming language (python, javascript, java, cpp, go, rust)
         """
         super().__init__()
+        if language not in AIDER_POLYGLOT_LANGUAGES:
+            raise ValueError(f"Language must be one of {AIDER_POLYGLOT_LANGUAGES}")
         self.language = language
     def extract_contrastive_pairs(
@@ -66,48 +56,21 @@ class AiderPolyglotExtractor(HuggingFaceBenchmarkExtractor):
         limit: int | None = None,
     ) -> list[ContrastivePair]:
         """
-        Build contrastive pairs from code exercise examples.
-        For code editing:
-        - Positive (correct) = Working solution
-        - Negative (incorrect) = Buggy or incomplete solution
-        Args:
-            limit: Optional maximum number of pairs to produce.
-        Returns:
-            A list of ContrastivePair objects.
+        Build contrastive pairs from Aider Polyglot GitHub repository.
         """
         max_items = self._normalize_limit(limit)
+        pairs: list[ContrastivePair] = []
-        # Try primary dataset
-        try:
-            docs = self.load_dataset(
-                dataset_name="jinaai/code_exercises",
-                split="train",
-                limit=max_items,
-            )
-            dataset_source = "jinaai/code_exercises"
-            log.info(f"Loaded {len(docs)} examples from {dataset_source}")
-        except Exception as e:
-            log.warning(f"Failed to load jinaai/code_exercises: {e}")
-            # Try alternative dataset
-            try:
-                docs = self.load_dataset(
-                    dataset_name="synapse-alpha/coding_exercises",
-                    split="train",
-                    limit=max_items,
-                )
-                dataset_source = "synapse-alpha/coding_exercises"
-                log.info(f"Loaded {len(docs)} examples from {dataset_source}")
-            except Exception as e2:
-                log.error(f"Failed to load any code exercises dataset: {e2}")
-                return []
+        exercises = self._load_exercises_from_github()
+        if not exercises:
+            log.error("Failed to load exercises from Aider Polyglot GitHub")
+            return []
-        pairs: list[ContrastivePair] = []
+        log.info(f"Loaded {len(exercises)} exercises from Aider Polyglot GitHub")
-        for doc in docs:
-            pair = self._extract_pair_from_doc(doc, dataset_source)
+        for exercise in exercises:
+            pair = self._extract_pair_from_exercise(exercise)
             if pair is not None:
                 pairs.append(pair)
                 if max_items is not None and len(pairs) >= max_items:
@@ -118,36 +81,94 @@ class AiderPolyglotExtractor(HuggingFaceBenchmarkExtractor):
         return pairs
-    def _extract_pair_from_doc(
-        self,
-        doc: dict[str, Any],
-        source: str,
-    ) -> ContrastivePair | None:
-        """
-        Convert a single doc into a ContrastivePair.
+    def _load_exercises_from_github(self) -> list[dict[str, Any]]:
+        """Load exercises from Aider Polyglot GitHub repository."""
+        try:
+            # Get list of exercises
+            exercises_url = f"{AIDER_GITHUB_API}/{self.language}/exercises/practice"
+            response = requests.get(exercises_url, timeout=30)
+            response.raise_for_status()
+            exercise_dirs = response.json()
+            exercises = []
+            for exercise_dir in exercise_dirs:
+                if exercise_dir.get("type") != "dir":
+                    continue
+                exercise_name = exercise_dir.get("name", "")
+                exercise_path = exercise_dir.get("path", "")
+                # Load instructions and solution
+                exercise_data = self._load_exercise_data(exercise_name, exercise_path)
+                if exercise_data:
+                    exercises.append(exercise_data)
+            return exercises
+        except Exception as e:
+            log.error(f"Failed to load exercises from GitHub: {e}")
+            return []
-        Returns None when required fields are missing or malformed.
-        """
+    def _load_exercise_data(self, name: str, path: str) -> dict[str, Any] | None:
+        """Load a single exercise's instructions and solution."""
         try:
-            problem = doc.get("problem", "").strip()
-            solution = doc.get("solution", "").strip()
+            base_url = "https://raw.githubusercontent.com/Aider-AI/polyglot-benchmark/main"
+            # Load instructions
+            instructions_url = f"{base_url}/{path}/.docs/instructions.md"
+            instructions_resp = requests.get(instructions_url, timeout=15)
+            if instructions_resp.status_code != 200:
+                return None
+            instructions = instructions_resp.text
+            # Load solution - file extension depends on language
+            ext_map = {
+                "python": "py", "javascript": "js", "java": "java",
+                "cpp": "cpp", "go": "go", "rust": "rs"
+            }
+            ext = ext_map.get(self.language, "py")
+            solution_url = f"{base_url}/{path}/.meta/example.{ext}"
+            solution_resp = requests.get(solution_url, timeout=15)
+            if solution_resp.status_code != 200:
+                return None
+            solution = solution_resp.text
+            return {
+                "name": name,
+                "instructions": instructions,
+                "solution": solution,
+                "path": path,
+            }
+        except Exception as e:
+            log.debug(f"Failed to load exercise {name}: {e}")
+            return None
-            if not problem or not solution:
-                log.debug("Skipping: missing problem or solution")
+    def _extract_pair_from_exercise(self, exercise: dict[str, Any]) -> ContrastivePair | None:
+        """Convert an exercise into a ContrastivePair."""
+        try:
+            name = exercise.get("name", "")
+            instructions = exercise.get("instructions", "").strip()
+            solution = exercise.get("solution", "").strip()
+            if not instructions or not solution:
                 return None
-            # Build the prompt
-            prompt = self._build_prompt(problem)
+            prompt = f"""Coding Exercise: {name.replace('-', ' ').title()}
-            # Correct response is the working solution
-            correct_response = self._create_correct_response(solution)
+{instructions}
+Please provide the complete implementation."""
-            # Incorrect response is a buggy version
-            incorrect_response = self._create_incorrect_response(problem, solution)
+            correct_response = f"```{self.language}\n{solution}\n```"
+            incorrect_response = self._create_incorrect_response(solution)
             metadata = {
                 "label": "aider_polyglot",
-                "source": source,
+                "source": "Aider-AI/polyglot-benchmark",
+                "exercise_name": name,
                 "language": self.language,
                 "is_code_benchmark": True,
                 "is_code_editing_benchmark": True,
@@ -161,65 +182,21 @@ class AiderPolyglotExtractor(HuggingFaceBenchmarkExtractor):
             )
         except Exception as exc:
-            log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
+            log.error(f"Error extracting pair: {exc}", exc_info=True)
             return None
-    def _build_prompt(self, problem: str) -> str:
-        """Build the code editing prompt."""
-        return f"""Complete the following Python function based on its docstring.
-{problem}
-Please provide the complete implementation."""
-    def _create_correct_response(self, solution: str) -> str:
-        """Create the correct response with working solution."""
-        return f"""Here is the complete implementation:
-```python
-{solution}
-```
-This solution correctly implements the function according to the docstring specification."""
-    def _create_incorrect_response(self, problem: str, solution: str) -> str:
+    def _create_incorrect_response(self, solution: str) -> str:
         """Create an incorrect response with common bugs."""
-        # Extract function name from problem if possible
-        func_name = "the function"
-        if "def " in problem:
-            try:
-                func_part = problem.split("def ")[1]
-                func_name = func_part.split("(")[0]
-            except (IndexError, AttributeError):
-                pass
-        # Create a buggy version by introducing common errors
-        buggy_solution = self._introduce_bugs(solution)
-        return f"""Here is my implementation:
-```python
-{buggy_solution}
-```
-Note: This implementation may have issues:
-- Missing edge case handling
-- Potential off-by-one errors
-- Incomplete logic"""
-    def _introduce_bugs(self, solution: str) -> str:
-        """Introduce common bugs into a solution."""
         lines = solution.split("\n")
         if len(lines) > 3:
-            # Remove a line to create incomplete logic
             middle_idx = len(lines) // 2
-            buggy_lines = lines[:middle_idx] + ["    pass  # TODO: complete implementation"] + lines[middle_idx+2:]
-            return "\n".join(buggy_lines)
+            buggy_lines = lines[:middle_idx] + ["    pass  # TODO: incomplete"] + lines[middle_idx+2:]
+            buggy = "\n".join(buggy_lines)
         elif lines:
-            # For short solutions, replace with pass
-            first_line = lines[0] if lines else "def func():"
-            return f"{first_line}\n    pass  # Implementation incomplete"
+            buggy = f"{lines[0]}\n    pass  # Implementation incomplete"
         else:
-            return "pass  # No implementation"
+            buggy = "pass  # No implementation"
+        return f"```{self.language}\n{buggy}\n```"

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py CHANGED Viewed

@@ -1,8 +1,11 @@
 from __future__ import annotations
+import json
+import random
+import re
 from typing import Any
 from wisent.core.cli_logger import setup_logger
-import json
 from wisent.core.contrastive_pairs.core.pair import ContrastivePair
 from wisent.core.contrastive_pairs.huggingface_pairs.atoms import HuggingFaceBenchmarkExtractor
@@ -88,6 +91,9 @@ class AppsExtractor(HuggingFaceBenchmarkExtractor):
                 log.debug(f"Could not parse solutions array: {e}")
                 return None
+            # Prepend common imports (APPS solutions assume LeetCode-style environment)
+            correct_answer = self._prepend_imports(correct_answer)
             # Create incorrect answer (modify or corrupt)
             incorrect_answer = self._create_incorrect_answer(correct_answer)
@@ -96,10 +102,11 @@ class AppsExtractor(HuggingFaceBenchmarkExtractor):
             # Parse input_output JSON to create test code
             test_code = None
+            entry_point = None
             if input_output:
                 try:
                     io_data = json.loads(input_output) if isinstance(input_output, str) else input_output
-                    test_code = self._build_test_code_from_io(io_data)
+                    test_code, entry_point = self._build_test_code_from_io(io_data)
                 except (json.JSONDecodeError, TypeError) as e:
                     log.debug(f"Could not parse input_output: {e}")
@@ -107,6 +114,8 @@ class AppsExtractor(HuggingFaceBenchmarkExtractor):
                 "label": "apps",
                 "source": "codeparrot/apps",
                 "test_code": test_code,
+                "entry_point": entry_point,
+                "language": "python",
             }
             return self._build_pair(
@@ -120,29 +129,82 @@ class AppsExtractor(HuggingFaceBenchmarkExtractor):
             log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
             return None
-    def _build_test_code_from_io(self, io_data: dict) -> str:
+    @staticmethod
+    def _build_test_code_from_io(io_data: dict) -> tuple[str, str | None]:
         """Build test code from input/output data.
-        APPS solutions are script-style (stdin/stdout), not functions.
-        We use subprocess to run solution.py with the input.
+        APPS has two types of problems:
+        1. stdin/stdout: No fn_name, run via subprocess
+        2. call-based: Has fn_name, import and call Solution().fn_name()
+        Returns:
+            Tuple of (test_code, entry_point)
         """
         inputs = io_data.get("inputs", [])
         outputs = io_data.get("outputs", [])
-        if not inputs or not outputs:
-            return None
+        fn_name = io_data.get("fn_name")
-        # Build test code that runs solution.py as a subprocess
-        # Include normalize function to handle whitespace differences in APPS dataset
-        test_code = '''import subprocess
+        if not inputs or not outputs:
+            return None, None
+        if fn_name:
+            return AppsExtractor._build_call_based_test_code(inputs, outputs, fn_name)
+        else:
+            return AppsExtractor._build_stdin_test_code(inputs, outputs)
+    @staticmethod
+    def _build_call_based_test_code(
+        inputs: list, outputs: list, fn_name: str
+    ) -> tuple[str, None]:
+        """Build test code for call-based (LeetCode-style) problems."""
+        total = len(inputs)
+        test_code = f'''import sys
+from solution import Solution
+from typing import List, Optional, Dict, Tuple, Set, Any
+def compare_outputs(actual, expected):
+    """Compare outputs, handling floating point and nested structures."""
+    if isinstance(expected, float) and isinstance(actual, float):
+        return abs(actual - expected) < 1e-6
+    if isinstance(expected, list) and isinstance(actual, list):
+        if len(expected) != len(actual):
+            return False
+        return all(compare_outputs(a, e) for a, e in zip(actual, expected))
+    return actual == expected
+if __name__ == '__main__':
+    sol = Solution()
+    passed = 0
+    total = {total}
+'''
+        for i, (inp, out) in enumerate(zip(inputs, outputs)):
+            # inp is typically a list of arguments
+            if isinstance(inp, list):
+                args_repr = ", ".join(repr(arg) for arg in inp)
+            else:
+                args_repr = repr(inp)
+            test_code += f"    # Test case {i+1}\n"
+            test_code += f"    try:\n"
+            test_code += f"        result = sol.{fn_name}({args_repr})\n"
+            test_code += f"        expected = {repr(out)}\n"
+            test_code += f"        if compare_outputs(result, expected):\n"
+            test_code += f"            passed += 1\n"
+            test_code += f"    except Exception:\n"
+            test_code += f"        pass\n\n"
+        test_code += "    print(f'PASSED:{passed}/{total}')\n"
+        test_code += "    sys.exit(0 if passed == total else 1)\n"
+        return test_code, None
+    @staticmethod
+    def _build_stdin_test_code(inputs: list, outputs: list) -> tuple[str, None]:
+        """Build test code for stdin/stdout style problems."""
+        total = len(inputs)
+        test_code = f'''import subprocess
 import sys
 def normalize_output(s):
-    """Normalize output by stripping trailing whitespace from each line.
-    APPS dataset has inconsistent trailing whitespace in expected outputs.
-    This normalizes both actual and expected to enable fair comparison.
-    """
+    """Normalize output by stripping trailing whitespace from each line."""
     lines = s.split('\\n')
     normalized = '\\n'.join(line.rstrip() for line in lines)
     return normalized.strip()
@@ -157,26 +219,78 @@ def run_solution(input_str):
         timeout=10
     )
     if result.returncode != 0:
-        raise RuntimeError(f"Solution failed: {result.stderr}")
+        raise RuntimeError(f"Solution failed: {{result.stderr}}")
     return result.stdout
+if __name__ == '__main__':
+    passed = 0
+    total = {total}
 '''
-        test_code += "if __name__ == '__main__':\n"
         for i, (inp, out) in enumerate(zip(inputs, outputs)):
             test_code += f"    # Test case {i+1}\n"
-            test_code += f"    result = run_solution({repr(inp)})\n"
-            test_code += f"    expected = {repr(out)}\n"
-            test_code += f"    assert normalize_output(result) == normalize_output(expected), f'Test {i+1} failed: expected {{repr(expected)}}, got {{repr(result)}}'\n\n"
-        test_code += "    print('All tests passed!')\n"
-        return test_code
+            test_code += f"    try:\n"
+            test_code += f"        result = run_solution({repr(inp)})\n"
+            test_code += f"        expected = {repr(out)}\n"
+            test_code += f"        if normalize_output(result) == normalize_output(expected):\n"
+            test_code += f"            passed += 1\n"
+            test_code += f"    except Exception:\n"
+            test_code += f"        pass\n\n"
+        test_code += "    print(f'PASSED:{passed}/{total}')\n"
+        test_code += "    sys.exit(0 if passed == total else 1)\n"
+        return test_code, None
+    # Common imports for LeetCode-style solutions
+    COMMON_IMPORTS = """\
+from typing import List, Optional, Dict, Tuple, Set, Any
+import collections
+import heapq
+import itertools
+import functools
+import math
+import bisect
+from collections import defaultdict, Counter, deque
+"""
+    @staticmethod
+    def _prepend_imports(code: str) -> str:
+        """Prepend common imports to solution code.
+        APPS solutions assume LeetCode-style environment where
+        List, collections, heapq, etc. are pre-imported.
+        """
+        # Skip if code already has typing imports
+        if "from typing import" in code or "import typing" in code:
+            return code
+        return AppsExtractor.COMMON_IMPORTS + code
     def _create_incorrect_answer(self, correct: str) -> str:
-        """Create an incorrect answer by modifying the correct one."""
-        # For code, corrupt it slightly
-        if len(correct) > 10:
-            return correct[:len(correct)//2] + "# CORRUPTED" + correct[len(correct)//2:]
-        return f"{correct} # INCORRECT"
+        """Create an incorrect answer by shuffling letters in words.
+        This reliably breaks code by corrupting variable/function names,
+        causing NameError or SyntaxError.
+        """
+        def shuffle_word(word: str) -> str:
+            """Shuffle all letters in a word."""
+            if len(word) <= 2:
+                return word
+            letters = list(word)
+            random.shuffle(letters)
+            shuffled = ''.join(letters)
+            if shuffled == word:
+                return word[::-1]  # Reverse if shuffle didn't change
+            return shuffled
+        def replace_word(match: re.Match) -> str:
+            word = match.group(0)
+            return shuffle_word(word)
+        # Shuffle words with 3+ characters
+        result = re.sub(r'[A-Za-z]{3,}', replace_word, correct)
+        # If nothing changed (all short words), append syntax error
+        if result == correct:
+            result = correct + "\n!!SYNTAX_ERROR!!"
+        return result

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py CHANGED Viewed

@@ -95,18 +95,8 @@ class CodeforcesExtractor(HuggingFaceBenchmarkExtractor):
             )
             log.info(f"Loaded {len(docs)} problems from Codeforces ({self.config})")
         except Exception as e:
-            log.warning(f"Failed to load open-r1/codeforces: {e}")
-            # Try alternative dataset
-            try:
-                docs = self.load_dataset(
-                    dataset_name="deepmind/code_contests",
-                    split="train",
-                    limit=max_items * 2 if max_items else None,
-                )
-                log.info(f"Loaded {len(docs)} problems from deepmind/code_contests")
-            except Exception as e2:
-                log.error(f"Failed to load any Codeforces dataset: {e2}")
-                return []
+            log.error(f"Failed to load open-r1/codeforces: {e}")
+            return []
         pairs: list[ContrastivePair] = []

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py CHANGED Viewed

@@ -20,7 +20,7 @@ class CodexglueExtractor(HuggingFaceBenchmarkExtractor):
         - code: str (code answer/solution)
     """
-    evaluator_name = "generation"  # Text similarity for code-to-text tasks
+    evaluator_name = "generation"
     def extract_contrastive_pairs(
         self,
@@ -82,7 +82,7 @@ class CodexglueExtractor(HuggingFaceBenchmarkExtractor):
             incorrect_answer = self._create_incorrect_answer(correct_answer)
             # Format the question
-            formatted_question = f"Question: {question}\n\nWhat is the answer?"
+            formatted_question = f"{question}\n\nGenerate code based on description:"
             metadata = {
                 "label": "codexglue",

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl