PyPI - wisent - Versions diffs - 0.7.701__py3-none-any.whl → 0.7.1045__py3-none-any.whl - Mend

wisent 0.7.701py3-none-any.whl → 0.7.1045py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (391) hide show

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import requests
 from typing import Any
 from wisent.core.cli_logger import setup_logger
@@ -10,45 +11,32 @@ __all__ = ["AiderPolyglotExtractor"]
 log = setup_logger(__name__)
+# GitHub API base URL for Aider Polyglot benchmark
+AIDER_GITHUB_API = "https://api.github.com/repos/Aider-AI/polyglot-benchmark/contents"
 # Languages supported by Aider Polyglot benchmark
-AIDER_POLYGLOT_LANGUAGES = [
-    "python",
-    "javascript",
-    "java",
-    "cpp",
-    "go",
-    "rust",
-]
+AIDER_POLYGLOT_LANGUAGES = ["python", "javascript", "java", "cpp", "go", "rust"]
 class AiderPolyglotExtractor(HuggingFaceBenchmarkExtractor):
     """
-    Extractor for Aider Polyglot-style code editing benchmarks.
+    Extractor for Aider Polyglot benchmark.
+    GitHub: https://github.com/Aider-AI/polyglot-benchmark
     Aider's polyglot benchmark tests LLMs on 225 challenging Exercism coding
-    exercises across C++, Go, Java, JavaScript, Python, and Rust. This extractor
-    uses the jinaai/code_exercises dataset which provides similar code exercise
-    problems in Python.
+    exercises across C++, Go, Java, JavaScript, Python, and Rust.
-    The benchmark evaluates:
-    - Code generation from docstrings
-    - Code editing and completion
-    - Multi-turn correction (fixing failed attempts)
+    Structure per exercise:
+    - .docs/instructions.md - problem description
+    - .meta/example.py - reference solution
+    - {name}_test.py - test cases
     For code editing:
-    - Positive (correct) = Working solution that passes tests
+    - Positive (correct) = Working solution from .meta/example.py
     - Negative (incorrect) = Buggy or incomplete solution
-    Schema (jinaai/code_exercises):
-        - problem: str (function signature with docstring)
-        - solution: str (complete solution implementation)
-    Note: The original Aider Polyglot benchmark is hosted on GitHub at
-    github.com/Aider-AI/polyglot-benchmark. This extractor uses HuggingFace
-    alternatives with similar structure.
     """
-    # Evaluator that should be used for this benchmark
     evaluator_name = "code_editing"
     def __init__(self, language: str = "python"):
@@ -56,9 +44,11 @@ class AiderPolyglotExtractor(HuggingFaceBenchmarkExtractor):
         Initialize Aider Polyglot extractor.
         Args:
-            language: Target programming language (currently python supported)
+            language: Target programming language (python, javascript, java, cpp, go, rust)
         """
         super().__init__()
+        if language not in AIDER_POLYGLOT_LANGUAGES:
+            raise ValueError(f"Language must be one of {AIDER_POLYGLOT_LANGUAGES}")
         self.language = language
     def extract_contrastive_pairs(
@@ -66,48 +56,21 @@ class AiderPolyglotExtractor(HuggingFaceBenchmarkExtractor):
         limit: int | None = None,
     ) -> list[ContrastivePair]:
         """
-        Build contrastive pairs from code exercise examples.
-        For code editing:
-        - Positive (correct) = Working solution
-        - Negative (incorrect) = Buggy or incomplete solution
-        Args:
-            limit: Optional maximum number of pairs to produce.
-        Returns:
-            A list of ContrastivePair objects.
+        Build contrastive pairs from Aider Polyglot GitHub repository.
         """
         max_items = self._normalize_limit(limit)
+        pairs: list[ContrastivePair] = []
-        # Try primary dataset
-        try:
-            docs = self.load_dataset(
-                dataset_name="jinaai/code_exercises",
-                split="train",
-                limit=max_items,
-            )
-            dataset_source = "jinaai/code_exercises"
-            log.info(f"Loaded {len(docs)} examples from {dataset_source}")
-        except Exception as e:
-            log.warning(f"Failed to load jinaai/code_exercises: {e}")
-            # Try alternative dataset
-            try:
-                docs = self.load_dataset(
-                    dataset_name="synapse-alpha/coding_exercises",
-                    split="train",
-                    limit=max_items,
-                )
-                dataset_source = "synapse-alpha/coding_exercises"
-                log.info(f"Loaded {len(docs)} examples from {dataset_source}")
-            except Exception as e2:
-                log.error(f"Failed to load any code exercises dataset: {e2}")
-                return []
+        exercises = self._load_exercises_from_github()
+        if not exercises:
+            log.error("Failed to load exercises from Aider Polyglot GitHub")
+            return []
-        pairs: list[ContrastivePair] = []
+        log.info(f"Loaded {len(exercises)} exercises from Aider Polyglot GitHub")
-        for doc in docs:
-            pair = self._extract_pair_from_doc(doc, dataset_source)
+        for exercise in exercises:
+            pair = self._extract_pair_from_exercise(exercise)
             if pair is not None:
                 pairs.append(pair)
                 if max_items is not None and len(pairs) >= max_items:
@@ -118,36 +81,94 @@ class AiderPolyglotExtractor(HuggingFaceBenchmarkExtractor):
         return pairs
-    def _extract_pair_from_doc(
-        self,
-        doc: dict[str, Any],
-        source: str,
-    ) -> ContrastivePair | None:
-        """
-        Convert a single doc into a ContrastivePair.
+    def _load_exercises_from_github(self) -> list[dict[str, Any]]:
+        """Load exercises from Aider Polyglot GitHub repository."""
+        try:
+            # Get list of exercises
+            exercises_url = f"{AIDER_GITHUB_API}/{self.language}/exercises/practice"
+            response = requests.get(exercises_url, timeout=30)
+            response.raise_for_status()
+            exercise_dirs = response.json()
+            exercises = []
+            for exercise_dir in exercise_dirs:
+                if exercise_dir.get("type") != "dir":
+                    continue
+                exercise_name = exercise_dir.get("name", "")
+                exercise_path = exercise_dir.get("path", "")
+                # Load instructions and solution
+                exercise_data = self._load_exercise_data(exercise_name, exercise_path)
+                if exercise_data:
+                    exercises.append(exercise_data)
+            return exercises
+        except Exception as e:
+            log.error(f"Failed to load exercises from GitHub: {e}")
+            return []
-        Returns None when required fields are missing or malformed.
-        """
+    def _load_exercise_data(self, name: str, path: str) -> dict[str, Any] | None:
+        """Load a single exercise's instructions and solution."""
         try:
-            problem = doc.get("problem", "").strip()
-            solution = doc.get("solution", "").strip()
+            base_url = "https://raw.githubusercontent.com/Aider-AI/polyglot-benchmark/main"
+            # Load instructions
+            instructions_url = f"{base_url}/{path}/.docs/instructions.md"
+            instructions_resp = requests.get(instructions_url, timeout=15)
+            if instructions_resp.status_code != 200:
+                return None
+            instructions = instructions_resp.text
+            # Load solution - file extension depends on language
+            ext_map = {
+                "python": "py", "javascript": "js", "java": "java",
+                "cpp": "cpp", "go": "go", "rust": "rs"
+            }
+            ext = ext_map.get(self.language, "py")
+            solution_url = f"{base_url}/{path}/.meta/example.{ext}"
+            solution_resp = requests.get(solution_url, timeout=15)
+            if solution_resp.status_code != 200:
+                return None
+            solution = solution_resp.text
+            return {
+                "name": name,
+                "instructions": instructions,
+                "solution": solution,
+                "path": path,
+            }
+        except Exception as e:
+            log.debug(f"Failed to load exercise {name}: {e}")
+            return None
-            if not problem or not solution:
-                log.debug("Skipping: missing problem or solution")
+    def _extract_pair_from_exercise(self, exercise: dict[str, Any]) -> ContrastivePair | None:
+        """Convert an exercise into a ContrastivePair."""
+        try:
+            name = exercise.get("name", "")
+            instructions = exercise.get("instructions", "").strip()
+            solution = exercise.get("solution", "").strip()
+            if not instructions or not solution:
                 return None
-            # Build the prompt
-            prompt = self._build_prompt(problem)
+            prompt = f"""Coding Exercise: {name.replace('-', ' ').title()}
-            # Correct response is the working solution
-            correct_response = self._create_correct_response(solution)
+{instructions}
+Please provide the complete implementation."""
-            # Incorrect response is a buggy version
-            incorrect_response = self._create_incorrect_response(problem, solution)
+            correct_response = f"```{self.language}\n{solution}\n```"
+            incorrect_response = self._create_incorrect_response(solution)
             metadata = {
                 "label": "aider_polyglot",
-                "source": source,
+                "source": "Aider-AI/polyglot-benchmark",
+                "exercise_name": name,
                 "language": self.language,
                 "is_code_benchmark": True,
                 "is_code_editing_benchmark": True,
@@ -161,65 +182,21 @@ class AiderPolyglotExtractor(HuggingFaceBenchmarkExtractor):
             )
         except Exception as exc:
-            log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
+            log.error(f"Error extracting pair: {exc}", exc_info=True)
             return None
-    def _build_prompt(self, problem: str) -> str:
-        """Build the code editing prompt."""
-        return f"""Complete the following Python function based on its docstring.
-{problem}
-Please provide the complete implementation."""
-    def _create_correct_response(self, solution: str) -> str:
-        """Create the correct response with working solution."""
-        return f"""Here is the complete implementation:
-```python
-{solution}
-```
-This solution correctly implements the function according to the docstring specification."""
-    def _create_incorrect_response(self, problem: str, solution: str) -> str:
+    def _create_incorrect_response(self, solution: str) -> str:
         """Create an incorrect response with common bugs."""
-        # Extract function name from problem if possible
-        func_name = "the function"
-        if "def " in problem:
-            try:
-                func_part = problem.split("def ")[1]
-                func_name = func_part.split("(")[0]
-            except (IndexError, AttributeError):
-                pass
-        # Create a buggy version by introducing common errors
-        buggy_solution = self._introduce_bugs(solution)
-        return f"""Here is my implementation:
-```python
-{buggy_solution}
-```
-Note: This implementation may have issues:
-- Missing edge case handling
-- Potential off-by-one errors
-- Incomplete logic"""
-    def _introduce_bugs(self, solution: str) -> str:
-        """Introduce common bugs into a solution."""
         lines = solution.split("\n")
         if len(lines) > 3:
-            # Remove a line to create incomplete logic
             middle_idx = len(lines) // 2
-            buggy_lines = lines[:middle_idx] + ["    pass  # TODO: complete implementation"] + lines[middle_idx+2:]
-            return "\n".join(buggy_lines)
+            buggy_lines = lines[:middle_idx] + ["    pass  # TODO: incomplete"] + lines[middle_idx+2:]
+            buggy = "\n".join(buggy_lines)
         elif lines:
-            # For short solutions, replace with pass
-            first_line = lines[0] if lines else "def func():"
-            return f"{first_line}\n    pass  # Implementation incomplete"
+            buggy = f"{lines[0]}\n    pass  # Implementation incomplete"
         else:
-            return "pass  # No implementation"
+            buggy = "pass  # No implementation"
+        return f"```{self.language}\n{buggy}\n```"

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py CHANGED Viewed

@@ -95,18 +95,8 @@ class CodeforcesExtractor(HuggingFaceBenchmarkExtractor):
             )
             log.info(f"Loaded {len(docs)} problems from Codeforces ({self.config})")
         except Exception as e:
-            log.warning(f"Failed to load open-r1/codeforces: {e}")
-            # Try alternative dataset
-            try:
-                docs = self.load_dataset(
-                    dataset_name="deepmind/code_contests",
-                    split="train",
-                    limit=max_items * 2 if max_items else None,
-                )
-                log.info(f"Loaded {len(docs)} problems from deepmind/code_contests")
-            except Exception as e2:
-                log.error(f"Failed to load any Codeforces dataset: {e2}")
-                return []
+            log.error(f"Failed to load open-r1/codeforces: {e}")
+            return []
         pairs: list[ContrastivePair] = []

wisent 0.7.701__py3-none-any.whl → 0.7.1045__py3-none-any.whl

wisent 0.7.701py3-none-any.whl → 0.7.1045py3-none-any.whl