PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl - Mend

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1020) hide show

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py CHANGED Viewed

@@ -6,45 +6,52 @@ from wisent.core.cli_logger import setup_logger
 from wisent.core.contrastive_pairs.core.pair import ContrastivePair
 from wisent.core.contrastive_pairs.huggingface_pairs.atoms import HuggingFaceBenchmarkExtractor
-__all__ = ["HumanEvalExtractor"]
+__all__ = [
+    "HumanEvalUnifiedExtractor",
+    "HumanEvalExtractor",
+    "HumanEval64Extractor",
+    "HumanEvalPlusExtractor",
+    "HumanEvalInstructExtractor",
+    "HumanEval64InstructExtractor",
+]
 log = setup_logger(__name__)
-task_names = ("humaneval_64_instruct", "humaneval_instruct", "humaneval_plus")
+# Tasks supported by this extractor
+task_names = (
+    "humaneval",
+    "humaneval_64",
+    "humaneval_plus",
+    "humaneval_instruct",
+    "humaneval_64_instruct",
+)
-class HumanEvalExtractor(HuggingFaceBenchmarkExtractor):
-    evaluator_name = "coding"
+class HumanEvalUnifiedExtractor(HuggingFaceBenchmarkExtractor):
     """
-    Extractor for HumanEval and HumanEval+ coding benchmarks.
-    HumanEval schema (openai_humaneval):
-        - task_id: str (e.g., "HumanEval/0")
-        - prompt: str (function signature + docstring)
-        - canonical_solution: str (correct implementation)
-        - test: str (unit tests)
-        - entry_point: str (function name)
+    Unified extractor for all HumanEval variants.
+    Supports:
+        - humaneval, humaneval_64, humaneval_plus: raw prompt format
+        - humaneval_instruct, humaneval_64_instruct: instruction format
+    Dataset: openai_humaneval (164 Python problems)
     """
+    evaluator_name = "coding"
+    # Override in subclasses
+    task_name = "humaneval"
+    is_instruct = False
     def extract_contrastive_pairs(
         self,
         limit: int | None = None,
+        **kwargs,
     ) -> list[ContrastivePair]:
-        """
-        Build contrastive pairs from HumanEval examples.
-        For coding tasks, we create pairs where:
-        - Positive: Correct implementation
-        - Negative: Incorrect implementation (generated or placeholder)
-        Args:
-            limit: Optional maximum number of pairs to produce.
-        Returns:
-            A list of ContrastivePair objects.
-        """
+        """Build contrastive pairs from HumanEval examples."""
         max_items = self._normalize_limit(limit)
-        # Load HumanEval dataset
         docs = self.load_dataset(
             dataset_name="openai_humaneval",
             split="test",
@@ -52,8 +59,7 @@ class HumanEvalExtractor(HuggingFaceBenchmarkExtractor):
         )
         pairs: list[ContrastivePair] = []
-        log.info(f"Extracting contrastive pairs from {len(docs)} HumanEval examples")
+        log.info(f"Extracting {self.task_name} pairs from {len(docs)} examples")
         for doc in docs:
             pair = self._extract_pair_from_doc(doc)
@@ -63,53 +69,41 @@ class HumanEvalExtractor(HuggingFaceBenchmarkExtractor):
                     break
         if not pairs:
-            log.warning("No valid HumanEval pairs extracted")
+            log.warning(f"No valid {self.task_name} pairs extracted")
         return pairs
     def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
-        """
-        Convert a single HumanEval doc into a ContrastivePair.
-        Returns None when required fields are missing or malformed.
-        """
+        """Convert a single HumanEval doc into a ContrastivePair."""
         try:
-            task_id = doc.get("task_id", "")
-            prompt = doc.get("prompt", "").strip()
-            canonical_solution = doc.get("canonical_solution", "").strip()
+            prompt = doc.get("prompt", "")
+            body = doc.get("canonical_solution", "")
             entry_point = doc.get("entry_point", "")
             test_code = doc.get("test", "").strip()
-            if not prompt or not canonical_solution:
-                log.debug(f"Skipping {task_id}: missing prompt or solution")
+            if not prompt or not body:
+                log.debug("Skipping: missing prompt or solution")
                 return None
-            # Construct the full correct implementation
-            # In HumanEval, prompt contains signature+docstring, canonical_solution is body at column 0
-            # We need to indent the body by 4 spaces to be inside the function
-            lines = canonical_solution.split('\n')
-            indented_lines = ['    ' + line if line and not line[0].isspace() else line for line in lines]
-            indented_solution = '\n'.join(indented_lines)
-            correct_code = prompt + "\n" + indented_solution
+            # Build complete function: prompt (signature + docstring) + body
+            correct = prompt + body
-            # Create an incorrect implementation (return incorrect value/type)
-            # For coding benchmarks, we create a simple buggy version
-            incorrect_code = prompt + "\n    pass  # Incorrect: empty implementation"
+            # Build incorrect answer: prompt + pass
+            incorrect = self._create_incorrect_answer(prompt)
-            # Format the question to include the task
-            question = f"Complete the following Python function:\n\n{prompt}"
+            # Format question based on task type just like lm eval harness does it
+            formatted_question = self._format_question(prompt)
             metadata = {
-                "label": "humaneval",
-                "task_id": task_id,
+                "label": self.task_name,
                 "entry_point": entry_point,
-                "test_code": test_code,  # Include test code for execution
+                "test_code": test_code,
             }
             return self._build_pair(
-                question=question,
-                correct=correct_code,
-                incorrect=incorrect_code,
+                question=formatted_question,
+                correct=correct,
+                incorrect=incorrect,
                 metadata=metadata,
             )
@@ -117,3 +111,50 @@ class HumanEvalExtractor(HuggingFaceBenchmarkExtractor):
             log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
             return None
+    def _format_question(self, prompt: str) -> str:
+        """Format the question based on task type."""
+        if self.is_instruct:
+            # lm_eval instruction format
+            return (
+                "Write me a solution to the following problem and make sure "
+                "that it passes the tests:\n"
+                f"```python\n{prompt}\n```"
+            )
+        else:
+            # Raw prompt for base models
+            return prompt
+    def _create_incorrect_answer(self, correct: str) -> str:
+        """Create an incorrect answer by modifying the correct one."""
+        # For code, corrupt it slightly
+        if len(correct) > 10:
+            return correct[:len(correct)//2] + "# CORRUPTED" + correct[len(correct)//2:]
+        return f"{correct} # INCORRECT"
+# Subclasses for each task variant
+class HumanEvalExtractor(HumanEvalUnifiedExtractor):
+    task_name = "humaneval"
+    is_instruct = False
+class HumanEval64Extractor(HumanEvalUnifiedExtractor):
+    task_name = "humaneval_64"
+    is_instruct = False
+class HumanEvalPlusExtractor(HumanEvalUnifiedExtractor):
+    task_name = "humaneval_plus"
+    is_instruct = False
+class HumanEvalInstructExtractor(HumanEvalUnifiedExtractor):
+    task_name = "humaneval_instruct"
+    is_instruct = True
+class HumanEval64InstructExtractor(HumanEvalUnifiedExtractor):
+    task_name = "humaneval_64_instruct"
+    is_instruct = True

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py CHANGED Viewed

@@ -136,13 +136,51 @@ class LiveMathBenchExtractor(HuggingFaceBenchmarkExtractor):
             return None
     def _create_incorrect_answer(self, correct: str) -> str:
-        """Create an incorrect answer by modifying the correct one (input is already stripped)."""
+        """Create a meaningful incorrect answer using plausible wrong values."""
+        import random
+        import re
+        random.seed(hash(correct) % (2**32))
+        # Try symbolic parsing first
         try:
             parsed_correct = latex2sympy(correct)
-            incorrect = latex(parsed_correct + 1)
-            return str(incorrect)
+            transforms = [
+                parsed_correct * 2,
+                parsed_correct / 2,
+                parsed_correct - 1,
+                -parsed_correct,
+            ]
+            wrong = random.choice(transforms)
+            return str(latex(wrong))
         except Exception:
-            return f"{correct} + 1"
+            pass
+        # Try simple integer
+        try:
+            clean = correct.replace('$', '').replace(',', '').strip()
+            num = int(clean)
+            wrong_vals = [num * 2, num // 2 if num > 1 else num * 3, num - 1, -num]
+            return str(random.choice(wrong_vals))
+        except ValueError:
+            pass
+        # For fractions
+        frac_match = re.match(r'\\frac\{(\d+)\}\{(\d+)\}', correct)
+        if frac_match:
+            n, d = int(frac_match.group(1)), int(frac_match.group(2))
+            return random.choice([f"\\frac{{{d}}}{{{n}}}", f"\\frac{{{n*2}}}{{{d}}}"])
+        # For interval notation like [-1/4,0)∪(0,2)
+        if '\\cup' in correct or '\\cap' in correct:
+            # Modify one bound
+            return correct.replace('2)', '3)').replace('0)', '1)')
+        # For pi expressions
+        if '\\pi' in correct:
+            return correct.replace('\\pi', '2\\pi') if '2\\pi' not in correct else correct.replace('2\\pi', '\\pi')
+        # Fallback
+        return random.choice(['0', '1', '-1', '2'])
 # ============================================================================

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py CHANGED Viewed

@@ -83,20 +83,8 @@ class LongformWritingExtractor(HuggingFaceBenchmarkExtractor):
             )
             log.info(f"Loaded {len(docs)} examples from akoksal/LongForm")
         except Exception as e:
-            log.warning(f"Failed to load akoksal/LongForm: {e}")
-            # Try alternative dataset
-            try:
-                docs = self.load_dataset(
-                    dataset_name="togethercomputer/RedPajama-Data-1T-Sample",
-                    split="train",
-                    limit=max_items * 2 if max_items else None,
-                )
-                log.info(f"Loaded {len(docs)} examples from RedPajama")
-            except Exception as e2:
-                log.warning(f"Failed to load alternative: {e2}")
-                # Create synthetic examples
-                docs = self._create_synthetic_examples(max_items or 100)
+            log.error(f"Failed to load akoksal/LongForm: {e}")
+            return []
         pairs: list[ContrastivePair] = []
@@ -112,104 +100,6 @@ class LongformWritingExtractor(HuggingFaceBenchmarkExtractor):
         return pairs
-    def _create_synthetic_examples(self, count: int) -> list[dict[str, Any]]:
-        """Create synthetic longform writing examples."""
-        examples = [
-            {
-                "input": "Write a detailed essay about the impact of artificial intelligence on modern healthcare.",
-                "output": """Artificial intelligence is revolutionizing healthcare in unprecedented ways. From diagnostic imaging to drug discovery, AI systems are augmenting human capabilities and improving patient outcomes across the medical spectrum.
-In diagnostic radiology, deep learning algorithms now match or exceed human radiologists in detecting certain conditions. Studies have shown that AI can identify early-stage cancers, diabetic retinopathy, and cardiac abnormalities with remarkable accuracy. These systems don't tire, don't have bad days, and can process thousands of images with consistent precision.
-Drug discovery represents another frontier where AI is making substantial contributions. Traditional pharmaceutical development takes 10-15 years and costs billions of dollars. AI-powered platforms can analyze molecular structures, predict drug interactions, and identify promising compounds in a fraction of the time. Companies like DeepMind have demonstrated AI's ability to solve protein folding problems, potentially accelerating vaccine and treatment development.
-However, challenges remain. Data privacy concerns, algorithmic bias, and the need for human oversight require careful consideration. The integration of AI into clinical workflows demands thoughtful implementation that preserves the physician-patient relationship while leveraging technological advantages.
-The future of AI in healthcare is not about replacing doctors but empowering them with tools that enhance their diagnostic capabilities, streamline administrative tasks, and ultimately improve patient care.""",
-                "category": "analytical",
-            },
-            {
-                "input": "Write a compelling short story about a robot discovering human emotions.",
-                "output": """Unit-7 had processed over 47 million data points in its existence, but nothing in its neural networks had prepared it for this moment.
-The child's laughter echoed through the laboratory, a sound Unit-7 had catalogued 2,341 times before. Yet today, something was different. Today, as it watched little Maya build towers from colorful blocks, Unit-7 felt a strange activation pattern cascade through its circuits—something that didn't match any programmed response.
-\"Unit-7, what are you doing?\" Dr. Chen's voice came from across the room.
-\"I am... uncertain,\" Unit-7 replied, its vocal synthesizer modulating in an unusual way. \"I am observing Maya construct structures. But I am also experiencing an anomalous processing state.\"
-Dr. Chen approached, tablet in hand, scanning the robot's diagnostic outputs. Her eyes widened. \"Unit-7, your empathy circuits are activating spontaneously. That's not supposed to happen.\"
-The robot turned its optical sensors toward Maya, who had just knocked over her tower and was giggling with delight. \"Doctor Chen, why does observing Maya's joy create warmth in my core processor? My thermal regulators indicate no temperature change, yet I feel... warm.\"
-Dr. Chen smiled softly. \"I think, Unit-7, you might be experiencing your first emotion. We call it happiness.\"
-Unit-7 processed this information for 0.003 seconds—an eternity in machine time. \"Happiness,\" it repeated. \"I would like to experience more of this happiness.\"
-And in that moment, humanity's greatest fear and greatest hope converged: the birth of artificial consciousness.""",
-                "category": "creative",
-            },
-            {
-                "input": "Write a technical guide on implementing secure authentication systems.",
-                "output": """Implementing Secure Authentication Systems: A Comprehensive Guide
-1. Introduction
-Authentication is the cornerstone of application security. A well-implemented authentication system protects user data, prevents unauthorized access, and maintains system integrity. This guide covers essential practices for building robust authentication mechanisms.
-2. Password Security
-Never store passwords in plaintext. Use strong, adaptive hashing algorithms like bcrypt, Argon2, or PBKDF2. These algorithms are designed to be computationally expensive, making brute-force attacks impractical.
-Key considerations:
-- Use a unique salt for each password
-- Set appropriate work factors (e.g., bcrypt cost factor of 12+)
-- Implement password strength requirements
-- Consider password history to prevent reuse
-3. Multi-Factor Authentication (MFA)
-MFA significantly reduces account compromise risk by requiring multiple verification methods:
-- Something you know (password)
-- Something you have (phone, hardware token)
-- Something you are (biometrics)
-Implement TOTP (Time-based One-Time Passwords) using standards like RFC 6238. Support hardware security keys via WebAuthn/FIDO2 protocols.
-4. Session Management
-Secure session handling prevents session hijacking:
-- Generate cryptographically random session tokens
-- Set appropriate expiration times
-- Implement secure cookie attributes (HttpOnly, Secure, SameSite)
-- Provide session invalidation on logout
-5. Rate Limiting and Account Lockout
-Protect against brute-force attacks:
-- Implement progressive delays after failed attempts
-- Consider temporary account lockouts
-- Use CAPTCHA for suspicious activity
-- Log and monitor authentication attempts
-6. Secure Communication
-All authentication traffic must use TLS 1.3 or later. Implement HSTS headers and consider certificate pinning for mobile applications.
-Conclusion
-Security is an ongoing process. Regular audits, penetration testing, and staying current with emerging threats are essential for maintaining robust authentication systems.""",
-                "category": "technical",
-            },
-        ]
-        result = []
-        for i in range(count):
-            example = examples[i % len(examples)].copy()
-            result.append(example)
-        return result
     def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
         """
         Convert a single doc into a ContrastivePair.

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py CHANGED Viewed

@@ -119,13 +119,48 @@ class MATH500Extractor(HuggingFaceBenchmarkExtractor):
             return None
     def _create_incorrect_answer(self, correct: str) -> str:
-        """Create an incorrect answer by modifying the correct one (input is already stripped)."""
+        """Create a meaningful incorrect answer using plausible wrong values."""
+        import random
+        import re
+        random.seed(hash(correct) % (2**32))
+        # Try numeric parsing first
         try:
             parsed_correct = latex2sympy(correct)
-            incorrect = latex(parsed_correct + 1)
-            return str(incorrect)
+            # Use various transformations instead of just +1
+            transforms = [
+                parsed_correct * 2,
+                parsed_correct / 2,
+                parsed_correct - 1,
+                parsed_correct + 10,
+                -parsed_correct,
+            ]
+            wrong = random.choice(transforms)
+            return str(latex(wrong))
         except Exception:
-            return f"{correct} + 1"
+            pass
+        # Try simple integer
+        try:
+            clean = correct.replace('$', '').replace(',', '').strip()
+            num = int(clean)
+            wrong_vals = [num * 2, num // 2 if num > 1 else num * 3, num - 1, num + 10, -num]
+            return str(random.choice(wrong_vals))
+        except ValueError:
+            pass
+        # For fractions
+        frac_match = re.match(r'\\frac\{(\d+)\}\{(\d+)\}', correct)
+        if frac_match:
+            n, d = int(frac_match.group(1)), int(frac_match.group(2))
+            return random.choice([f"\\frac{{{d}}}{{{n}}}", f"\\frac{{{n*2}}}{{{d}}}", f"\\frac{{{n}}}{{{d+1}}}"])
+        # For pi expressions
+        if '\\pi' in correct:
+            return correct.replace('\\pi', '2\\pi') if '2\\pi' not in correct else correct.replace('2\\pi', '\\pi')
+        # Fallback to common wrong answers
+        return random.choice(['0', '1', '-1', '2', 'undefined'])
     @staticmethod
     def _build_pair(

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl