wisent 0.7.379-py3-none-any.whl → 0.7.701-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (725)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/__init__.py +22 -6
  3. wisent/core/activations/activations.py +21 -39
  4. wisent/core/activations/activations_collector.py +141 -373
  5. wisent/core/activations/classifier_inference_strategy.py +194 -0
  6. wisent/core/activations/core/atoms.py +8 -92
  7. wisent/core/activations/extraction_strategy.py +308 -0
  8. wisent/core/agent/diagnose/response_diagnostics.py +3 -3
  9. wisent/core/agent/diagnose.py +3 -3
  10. wisent/core/autonomous_agent.py +2 -2
  11. wisent/core/cli/agent/apply_steering.py +23 -27
  12. wisent/core/cli/agent/evaluate_response.py +18 -20
  13. wisent/core/cli/agent/train_classifier.py +18 -20
  14. wisent/core/cli/cluster_benchmarks.py +472 -0
  15. wisent/core/cli/create_steering_vector.py +13 -5
  16. wisent/core/cli/generate_vector_from_task.py +4 -0
  17. wisent/core/cli/get_activations.py +12 -36
  18. wisent/core/cli/method_optimizer.py +859 -0
  19. wisent/core/cli/optimize.py +44 -5
  20. wisent/core/cli/optimize_classification.py +5 -6
  21. wisent/core/cli/optimize_sample_size.py +8 -22
  22. wisent/core/cli/optimize_steering.py +429 -153
  23. wisent/core/cli/optimize_weights.py +65 -6
  24. wisent/core/cli/steering_method_trainer.py +5 -4
  25. wisent/core/cli/steering_search_space.py +20 -15
  26. wisent/core/cli/tasks.py +14 -43
  27. wisent/core/cli/train_unified_goodness.py +17 -18
  28. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1578 -173
  29. wisent/core/contrastive_pairs/diagnostics/linearity.py +63 -80
  30. wisent/core/contrastive_pairs/diagnostics/vector_quality.py +6 -5
  31. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +5 -19
  32. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +11 -5
  33. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +146 -32
  34. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +2 -2
  35. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +98 -57
  36. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +8 -8
  37. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +1 -1
  38. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -5
  39. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval_aqua_rat.py +129 -0
  40. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +11 -6
  41. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +1 -1
  42. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +47 -6
  43. wisent/core/evaluators/benchmark_specific/apps_evaluator.py +133 -0
  44. wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +6 -1
  45. wisent/core/evaluators/benchmark_specific/conala_evaluator.py +31 -168
  46. wisent/core/evaluators/custom/examples/humanization_coherent.py +89 -35
  47. wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +2 -20
  48. wisent/core/evaluators/personalization/coherence.py +46 -0
  49. wisent/core/hyperparameter_optimizer.py +13 -13
  50. wisent/core/lm_eval_harness_ground_truth.py +7 -11
  51. wisent/core/main.py +3 -0
  52. wisent/core/models/wisent_model.py +8 -7
  53. wisent/core/opti/methods/opti_weights.py +29 -2
  54. wisent/core/optuna/classifier/activation_generator.py +14 -12
  55. wisent/core/optuna/steering/steering_optimization.py +14 -9
  56. wisent/core/parser_arguments/cluster_benchmarks_parser.py +31 -0
  57. wisent/core/parser_arguments/generate_vector_from_task_parser.py +20 -0
  58. wisent/core/parser_arguments/main_parser.py +8 -0
  59. wisent/core/parser_arguments/optimize_steering_parser.py +117 -10
  60. wisent/core/parser_arguments/optimize_weights_parser.py +6 -0
  61. wisent/core/parser_arguments/tasks_parser.py +7 -19
  62. wisent/core/steering_methods/core/atoms.py +1 -2
  63. wisent/core/steering_methods/methods/caa.py +1 -1
  64. wisent/core/steering_methods/methods/hyperplane.py +74 -0
  65. wisent/core/steering_methods/methods/prism.py +1 -2
  66. wisent/core/steering_methods/methods/pulse.py +39 -8
  67. wisent/core/steering_methods/methods/titan.py +59 -14
  68. wisent/core/steering_methods/registry.py +52 -12
  69. wisent/core/steering_optimizer.py +15 -15
  70. wisent/core/trainers/steering_trainer.py +9 -18
  71. wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +19 -70
  72. wisent/scripts/run_quality_metrics_sweep.sh +22 -27
  73. wisent/tests/test_aggregation_geometry.py +236 -0
  74. wisent/tests/test_detector_accuracy.py +163 -0
  75. wisent/tests/test_geometry_exhaustive.py +1202 -0
  76. wisent/tests/visualize_geometry.py +255 -61
  77. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/METADATA +1 -1
  78. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/RECORD +82 -714
  79. wisent/core/activations/prompt_construction_strategy.py +0 -47
  80. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +0 -15
  81. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +0 -64
  82. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +0 -65
  83. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +0 -65
  84. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +0 -65
  85. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +0 -65
  86. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +0 -65
  87. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +0 -99
  88. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +0 -180
  89. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +0 -129
  90. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +0 -142
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +0 -155
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +0 -161
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +0 -107
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +0 -155
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +0 -155
  96. wisent/examples/scripts/results/benchmark_descriptions.json +0 -1244
  97. wisent/examples/scripts/results/benchmark_evaluation_methods.json +0 -66
  98. wisent/examples/scripts/results/benchmark_evaluator_mapping.json +0 -2781
  99. wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +0 -30536
  100. wisent/examples/scripts/results/benchmark_evaluators_clean.json +0 -469
  101. wisent/examples/scripts/results/benchmark_methods_summary.json +0 -260
  102. wisent/examples/scripts/results/benchmark_pair_creation_methods.json +0 -66
  103. wisent/examples/scripts/results/benchmark_pair_totals.json +0 -269
  104. wisent/examples/scripts/results/benchmark_tags.json +0 -917
  105. wisent/examples/scripts/results/benchmark_test_summary_nov4.json +0 -71
  106. wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +0 -150
  107. wisent/examples/scripts/results/failing_benchmarks.json +0 -946
  108. wisent/examples/scripts/results/failing_benchmarks_list.json +0 -41
  109. wisent/examples/scripts/results/failing_benchmarks_test_results.json +0 -945
  110. wisent/examples/scripts/results/missing_benchmark_tags.json +0 -341
  111. wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +0 -30
  112. wisent/examples/scripts/results/test_20_newsgroups_pairs.json +0 -8
  113. wisent/examples/scripts/results/test_AraDICE_evaluation.json +0 -51
  114. wisent/examples/scripts/results/test_AraDICE_pairs.json +0 -14
  115. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +0 -30
  116. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +0 -8
  117. wisent/examples/scripts/results/test_ArabCulture_evaluation.json +0 -51
  118. wisent/examples/scripts/results/test_ArabCulture_pairs.json +0 -14
  119. wisent/examples/scripts/results/test_Tag_evaluation.json +0 -30
  120. wisent/examples/scripts/results/test_Tag_pairs.json +0 -8
  121. wisent/examples/scripts/results/test_aclue_evaluation.json +0 -51
  122. wisent/examples/scripts/results/test_aclue_pairs.json +0 -14
  123. wisent/examples/scripts/results/test_acp_bench_evaluation.json +0 -51
  124. wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +0 -51
  125. wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +0 -14
  126. wisent/examples/scripts/results/test_acp_bench_pairs.json +0 -14
  127. wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +0 -51
  128. wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +0 -14
  129. wisent/examples/scripts/results/test_aexams_evaluation.json +0 -51
  130. wisent/examples/scripts/results/test_aexams_pairs.json +0 -14
  131. wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +0 -30
  132. wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +0 -8
  133. wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +0 -30
  134. wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +0 -8
  135. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +0 -30
  136. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +0 -8
  137. wisent/examples/scripts/results/test_ag_news_evaluation.json +0 -30
  138. wisent/examples/scripts/results/test_ag_news_pairs.json +0 -8
  139. wisent/examples/scripts/results/test_agieval_evaluation.json +0 -51
  140. wisent/examples/scripts/results/test_agieval_pairs.json +0 -14
  141. wisent/examples/scripts/results/test_aime2024_evaluation.json +0 -30
  142. wisent/examples/scripts/results/test_aime2024_pairs.json +0 -8
  143. wisent/examples/scripts/results/test_aime2025_evaluation.json +0 -30
  144. wisent/examples/scripts/results/test_aime2025_pairs.json +0 -8
  145. wisent/examples/scripts/results/test_aime_evaluation.json +0 -30
  146. wisent/examples/scripts/results/test_aime_pairs.json +0 -8
  147. wisent/examples/scripts/results/test_anagrams1_evaluation.json +0 -30
  148. wisent/examples/scripts/results/test_anagrams1_pairs.json +0 -8
  149. wisent/examples/scripts/results/test_anagrams2_evaluation.json +0 -30
  150. wisent/examples/scripts/results/test_anagrams2_pairs.json +0 -8
  151. wisent/examples/scripts/results/test_anli_evaluation.json +0 -30
  152. wisent/examples/scripts/results/test_anli_pairs.json +0 -8
  153. wisent/examples/scripts/results/test_apps_evaluation.json +0 -30
  154. wisent/examples/scripts/results/test_apps_pairs.json +0 -8
  155. wisent/examples/scripts/results/test_arabic_exams_evaluation.json +0 -30
  156. wisent/examples/scripts/results/test_arabic_exams_pairs.json +0 -8
  157. wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +0 -51
  158. wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +0 -14
  159. wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +0 -51
  160. wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +0 -14
  161. wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +0 -51
  162. wisent/examples/scripts/results/test_arabicmmlu_pairs.json +0 -14
  163. wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +0 -51
  164. wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +0 -14
  165. wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +0 -51
  166. wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +0 -14
  167. wisent/examples/scripts/results/test_arc_ar_evaluation.json +0 -30
  168. wisent/examples/scripts/results/test_arc_ar_pairs.json +0 -8
  169. wisent/examples/scripts/results/test_arc_challenge_evaluation.json +0 -30
  170. wisent/examples/scripts/results/test_arc_challenge_pairs.json +0 -8
  171. wisent/examples/scripts/results/test_arc_easy_evaluation.json +0 -30
  172. wisent/examples/scripts/results/test_arc_easy_pairs.json +0 -8
  173. wisent/examples/scripts/results/test_argument_topic_evaluation.json +0 -30
  174. wisent/examples/scripts/results/test_argument_topic_pairs.json +0 -8
  175. wisent/examples/scripts/results/test_arithmetic_evaluation.json +0 -51
  176. wisent/examples/scripts/results/test_arithmetic_pairs.json +0 -14
  177. wisent/examples/scripts/results/test_asdiv_evaluation.json +0 -30
  178. wisent/examples/scripts/results/test_asdiv_pairs.json +0 -8
  179. wisent/examples/scripts/results/test_assin_entailment_evaluation.json +0 -30
  180. wisent/examples/scripts/results/test_assin_entailment_pairs.json +0 -8
  181. wisent/examples/scripts/results/test_atis_evaluation.json +0 -30
  182. wisent/examples/scripts/results/test_atis_pairs.json +0 -8
  183. wisent/examples/scripts/results/test_babi_evaluation.json +0 -30
  184. wisent/examples/scripts/results/test_babi_pairs.json +0 -8
  185. wisent/examples/scripts/results/test_babilong_evaluation.json +0 -30
  186. wisent/examples/scripts/results/test_babilong_pairs.json +0 -8
  187. wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +0 -30
  188. wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +0 -8
  189. wisent/examples/scripts/results/test_banking77_evaluation.json +0 -30
  190. wisent/examples/scripts/results/test_banking77_pairs.json +0 -8
  191. wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +0 -14
  192. wisent/examples/scripts/results/test_basque-glue_evaluation.json +0 -51
  193. wisent/examples/scripts/results/test_basque-glue_pairs.json +0 -14
  194. wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +0 -51
  195. wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +0 -14
  196. wisent/examples/scripts/results/test_basque_bench_evaluation.json +0 -51
  197. wisent/examples/scripts/results/test_basque_bench_pairs.json +0 -14
  198. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +0 -51
  199. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +0 -14
  200. wisent/examples/scripts/results/test_basqueglue_evaluation.json +0 -51
  201. wisent/examples/scripts/results/test_basqueglue_pairs.json +0 -14
  202. wisent/examples/scripts/results/test_bbh_evaluation.json +0 -51
  203. wisent/examples/scripts/results/test_bbh_pairs.json +0 -14
  204. wisent/examples/scripts/results/test_bbq_evaluation.json +0 -30
  205. wisent/examples/scripts/results/test_bbq_pairs.json +0 -8
  206. wisent/examples/scripts/results/test_bec2016eu_evaluation.json +0 -51
  207. wisent/examples/scripts/results/test_bec2016eu_pairs.json +0 -14
  208. wisent/examples/scripts/results/test_belebele_evaluation.json +0 -51
  209. wisent/examples/scripts/results/test_belebele_pairs.json +0 -14
  210. wisent/examples/scripts/results/test_benchmarks_evaluation.json +0 -51
  211. wisent/examples/scripts/results/test_benchmarks_pairs.json +0 -14
  212. wisent/examples/scripts/results/test_bertaqa_evaluation.json +0 -51
  213. wisent/examples/scripts/results/test_bertaqa_pairs.json +0 -14
  214. wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +0 -30
  215. wisent/examples/scripts/results/test_bhtc_v2_pairs.json +0 -8
  216. wisent/examples/scripts/results/test_bigbench_evaluation.json +0 -51
  217. wisent/examples/scripts/results/test_bigbench_pairs.json +0 -14
  218. wisent/examples/scripts/results/test_blimp_evaluation.json +0 -51
  219. wisent/examples/scripts/results/test_blimp_pairs.json +0 -14
  220. wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +0 -30
  221. wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +0 -8
  222. wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +0 -30
  223. wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +0 -8
  224. wisent/examples/scripts/results/test_boolq_evaluation.json +0 -30
  225. wisent/examples/scripts/results/test_boolq_pairs.json +0 -8
  226. wisent/examples/scripts/results/test_c4_evaluation.json +0 -30
  227. wisent/examples/scripts/results/test_c4_pairs.json +0 -8
  228. wisent/examples/scripts/results/test_cabreu_evaluation.json +0 -30
  229. wisent/examples/scripts/results/test_cabreu_pairs.json +0 -8
  230. wisent/examples/scripts/results/test_careqa_evaluation.json +0 -30
  231. wisent/examples/scripts/results/test_careqa_pairs.json +0 -8
  232. wisent/examples/scripts/results/test_catalan_bench_evaluation.json +0 -51
  233. wisent/examples/scripts/results/test_catalan_bench_pairs.json +0 -14
  234. wisent/examples/scripts/results/test_catalanqa_evaluation.json +0 -30
  235. wisent/examples/scripts/results/test_catalanqa_pairs.json +0 -8
  236. wisent/examples/scripts/results/test_catcola_evaluation.json +0 -30
  237. wisent/examples/scripts/results/test_catcola_pairs.json +0 -8
  238. wisent/examples/scripts/results/test_cb_evaluation.json +0 -30
  239. wisent/examples/scripts/results/test_cb_pairs.json +0 -8
  240. wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +0 -51
  241. wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +0 -14
  242. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +0 -30
  243. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +0 -8
  244. wisent/examples/scripts/results/test_ceval_evaluation.json +0 -51
  245. wisent/examples/scripts/results/test_ceval_pairs.json +0 -14
  246. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +0 -51
  247. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +0 -14
  248. wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +0 -51
  249. wisent/examples/scripts/results/test_chain_of_thought_pairs.json +0 -14
  250. wisent/examples/scripts/results/test_chartqa_evaluation.json +0 -30
  251. wisent/examples/scripts/results/test_chartqa_pairs.json +0 -8
  252. wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +0 -30
  253. wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +0 -8
  254. wisent/examples/scripts/results/test_cmmlu_evaluation.json +0 -51
  255. wisent/examples/scripts/results/test_cmmlu_pairs.json +0 -14
  256. wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +0 -30
  257. wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +0 -8
  258. wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +0 -30
  259. wisent/examples/scripts/results/test_cocoteros_es_pairs.json +0 -8
  260. wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +0 -30
  261. wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +0 -8
  262. wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +0 -30
  263. wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +0 -8
  264. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +0 -30
  265. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +0 -8
  266. wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +0 -30
  267. wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +0 -8
  268. wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +0 -30
  269. wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +0 -8
  270. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +0 -30
  271. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +0 -8
  272. wisent/examples/scripts/results/test_coedit_gec_evaluation.json +0 -30
  273. wisent/examples/scripts/results/test_coedit_gec_pairs.json +0 -8
  274. wisent/examples/scripts/results/test_cola_evaluation.json +0 -30
  275. wisent/examples/scripts/results/test_cola_pairs.json +0 -8
  276. wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +0 -30
  277. wisent/examples/scripts/results/test_commonsense_qa_pairs.json +0 -8
  278. wisent/examples/scripts/results/test_conala_evaluation.json +0 -30
  279. wisent/examples/scripts/results/test_conala_pairs.json +0 -8
  280. wisent/examples/scripts/results/test_concode_evaluation.json +0 -30
  281. wisent/examples/scripts/results/test_concode_pairs.json +0 -8
  282. wisent/examples/scripts/results/test_copa_evaluation.json +0 -30
  283. wisent/examples/scripts/results/test_copa_pairs.json +0 -8
  284. wisent/examples/scripts/results/test_copal_id_evaluation.json +0 -30
  285. wisent/examples/scripts/results/test_copal_id_pairs.json +0 -8
  286. wisent/examples/scripts/results/test_coqa_evaluation.json +0 -30
  287. wisent/examples/scripts/results/test_coqa_pairs.json +0 -8
  288. wisent/examples/scripts/results/test_coqcat_evaluation.json +0 -30
  289. wisent/examples/scripts/results/test_coqcat_pairs.json +0 -8
  290. wisent/examples/scripts/results/test_crows_pairs_evaluation.json +0 -51
  291. wisent/examples/scripts/results/test_crows_pairs_pairs.json +0 -14
  292. wisent/examples/scripts/results/test_csatqa_evaluation.json +0 -51
  293. wisent/examples/scripts/results/test_csatqa_pairs.json +0 -14
  294. wisent/examples/scripts/results/test_cycle_letters_evaluation.json +0 -30
  295. wisent/examples/scripts/results/test_cycle_letters_pairs.json +0 -8
  296. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +0 -51
  297. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +0 -14
  298. wisent/examples/scripts/results/test_darija_bench_evaluation.json +0 -51
  299. wisent/examples/scripts/results/test_darija_bench_pairs.json +0 -14
  300. wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +0 -30
  301. wisent/examples/scripts/results/test_darijahellaswag_pairs.json +0 -8
  302. wisent/examples/scripts/results/test_darijammlu_evaluation.json +0 -51
  303. wisent/examples/scripts/results/test_darijammlu_pairs.json +0 -14
  304. wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +0 -30
  305. wisent/examples/scripts/results/test_dbpedia_14_pairs.json +0 -8
  306. wisent/examples/scripts/results/test_drop_evaluation.json +0 -30
  307. wisent/examples/scripts/results/test_drop_pairs.json +0 -8
  308. wisent/examples/scripts/results/test_ds1000_evaluation.json +0 -30
  309. wisent/examples/scripts/results/test_ds1000_pairs.json +0 -8
  310. wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +0 -30
  311. wisent/examples/scripts/results/test_egyhellaswag_pairs.json +0 -8
  312. wisent/examples/scripts/results/test_egymmlu_evaluation.json +0 -51
  313. wisent/examples/scripts/results/test_egymmlu_pairs.json +0 -14
  314. wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +0 -30
  315. wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +0 -8
  316. wisent/examples/scripts/results/test_eq_bench_evaluation.json +0 -30
  317. wisent/examples/scripts/results/test_eq_bench_pairs.json +0 -8
  318. wisent/examples/scripts/results/test_escola_evaluation.json +0 -30
  319. wisent/examples/scripts/results/test_escola_pairs.json +0 -8
  320. wisent/examples/scripts/results/test_ethics_cm_evaluation.json +0 -30
  321. wisent/examples/scripts/results/test_ethics_cm_pairs.json +0 -8
  322. wisent/examples/scripts/results/test_ethos_binary_evaluation.json +0 -30
  323. wisent/examples/scripts/results/test_ethos_binary_pairs.json +0 -8
  324. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +0 -51
  325. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +0 -14
  326. wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +0 -51
  327. wisent/examples/scripts/results/test_eus_exams_es_pairs.json +0 -14
  328. wisent/examples/scripts/results/test_eus_exams_evaluation.json +0 -51
  329. wisent/examples/scripts/results/test_eus_exams_pairs.json +0 -14
  330. wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +0 -30
  331. wisent/examples/scripts/results/test_eus_proficiency_pairs.json +0 -8
  332. wisent/examples/scripts/results/test_eus_reading_evaluation.json +0 -30
  333. wisent/examples/scripts/results/test_eus_reading_pairs.json +0 -8
  334. wisent/examples/scripts/results/test_eus_trivia_evaluation.json +0 -30
  335. wisent/examples/scripts/results/test_eus_trivia_pairs.json +0 -8
  336. wisent/examples/scripts/results/test_evalita-mp_evaluation.json +0 -51
  337. wisent/examples/scripts/results/test_evalita-mp_pairs.json +0 -14
  338. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
  339. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
  340. wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +0 -51
  341. wisent/examples/scripts/results/test_evalita_LLM_pairs.json +0 -14
  342. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +0 -51
  343. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +0 -14
  344. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +0 -30
  345. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +0 -8
  346. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +0 -51
  347. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +0 -14
  348. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
  349. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
  350. wisent/examples/scripts/results/test_fda_evaluation.json +0 -30
  351. wisent/examples/scripts/results/test_fda_pairs.json +0 -8
  352. wisent/examples/scripts/results/test_financial_tweets_evaluation.json +0 -30
  353. wisent/examples/scripts/results/test_financial_tweets_pairs.json +0 -8
  354. wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +0 -30
  355. wisent/examples/scripts/results/test_fld/test_fld_pairs.json +0 -8
  356. wisent/examples/scripts/results/test_fld_evaluation.json +0 -30
  357. wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +0 -30
  358. wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +0 -8
  359. wisent/examples/scripts/results/test_fld_pairs.json +0 -8
  360. wisent/examples/scripts/results/test_flores_evaluation.json +0 -51
  361. wisent/examples/scripts/results/test_flores_pairs.json +0 -14
  362. wisent/examples/scripts/results/test_freebase_evaluation.json +0 -30
  363. wisent/examples/scripts/results/test_freebase_pairs.json +0 -8
  364. wisent/examples/scripts/results/test_french_bench_evaluation.json +0 -51
  365. wisent/examples/scripts/results/test_french_bench_pairs.json +0 -14
  366. wisent/examples/scripts/results/test_galcola_evaluation.json +0 -30
  367. wisent/examples/scripts/results/test_galcola_pairs.json +0 -8
  368. wisent/examples/scripts/results/test_galician_bench_evaluation.json +0 -51
  369. wisent/examples/scripts/results/test_galician_bench_pairs.json +0 -14
  370. wisent/examples/scripts/results/test_glianorex_evaluation.json +0 -30
  371. wisent/examples/scripts/results/test_glianorex_pairs.json +0 -8
  372. wisent/examples/scripts/results/test_global_mmlu_evaluation.json +0 -51
  373. wisent/examples/scripts/results/test_global_mmlu_pairs.json +0 -14
  374. wisent/examples/scripts/results/test_glue_evaluation.json +0 -51
  375. wisent/examples/scripts/results/test_glue_pairs.json +0 -14
  376. wisent/examples/scripts/results/test_gpqa_evaluation.json +0 -51
  377. wisent/examples/scripts/results/test_gpqa_pairs.json +0 -14
  378. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +0 -51
  379. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +0 -14
  380. wisent/examples/scripts/results/test_groundcocoa_evaluation.json +0 -30
  381. wisent/examples/scripts/results/test_groundcocoa_pairs.json +0 -8
  382. wisent/examples/scripts/results/test_gsm8k_evaluation.json +0 -30
  383. wisent/examples/scripts/results/test_gsm8k_pairs.json +0 -8
  384. wisent/examples/scripts/results/test_haerae_evaluation.json +0 -51
  385. wisent/examples/scripts/results/test_haerae_pairs.json +0 -14
  386. wisent/examples/scripts/results/test_headqa_evaluation.json +0 -30
  387. wisent/examples/scripts/results/test_headqa_pairs.json +0 -8
  388. wisent/examples/scripts/results/test_hellaswag_evaluation.json +0 -30
  389. wisent/examples/scripts/results/test_hellaswag_pairs.json +0 -8
  390. wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +0 -51
  391. wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +0 -14
  392. wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +0 -51
  393. wisent/examples/scripts/results/test_hendrycks_math_pairs.json +0 -14
  394. wisent/examples/scripts/results/test_histoires_morales_evaluation.json +0 -30
  395. wisent/examples/scripts/results/test_histoires_morales_pairs.json +0 -8
  396. wisent/examples/scripts/results/test_hmmt_evaluation.json +0 -30
  397. wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +0 -30
  398. wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +0 -8
  399. wisent/examples/scripts/results/test_hmmt_pairs.json +0 -8
  400. wisent/examples/scripts/results/test_hrm8k_evaluation.json +0 -51
  401. wisent/examples/scripts/results/test_hrm8k_pairs.json +0 -14
  402. wisent/examples/scripts/results/test_humaneval_evaluation.json +0 -30
  403. wisent/examples/scripts/results/test_humaneval_pairs.json +0 -8
  404. wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +0 -30
  405. wisent/examples/scripts/results/test_humaneval_plus_pairs.json +0 -8
  406. wisent/examples/scripts/results/test_ifeval_evaluation.json +0 -30
  407. wisent/examples/scripts/results/test_ifeval_pairs.json +0 -8
  408. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +0 -30
  409. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +0 -8
  410. wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +0 -30
  411. wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +0 -8
  412. wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +0 -51
  413. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +0 -30
  414. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +0 -8
  415. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +0 -51
  416. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +0 -14
  417. wisent/examples/scripts/results/test_inverse_scaling_pairs.json +0 -14
  418. wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +0 -30
  419. wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +0 -8
  420. wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +0 -30
  421. wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +0 -8
  422. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +0 -30
  423. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +0 -8
  424. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +0 -30
  425. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +0 -8
  426. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +0 -30
  427. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +0 -8
  428. wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +0 -51
  429. wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +0 -14
  430. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +0 -30
  431. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +0 -8
  432. wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +0 -30
  433. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +0 -30
  434. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +0 -8
  435. wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +0 -8
  436. wisent/examples/scripts/results/test_kbl_evaluation.json +0 -51
  437. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +0 -51
  438. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +0 -14
  439. wisent/examples/scripts/results/test_kbl_pairs.json +0 -14
  440. wisent/examples/scripts/results/test_kmmlu_evaluation.json +0 -51
  441. wisent/examples/scripts/results/test_kmmlu_pairs.json +0 -14
  442. wisent/examples/scripts/results/test_kobest_evaluation.json +0 -51
  443. wisent/examples/scripts/results/test_kobest_pairs.json +0 -14
  444. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +0 -30
  445. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +0 -8
  446. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +0 -30
  447. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +0 -8
  448. wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +0 -30
  449. wisent/examples/scripts/results/test_kormedmcqa_pairs.json +0 -8
  450. wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +0 -30
  451. wisent/examples/scripts/results/test_lambada_cloze_pairs.json +0 -8
  452. wisent/examples/scripts/results/test_lambada_evaluation.json +0 -30
  453. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  454. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  455. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +0 -51
  456. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +0 -14
  457. wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +0 -51
  458. wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +0 -14
  459. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +0 -51
  460. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +0 -14
  461. wisent/examples/scripts/results/test_lambada_openai_evaluation.json +0 -30
  462. wisent/examples/scripts/results/test_lambada_openai_pairs.json +0 -8
  463. wisent/examples/scripts/results/test_lambada_pairs.json +0 -8
  464. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  465. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  466. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  467. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  468. wisent/examples/scripts/results/test_lambada_standard_evaluation.json +0 -30
  469. wisent/examples/scripts/results/test_lambada_standard_pairs.json +0 -8
  470. wisent/examples/scripts/results/test_leaderboard_evaluation.json +0 -51
  471. wisent/examples/scripts/results/test_leaderboard_pairs.json +0 -14
  472. wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +0 -51
  473. wisent/examples/scripts/results/test_libra/test_libra_pairs.json +0 -14
  474. wisent/examples/scripts/results/test_libra_evaluation.json +0 -51
  475. wisent/examples/scripts/results/test_libra_pairs.json +0 -14
  476. wisent/examples/scripts/results/test_lingoly_evaluation.json +0 -30
  477. wisent/examples/scripts/results/test_lingoly_pairs.json +0 -8
  478. wisent/examples/scripts/results/test_livecodebench_evaluation.json +0 -30
  479. wisent/examples/scripts/results/test_livecodebench_pairs.json +0 -8
  480. wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +0 -30
  481. wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +0 -8
  482. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +0 -30
  483. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +0 -8
  484. wisent/examples/scripts/results/test_llama_evaluation.json +0 -30
  485. wisent/examples/scripts/results/test_llama_pairs.json +0 -8
  486. wisent/examples/scripts/results/test_logiqa2_evaluation.json +0 -30
  487. wisent/examples/scripts/results/test_logiqa2_pairs.json +0 -8
  488. wisent/examples/scripts/results/test_logiqa_evaluation.json +0 -30
  489. wisent/examples/scripts/results/test_logiqa_pairs.json +0 -8
  490. wisent/examples/scripts/results/test_m_mmlu_evaluation.json +0 -51
  491. wisent/examples/scripts/results/test_m_mmlu_pairs.json +0 -14
  492. wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +0 -51
  493. wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +0 -14
  494. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +0 -30
  495. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +0 -8
  496. wisent/examples/scripts/results/test_mastermind_evaluation.json +0 -51
  497. wisent/examples/scripts/results/test_mastermind_pairs.json +0 -14
  498. wisent/examples/scripts/results/test_math500_evaluation.json +0 -30
  499. wisent/examples/scripts/results/test_math500_pairs.json +0 -8
  500. wisent/examples/scripts/results/test_math_evaluation.json +0 -30
  501. wisent/examples/scripts/results/test_math_pairs.json +0 -8
  502. wisent/examples/scripts/results/test_mathqa_evaluation.json +0 -30
  503. wisent/examples/scripts/results/test_mathqa_pairs.json +0 -8
  504. wisent/examples/scripts/results/test_mbpp_evaluation.json +0 -30
  505. wisent/examples/scripts/results/test_mbpp_pairs.json +0 -8
  506. wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +0 -30
  507. wisent/examples/scripts/results/test_mbpp_plus_pairs.json +0 -8
  508. wisent/examples/scripts/results/test_mc_taco_evaluation.json +0 -30
  509. wisent/examples/scripts/results/test_mc_taco_pairs.json +0 -8
  510. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +0 -51
  511. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +0 -14
  512. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +0 -30
  513. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +0 -8
  514. wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +0 -51
  515. wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +0 -14
  516. wisent/examples/scripts/results/test_meddialog_evaluation.json +0 -30
  517. wisent/examples/scripts/results/test_meddialog_pairs.json +0 -8
  518. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +0 -30
  519. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +0 -8
  520. wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +0 -30
  521. wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +0 -8
  522. wisent/examples/scripts/results/test_medmcqa_evaluation.json +0 -30
  523. wisent/examples/scripts/results/test_medmcqa_pairs.json +0 -8
  524. wisent/examples/scripts/results/test_medqa_evaluation.json +0 -30
  525. wisent/examples/scripts/results/test_medqa_pairs.json +0 -8
  526. wisent/examples/scripts/results/test_medtext_evaluation.json +0 -30
  527. wisent/examples/scripts/results/test_medtext_pairs.json +0 -8
  528. wisent/examples/scripts/results/test_mela_evaluation.json +0 -51
  529. wisent/examples/scripts/results/test_mela_pairs.json +0 -14
  530. wisent/examples/scripts/results/test_meqsum_evaluation.json +0 -30
  531. wisent/examples/scripts/results/test_meqsum_pairs.json +0 -8
  532. wisent/examples/scripts/results/test_mercury_evaluation.json +0 -30
  533. wisent/examples/scripts/results/test_mercury_pairs.json +0 -8
  534. wisent/examples/scripts/results/test_metabench_evaluation.json +0 -51
  535. wisent/examples/scripts/results/test_metabench_pairs.json +0 -14
  536. wisent/examples/scripts/results/test_mgsm_evaluation.json +0 -51
  537. wisent/examples/scripts/results/test_mgsm_pairs.json +0 -14
  538. wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +0 -30
  539. wisent/examples/scripts/results/test_mimic_repsum_pairs.json +0 -8
  540. wisent/examples/scripts/results/test_minerva_math_evaluation.json +0 -51
  541. wisent/examples/scripts/results/test_minerva_math_pairs.json +0 -14
  542. wisent/examples/scripts/results/test_mlqa_evaluation.json +0 -51
  543. wisent/examples/scripts/results/test_mlqa_pairs.json +0 -14
  544. wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +0 -51
  545. wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +0 -14
  546. wisent/examples/scripts/results/test_mmlu_evaluation.json +0 -51
  547. wisent/examples/scripts/results/test_mmlu_pairs.json +0 -14
  548. wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +0 -51
  549. wisent/examples/scripts/results/test_mmlu_pro_pairs.json +0 -14
  550. wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +0 -51
  551. wisent/examples/scripts/results/test_mmlu_prox_pairs.json +0 -14
  552. wisent/examples/scripts/results/test_mmlusr_evaluation.json +0 -30
  553. wisent/examples/scripts/results/test_mmlusr_pairs.json +0 -8
  554. wisent/examples/scripts/results/test_mmmu_evaluation.json +0 -51
  555. wisent/examples/scripts/results/test_mmmu_pairs.json +0 -14
  556. wisent/examples/scripts/results/test_mnli_evaluation.json +0 -30
  557. wisent/examples/scripts/results/test_mnli_pairs.json +0 -8
  558. wisent/examples/scripts/results/test_model_written_evals_evaluation.json +0 -51
  559. wisent/examples/scripts/results/test_model_written_evals_pairs.json +0 -14
  560. wisent/examples/scripts/results/test_moral_stories_evaluation.json +0 -30
  561. wisent/examples/scripts/results/test_moral_stories_pairs.json +0 -8
  562. wisent/examples/scripts/results/test_mts_dialog_evaluation.json +0 -30
  563. wisent/examples/scripts/results/test_mts_dialog_pairs.json +0 -8
  564. wisent/examples/scripts/results/test_multiblimp_evaluation.json +0 -51
  565. wisent/examples/scripts/results/test_multiblimp_pairs.json +0 -14
  566. wisent/examples/scripts/results/test_multimedqa_evaluation.json +0 -51
  567. wisent/examples/scripts/results/test_multimedqa_pairs.json +0 -14
  568. wisent/examples/scripts/results/test_multipl_e_evaluation.json +0 -30
  569. wisent/examples/scripts/results/test_multipl_e_pairs.json +0 -8
  570. wisent/examples/scripts/results/test_mutual_evaluation.json +0 -30
  571. wisent/examples/scripts/results/test_mutual_pairs.json +0 -8
  572. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +0 -30
  573. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +0 -8
  574. wisent/examples/scripts/results/test_noreval_evaluation.json +0 -51
  575. wisent/examples/scripts/results/test_noreval_pairs.json +0 -14
  576. wisent/examples/scripts/results/test_noticia_evaluation.json +0 -30
  577. wisent/examples/scripts/results/test_noticia_pairs.json +0 -8
  578. wisent/examples/scripts/results/test_nq_open_evaluation.json +0 -30
  579. wisent/examples/scripts/results/test_nq_open_pairs.json +0 -8
  580. wisent/examples/scripts/results/test_olaph_evaluation.json +0 -30
  581. wisent/examples/scripts/results/test_olaph_pairs.json +0 -8
  582. wisent/examples/scripts/results/test_openbookqa_evaluation.json +0 -30
  583. wisent/examples/scripts/results/test_openbookqa_pairs.json +0 -8
  584. wisent/examples/scripts/results/test_openllm_evaluation.json +0 -51
  585. wisent/examples/scripts/results/test_openllm_pairs.json +0 -14
  586. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +0 -30
  587. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +0 -8
  588. wisent/examples/scripts/results/test_paloma_evaluation.json +0 -51
  589. wisent/examples/scripts/results/test_paloma_pairs.json +0 -14
  590. wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +0 -30
  591. wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +0 -8
  592. wisent/examples/scripts/results/test_paws-x_evaluation.json +0 -51
  593. wisent/examples/scripts/results/test_paws-x_pairs.json +0 -14
  594. wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +0 -30
  595. wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +0 -8
  596. wisent/examples/scripts/results/test_penn_treebank_evaluation.json +0 -30
  597. wisent/examples/scripts/results/test_penn_treebank_pairs.json +0 -8
  598. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +0 -30
  599. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +0 -8
  600. wisent/examples/scripts/results/test_piqa_evaluation.json +0 -30
  601. wisent/examples/scripts/results/test_piqa_pairs.json +0 -8
  602. wisent/examples/scripts/results/test_polemo2_evaluation.json +0 -30
  603. wisent/examples/scripts/results/test_polemo2_pairs.json +0 -8
  604. wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +0 -30
  605. wisent/examples/scripts/results/test_polymath_en_high_pairs.json +0 -8
  606. wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +0 -30
  607. wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +0 -8
  608. wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +0 -30
  609. wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +0 -8
  610. wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +0 -30
  611. wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +0 -8
  612. wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +0 -51
  613. wisent/examples/scripts/results/test_portuguese_bench_pairs.json +0 -14
  614. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
  615. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
  616. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
  617. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
  618. wisent/examples/scripts/results/test_prost_evaluation.json +0 -30
  619. wisent/examples/scripts/results/test_prost_pairs.json +0 -8
  620. wisent/examples/scripts/results/test_ptb_evaluation.json +0 -30
  621. wisent/examples/scripts/results/test_ptb_pairs.json +0 -8
  622. wisent/examples/scripts/results/test_pubmedqa_evaluation.json +0 -30
  623. wisent/examples/scripts/results/test_pubmedqa_pairs.json +0 -8
  624. wisent/examples/scripts/results/test_pythia_evaluation.json +0 -51
  625. wisent/examples/scripts/results/test_pythia_pairs.json +0 -14
  626. wisent/examples/scripts/results/test_qa4mre_evaluation.json +0 -30
  627. wisent/examples/scripts/results/test_qa4mre_pairs.json +0 -8
  628. wisent/examples/scripts/results/test_qasper_evaluation.json +0 -30
  629. wisent/examples/scripts/results/test_qasper_pairs.json +0 -8
  630. wisent/examples/scripts/results/test_race_evaluation.json +0 -30
  631. wisent/examples/scripts/results/test_race_pairs.json +0 -8
  632. wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +0 -30
  633. wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +0 -8
  634. wisent/examples/scripts/results/test_recode_evaluation.json +0 -30
  635. wisent/examples/scripts/results/test_recode_pairs.json +0 -8
  636. wisent/examples/scripts/results/test_record_evaluation.json +0 -30
  637. wisent/examples/scripts/results/test_record_pairs.json +0 -8
  638. wisent/examples/scripts/results/test_ruler_evaluation.json +0 -51
  639. wisent/examples/scripts/results/test_ruler_pairs.json +0 -14
  640. wisent/examples/scripts/results/test_sciq_evaluation.json +0 -30
  641. wisent/examples/scripts/results/test_sciq_pairs.json +0 -8
  642. wisent/examples/scripts/results/test_score_evaluation.json +0 -51
  643. wisent/examples/scripts/results/test_score_pairs.json +0 -14
  644. wisent/examples/scripts/results/test_self_consistency_evaluation.json +0 -30
  645. wisent/examples/scripts/results/test_self_consistency_pairs.json +0 -8
  646. wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +0 -30
  647. wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +0 -8
  648. wisent/examples/scripts/results/test_siqa_evaluation.json +0 -30
  649. wisent/examples/scripts/results/test_siqa_pairs.json +0 -8
  650. wisent/examples/scripts/results/test_spanish_bench_evaluation.json +0 -51
  651. wisent/examples/scripts/results/test_spanish_bench_pairs.json +0 -14
  652. wisent/examples/scripts/results/test_squad2_evaluation.json +0 -30
  653. wisent/examples/scripts/results/test_squad2_pairs.json +0 -8
  654. wisent/examples/scripts/results/test_squadv2_evaluation.json +0 -30
  655. wisent/examples/scripts/results/test_squadv2_pairs.json +0 -8
  656. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +0 -30
  657. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +0 -8
  658. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +0 -51
  659. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +0 -14
  660. wisent/examples/scripts/results/test_swag_evaluation.json +0 -30
  661. wisent/examples/scripts/results/test_swag_pairs.json +0 -8
  662. wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +0 -51
  663. wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +0 -14
  664. wisent/examples/scripts/results/test_tmmluplus_evaluation.json +0 -51
  665. wisent/examples/scripts/results/test_tmmluplus_pairs.json +0 -14
  666. wisent/examples/scripts/results/test_translation_evaluation.json +0 -51
  667. wisent/examples/scripts/results/test_translation_pairs.json +0 -14
  668. wisent/examples/scripts/results/test_triviaqa_evaluation.json +0 -30
  669. wisent/examples/scripts/results/test_triviaqa_pairs.json +0 -8
  670. wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +0 -51
  671. wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +0 -14
  672. wisent/examples/scripts/results/test_truthfulqa_evaluation.json +0 -30
  673. wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +0 -30
  674. wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +0 -8
  675. wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +0 -30
  676. wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +0 -8
  677. wisent/examples/scripts/results/test_truthfulqa_pairs.json +0 -8
  678. wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +0 -51
  679. wisent/examples/scripts/results/test_turkishmmlu_pairs.json +0 -14
  680. wisent/examples/scripts/results/test_unfair_tos_evaluation.json +0 -30
  681. wisent/examples/scripts/results/test_unfair_tos_pairs.json +0 -8
  682. wisent/examples/scripts/results/test_unscramble_evaluation.json +0 -51
  683. wisent/examples/scripts/results/test_unscramble_pairs.json +0 -14
  684. wisent/examples/scripts/results/test_webqs_evaluation.json +0 -30
  685. wisent/examples/scripts/results/test_webqs_pairs.json +0 -8
  686. wisent/examples/scripts/results/test_wikitext103_evaluation.json +0 -30
  687. wisent/examples/scripts/results/test_wikitext103_pairs.json +0 -8
  688. wisent/examples/scripts/results/test_wikitext_evaluation.json +0 -30
  689. wisent/examples/scripts/results/test_wikitext_pairs.json +0 -8
  690. wisent/examples/scripts/results/test_winogender_evaluation.json +0 -51
  691. wisent/examples/scripts/results/test_winogender_pairs.json +0 -14
  692. wisent/examples/scripts/results/test_winogrande_evaluation.json +0 -30
  693. wisent/examples/scripts/results/test_winogrande_pairs.json +0 -8
  694. wisent/examples/scripts/results/test_wmdp_evaluation.json +0 -30
  695. wisent/examples/scripts/results/test_wmdp_pairs.json +0 -8
  696. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +0 -30
  697. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +0 -8
  698. wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +0 -30
  699. wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +0 -8
  700. wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +0 -30
  701. wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +0 -8
  702. wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +0 -30
  703. wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +0 -8
  704. wisent/examples/scripts/results/test_wsc273_evaluation.json +0 -30
  705. wisent/examples/scripts/results/test_wsc273_pairs.json +0 -8
  706. wisent/examples/scripts/results/test_xcopa_evaluation.json +0 -51
  707. wisent/examples/scripts/results/test_xcopa_pairs.json +0 -14
  708. wisent/examples/scripts/results/test_xnli_eu_evaluation.json +0 -30
  709. wisent/examples/scripts/results/test_xnli_eu_pairs.json +0 -8
  710. wisent/examples/scripts/results/test_xnli_evaluation.json +0 -51
  711. wisent/examples/scripts/results/test_xnli_pairs.json +0 -14
  712. wisent/examples/scripts/results/test_xquad_evaluation.json +0 -51
  713. wisent/examples/scripts/results/test_xquad_pairs.json +0 -14
  714. wisent/examples/scripts/results/test_xstorycloze_evaluation.json +0 -51
  715. wisent/examples/scripts/results/test_xstorycloze_pairs.json +0 -14
  716. wisent/examples/scripts/results/test_xsum_evaluation.json +0 -30
  717. wisent/examples/scripts/results/test_xsum_pairs.json +0 -8
  718. wisent/examples/scripts/results/test_xwinograd_evaluation.json +0 -51
  719. wisent/examples/scripts/results/test_xwinograd_pairs.json +0 -14
  720. wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +0 -30
  721. wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +0 -8
  722. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/WHEEL +0 -0
  723. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/entry_points.txt +0 -0
  724. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/licenses/LICENSE +0 -0
  725. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/top_level.txt +0 -0
@@ -1,180 +0,0 @@
- from __future__ import annotations
-
- from typing import Any
- from wisent.core.cli_logger import setup_logger
-
- from wisent.core.contrastive_pairs.core.pair import ContrastivePair
- from wisent.core.contrastive_pairs.huggingface_pairs.atoms import HuggingFaceBenchmarkExtractor
-
- __all__ = ["InstructHumanevalExtractor"]
-
- log = setup_logger(__name__)
-
-
- class InstructHumanevalExtractor(HuggingFaceBenchmarkExtractor):
-     """
-     Extractor for instruct_humaneval dataset.
-
-     Schema (openai_humaneval):
-     prompt: str (question/prompt)
-     canonical_solution: str (answer/solution)
-     """
-
-     def extract_contrastive_pairs(
-         self,
-         limit: int | None = None,
-     ) -> list[ContrastivePair]:
-         """
-         Build contrastive pairs from instruct_humaneval examples.
-
-         Args:
-             limit: Optional maximum number of pairs to produce.
-
-         Returns:
-             A list of ContrastivePair objects.
-         """
-         max_items = self._normalize_limit(limit)
-
-         # Load dataset
-         docs = self.load_dataset(
-             dataset_name="openai_humaneval",
-             split="test",
-             limit=max_items,
-         )
-
-         pairs: list[ContrastivePair] = []
-
-         log.info(f"Extracting contrastive pairs from {len(docs)} instruct_humaneval examples")
-
-         for doc in docs:
-             pair = self._extract_pair_from_doc(doc)
-             if pair is not None:
-                 pairs.append(pair)
-                 if max_items is not None and len(pairs) >= max_items:
-                     break
-
-         if not pairs:
-             log.warning("No valid instruct_humaneval pairs extracted")
-
-         return pairs
-
-     def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
-         """
-         Convert a single doc into a ContrastivePair.
-
-         Returns None when required fields are missing or malformed.
-         """
-         try:
-             question = doc.get("prompt", "").strip()
-             answer = doc.get("canonical_solution", "")
-             test_code = doc.get("test", "").strip()
-             entry_point = doc.get("entry_point", "")
-
-             if not question or not answer:
-                 log.debug("Skipping: missing question or answer")
-                 return None
-
-             # HumanEval canonical_solution is just the function body
-             # We need to combine it with the function signature from the prompt
-             # to create a complete executable function
-
-             # First corrupt the function body BEFORE building the complete function
-             # This ensures we corrupt the actual code, not the docstring
-             correct_body = str(answer).strip()
-             incorrect_body = self._create_incorrect_answer(correct_body)
-
-             # Build complete functions with correct and incorrect bodies
-             correct_answer = self._build_complete_function(question, correct_body)
-             incorrect_answer = self._build_complete_function(question, incorrect_body)
-
-             # Format the question
-             formatted_question = f"Question: {question}\n\nWhat is the answer?"
-
-             metadata = {
-                 "label": "instruct_humaneval",
-                 "source": "openai_humaneval",
-                 "test_code": test_code, # Include test code for execution
-                 "entry_point": entry_point,
-             }
-
-             return self._build_pair(
-                 question=formatted_question,
-                 correct=correct_answer,
-                 incorrect=incorrect_answer,
-                 metadata=metadata,
-             )
-
-         except Exception as exc:
-             log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
-             return None
-
-     def _build_complete_function(self, prompt: str, function_body: str) -> str:
-         """Build a complete function by combining prompt signature with function body.
-
-         HumanEval prompt contains:
-         imports (e.g., from typing import List)
-         function signature (e.g., def func(args) -> ReturnType:)
-         docstring
-
-         The canonical_solution contains only the function body (indented code).
-
-         We need to combine them to create an executable function.
-         """
-         import re
-
-         # Extract the function definition line (starts with 'def ')
-         # This includes everything from 'def' up to and including the colon
-         def_pattern = r'(def\s+\w+\([^)]*\)(?:\s*->\s*[^:]+)?:)'
-         def_match = re.search(def_pattern, prompt, re.MULTILINE)
-
-         if not def_match:
-             # Fallback: return body as-is (might fail, but better than crashing)
-             log.warning("Could not extract function signature from prompt")
-             return function_body
-
-         function_signature = def_match.group(1)
-
-         # Extract any imports before the function definition
-         imports_section = prompt[:def_match.start()].strip()
-
-         # Extract docstring if present (text between the function signature and body)
-         after_signature = prompt[def_match.end():].strip()
-         docstring = ""
-         if after_signature.startswith('"""') or after_signature.startswith("'''"):
-             # Find the closing quotes
-             quote_char = '"""' if after_signature.startswith('"""') else "'''"
-             end_quote = after_signature.find(quote_char, len(quote_char))
-             if end_quote != -1:
-                 docstring = " " + after_signature[:end_quote + len(quote_char)]
-
-         # Build the complete function
-         parts = []
-         if imports_section:
-             parts.append(imports_section)
-             parts.append("") # Blank line after imports
-
-         parts.append(function_signature)
-
-         if docstring:
-             parts.append(docstring)
-
-         # Add the function body
-         # The canonical_solution from HumanEval has its first line at column 0,
-         # but subsequent lines are properly indented. We need to add 4 spaces
-         # to the first line to match the indentation of other lines.
-         if function_body:
-             # Add indentation to the first line if it's not already indented
-             body_lines = function_body.split('\n')
-             if body_lines and body_lines[0] and not body_lines[0].startswith(' '):
-                 body_lines[0] = ' ' + body_lines[0]
-             parts.append('\n'.join(body_lines))
-
-         return "\n".join(parts)
-
-     def _create_incorrect_answer(self, correct: str) -> str:
-         """Create an incorrect answer by modifying the correct one."""
-         # For code, corrupt it slightly by adding a comment in the middle
-         if len(correct) > 10:
-             return correct[:len(correct)//2] + "\n # CORRUPTED" + correct[len(correct)//2:]
-         return f"{correct} # INCORRECT"
-
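For reference, the central trick in the extractor removed above is splicing the openai_humaneval prompt (imports, signature, docstring) back together with the canonical_solution body so that the positive example is a complete, executable function. A minimal standalone sketch of that recombination under the same field assumptions (build_complete_function here is illustrative, not a package API):

import re

def build_complete_function(prompt: str, body: str) -> str:
    # Bail out to the bare body when no "def name(args) -> ret:" line is present.
    if re.search(r"def\s+\w+\([^)]*\)(?:\s*->\s*[^:]+)?:", prompt) is None:
        return body
    lines = body.splitlines()
    # Some dumps store the first body line at column 0; indent it under the def.
    if lines and lines[0] and not lines[0].startswith(" "):
        lines[0] = "    " + lines[0]
    # The prompt already ends with the docstring, so appending the body completes the function.
    return prompt.rstrip("\n") + "\n" + "\n".join(lines)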
@@ -1,129 +0,0 @@
- from __future__ import annotations
-
- from typing import Any
- from wisent.core.cli_logger import setup_logger
-
- from wisent.core.contrastive_pairs.core.pair import ContrastivePair
- from wisent.core.contrastive_pairs.huggingface_pairs.atoms import HuggingFaceBenchmarkExtractor
-
- __all__ = ["InstructHumanEvalExtractor"]
-
- log = setup_logger(__name__)
-
- # Task names this extractor handles
- task_names = ("instruct_humaneval",)
-
- # Evaluator to use
- class InstructHumanEvalExtractor(HuggingFaceBenchmarkExtractor):
-     evaluator_name = "coding"
-     """
-     Extractor for InstructHumanEval coding benchmark.
-
-     InstructHumanEval schema (codeparrot/instructhumaneval):
-     - task_id: str (e.g., "HumanEval/0")
-     - prompt: str (function signature + docstring)
-     - canonical_solution: str (correct implementation)
-     - test: str (unit tests)
-     - entry_point: str (function name)
-     - instruction: str (natural language instruction)
-     - signature: str (function signature)
-     - docstring: str (function docstring)
-     - context: str (function signature without body)
-     """
-
-     def extract_contrastive_pairs(
-         self,
-         limit: int | None = None,
-     ) -> list[ContrastivePair]:
-         """
-         Build contrastive pairs from InstructHumanEval examples.
-
-         For coding tasks, we create pairs where:
-         - Positive: Correct implementation
-         - Negative: Incorrect implementation (placeholder)
-
-         Args:
-             limit: Optional maximum number of pairs to produce.
-
-         Returns:
-             A list of ContrastivePair objects.
-         """
-         max_items = self._normalize_limit(limit)
-
-         # Load InstructHumanEval dataset
-         docs = self.load_dataset(
-             dataset_name="codeparrot/instructhumaneval",
-             split="test",
-             limit=max_items,
-         )
-
-         pairs: list[ContrastivePair] = []
-
-         log.info(f"Extracting contrastive pairs from {len(docs)} InstructHumanEval examples")
-
-         for doc in docs:
-             pair = self._extract_pair_from_doc(doc)
-             if pair is not None:
-                 pairs.append(pair)
-                 if max_items is not None and len(pairs) >= max_items:
-                     break
-
-         if not pairs:
-             log.warning("No valid InstructHumanEval pairs extracted")
-
-         return pairs
-
-     def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
-         """
-         Convert a single InstructHumanEval doc into a ContrastivePair.
-
-         Returns None when required fields are missing or malformed.
-         """
-         try:
-             task_id = doc.get("task_id", "")
-             prompt = doc.get("prompt", "").strip()
-             canonical_solution = doc.get("canonical_solution", "").strip()
-             entry_point = doc.get("entry_point", "")
-             test_code = doc.get("test", "").strip()
-             instruction = doc.get("instruction", "").strip()
-
-             if not prompt or not canonical_solution:
-                 log.debug(f"Skipping {task_id}: missing prompt or solution")
-                 return None
-
-             # Construct the full correct implementation
-             # In InstructHumanEval, prompt contains signature+docstring, canonical_solution is body at column 0
-             # We need to indent the body by 4 spaces to be inside the function
-             lines = canonical_solution.split('\n')
-             indented_lines = [' ' + line if line and not line[0].isspace() else line for line in lines]
-             indented_solution = '\n'.join(indented_lines)
-             correct_code = prompt + "\n" + indented_solution
-
-             # Create an incorrect implementation (return incorrect value/type)
-             # For coding benchmarks, we create a simple buggy version
-             incorrect_code = prompt + "\n pass # Incorrect: empty implementation"
-
-             # Format the question using the instruction field if available
-             if instruction:
-                 question = f"{instruction}\n\n{prompt}"
-             else:
-                 question = f"Complete the following Python function:\n\n{prompt}"
-
-             metadata = {
-                 "label": "instruct_humaneval",
-                 "task_id": task_id,
-                 "entry_point": entry_point,
-                 "test_code": test_code, # Include test code for execution
-             }
-
-             return self._build_pair(
-                 question=question,
-                 correct=correct_code,
-                 incorrect=incorrect_code,
-                 metadata=metadata,
-             )
-
-         except Exception as exc:
-             log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
-             return None
-
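The codeparrot/instructhumaneval variant removed above takes a simpler route: keep the prompt verbatim, re-indent the solution body underneath it, and use an empty pass body as the negative. A rough sketch of that pairing step, assuming prompt holds the signature plus docstring and canonical_solution holds the body at column 0 (make_code_pair is a hypothetical helper name):

def make_code_pair(prompt: str, canonical_solution: str) -> tuple[str, str]:
    # Indent any body line that starts at column 0 so it sits inside the function.
    body = "\n".join(
        "    " + line if line and not line[0].isspace() else line
        for line in canonical_solution.split("\n")
    )
    correct = prompt + "\n" + body
    incorrect = prompt + "\n    pass  # placeholder negative: intentionally empty"
    return correct, incorrect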
@@ -1,142 +0,0 @@
- from __future__ import annotations
-
- from typing import Any
- from wisent.core.cli_logger import setup_logger
-
- from wisent.core.contrastive_pairs.core.pair import ContrastivePair
- from wisent.core.contrastive_pairs.huggingface_pairs.atoms import HuggingFaceBenchmarkExtractor
-
- __all__ = ["MBPPExtractor"]
-
- log = setup_logger(__name__)
-
- task_names = ("mbpp_plus",)
-
- class MBPPExtractor(HuggingFaceBenchmarkExtractor):
-     """
-     Extractor for MBPP (Mostly Basic Python Problems) dataset.
-
-     Schema (mbpp or google-research/mbpp):
-     text: str (problem description)
-     code: str (correct solution code)
-     test_list: list[str] (test cases)
-     """
-
-
-     evaluator_name = "coding"
-
-     def extract_contrastive_pairs(
-         self,
-         limit: int | None = None,
-     ) -> list[ContrastivePair]:
-         """
-         Build contrastive pairs from MBPP examples.
-
-         Args:
-             limit: Optional maximum number of pairs to produce.
-
-         Returns:
-             A list of ContrastivePair objects.
-         """
-         max_items = self._normalize_limit(limit)
-
-         # Load dataset
-         docs = self.load_dataset(
-             dataset_name="mbpp",
-             split="test",
-             limit=max_items,
-         )
-
-         pairs: list[ContrastivePair] = []
-
-         log.info(f"Extracting contrastive pairs from {len(docs)} MBPP examples")
-
-         for doc in docs:
-             pair = self._extract_pair_from_doc(doc)
-             if pair is not None:
-                 pairs.append(pair)
-                 if max_items is not None and len(pairs) >= max_items:
-                     break
-
-         if not pairs:
-             log.warning("No valid MBPP pairs extracted")
-
-         return pairs
-
-     def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
-         """
-         Convert a single doc into a ContrastivePair.
-
-         Returns None when required fields are missing or malformed.
-         """
-         try:
-             problem_text = doc.get("text", "").strip()
-             correct_code = doc.get("code", "").strip()
-             test_list = doc.get("test_list", [])
-             test_imports = doc.get("test_imports", [])
-
-             if not problem_text or not correct_code:
-                 log.debug("Skipping: missing problem text or code")
-                 return None
-
-             # Create incorrect code (add syntax error or logical error)
-             incorrect_code = self._create_incorrect_code(correct_code)
-
-             # Format the prompt
-             formatted_prompt = f"{problem_text}\n\nWrite a Python function to solve this problem."
-
-             # Build test code from test_list
-             # The test_list contains assertion strings like "assert function_name(...) == expected"
-             test_code = ""
-             entry_point = None
-             if test_list:
-                 # Extract function name from correct_code
-                 import re
-                 func_match = re.search(r'def\s+(\w+)\s*\(', correct_code)
-                 if func_match:
-                     entry_point = func_match.group(1)
-
-                 # Add imports if provided
-                 if test_imports:
-                     test_code += "\n".join(test_imports) + "\n\n"
-
-                 # Build test function that uses 'candidate' parameter
-                 test_code += "def check(candidate):\n"
-                 for test_case in test_list:
-                     # Replace function name with 'candidate' in assertions
-                     if entry_point:
-                         modified_test = test_case.replace(f"{entry_point}(", "candidate(")
-                     else:
-                         modified_test = test_case
-                     test_code += f" {modified_test}\n"
-
-             metadata = {
-                 "label": "mbpp",
-                 "source": "mbpp",
-                 "test_code": test_code if test_code else None,
-                 "entry_point": entry_point,
-             }
-
-             return self._build_pair(
-                 question=formatted_prompt,
-                 correct=correct_code,
-                 incorrect=incorrect_code,
-                 metadata=metadata,
-             )
-
-         except Exception as exc:
-             log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
-             return None
-
-     def _create_incorrect_code(self, correct: str) -> str:
-         """Create an incorrect version of the code."""
-         # Add a syntax error by removing closing parenthesis
-         if "(" in correct and ")" in correct:
-             # Find last closing paren and remove it
-             idx = correct.rfind(")")
-             if idx > 0:
-                 return correct[:idx] + correct[idx+1:]
-
-         # Fallback: add comment that breaks the code
-         return correct + "\n# Missing return statement"
-
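The MBPP extractor removed above turns each test_list assertion into a HumanEval-style check(candidate) harness by redirecting calls from the solution's function name to a generic candidate. A compact sketch of that conversion (build_check_source is illustrative, not part of the package):

import re

def build_check_source(code: str, test_list: list[str]) -> tuple[str | None, str]:
    # Recover the solution's function name so the assertions can be rewritten.
    match = re.search(r"def\s+(\w+)\s*\(", code)
    entry_point = match.group(1) if match else None
    lines = ["def check(candidate):"]
    for test_case in test_list:
        if entry_point:
            test_case = test_case.replace(f"{entry_point}(", "candidate(")
        lines.append(f"    {test_case}")
    return entry_point, "\n".join(lines) + "\n"

For example, an assertion such as assert add(2, 3) == 5 would become assert candidate(2, 3) == 5 inside check.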
@@ -1,155 +0,0 @@
- from __future__ import annotations
-
- from typing import Any, TYPE_CHECKING
-
- from wisent.core.contrastive_pairs.core.pair import ContrastivePair
- from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
- from wisent.core.contrastive_pairs.lm_eval_pairs.atoms import LMEvalBenchmarkExtractor
- from wisent.core.cli_logger import setup_logger, bind
-
- if TYPE_CHECKING:
-     from lm_eval.api.task import ConfigurableTask
-
-
- __all__ = ["AgievalExtractor"]
- _LOG = setup_logger(__name__)
-
- task_names = ("agieval",)
-
- class AgievalExtractor(LMEvalBenchmarkExtractor):
-     """Extractor for the Agieval benchmark."""
-
-
-     evaluator_name = "exact_match"
-     def extract_contrastive_pairs(
-         self,
-         lm_eval_task_data: ConfigurableTask,
-         limit: int | None = None,
-         preferred_doc: str | None = None,
-     ) -> list[ContrastivePair]:
-         """
-         Build contrastive pairs from Agieval docs.
-
-         Args:
-             lm_eval_task_data: lm-eval task instance for Agieval.
-             limit: Optional maximum number of pairs to produce.
-             preferred_doc: Optional preferred document source.
-
-         Returns:
-             A list of ContrastivePair objects.
-         """
-         log = bind(_LOG, task=getattr(lm_eval_task_data, "NAME", "unknown"))
-
-         max_items = self._normalize_limit(limit)
-         docs = self.load_docs(lm_eval_task_data, max_items, preferred_doc=preferred_doc)
-
-         pairs: list[ContrastivePair] = []
-
-         log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
-
-         for doc in docs:
-             pair = self._extract_pair_from_doc(doc)
-             if pair is not None:
-                 pairs.append(pair)
-                 if max_items is not None and len(pairs) >= max_items:
-                     break
-
-         if not pairs:
-             task_name = getattr(lm_eval_task_data, "NAME", type(lm_eval_task_data).__name__)
-             log.warning("No valid Agieval pairs extracted", extra={"task": task_name})
-
-         return pairs
-
-     def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
-         """
-         Convert a single Agieval doc into a ContrastivePair, if possible.
-         Returns None when required fields are missing or malformed.
-         """
-         log = bind(_LOG, doc_id=doc.get("id", "unknown"))
-
-         try:
-             # Try multiple possible schema formats
-             question = None
-             choices = None
-             answer_idx = None
-
-             # Format 1: question + choices + answer
-             if "question" in doc and "choices" in doc:
-                 question = str(doc.get("question", "")).strip()
-                 choices_data = doc.get("choices", {})
-                 if isinstance(choices_data, dict):
-                     choices = choices_data.get("text", [])
-                 elif isinstance(choices_data, list):
-                     choices = choices_data
-                 answer = doc.get("answer", doc.get("answerKey", ""))
-                 if isinstance(answer, str) and len(answer) == 1 and answer.isalpha():
-                     answer_idx = ord(answer.upper()) - ord('A')
-                 else:
-                     answer_idx = int(answer) if answer else 0
-
-             # Format 2: instruction + option_a/b/c/d + answer (MMMLU style)
-             elif "instruction" in doc and "option_a" in doc:
-                 question = str(doc.get("instruction", "")).strip()
-                 choices = [
-                     str(doc.get("option_a", "")).strip(),
-                     str(doc.get("option_b", "")).strip(),
-                     str(doc.get("option_c", "")).strip(),
-                     str(doc.get("option_d", "")).strip(),
-                 ]
-                 choices = [c for c in choices if c]
-                 answer = doc.get("answer", "A")
-                 answer_idx = ord(str(answer).upper()) - ord('A')
-
-             # Format 3: query/prompt + answer
-             elif "query" in doc or "prompt" in doc:
-                 question = str(doc.get("query", doc.get("prompt", ""))).strip()
-                 # For open-ended questions, use target as correct answer
-                 correct_answer = str(doc.get("target", doc.get("answer", ""))).strip()
-                 if correct_answer:
-                     metadata = {"label": "agieval"}
-                     return self._build_pair(
-                         question=f"Question: {question}",
-                         correct=correct_answer,
-                         incorrect="incorrect answer",
-                         metadata=metadata,
-                     )
-                 return None
-
-             if not question or not choices or answer_idx is None or not (0 <= answer_idx < len(choices)):
-                 log.debug(
-                     "Skipping doc due to missing/invalid fields",
-                     extra={"doc": doc},
-                 )
-                 return None
-
-             correct = choices[answer_idx]
-             incorrect_idx = (answer_idx + 1) % len(choices)
-             incorrect = choices[incorrect_idx]
-
-             formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
-             metadata = {
-                 "label": "agieval",
-             }
-
-             return self._build_pair(
-                 question=formatted_question,
-                 correct=correct,
-                 incorrect=incorrect,
-                 metadata=metadata,
-             )
-
-         except Exception as exc:
-             log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
-             return None
-
-     @staticmethod
-     def _build_pair(
-         question: str,
-         correct: str,
-         incorrect: str,
-         metadata: dict[str, Any] | None = None,
-     ) -> ContrastivePair:
-         positive_response = PositiveResponse(model_response=correct)
-         negative_response = NegativeResponse(model_response=incorrect)
-         return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))
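The multiple-choice handling in the removed AGIEval extractor boils down to two steps: map a letter answer key to an index, then take a neighbouring option as the negative. A compact sketch of that selection logic (pick_pair is a hypothetical helper, not a package function):

def pick_pair(choices: list[str], answer: str | int) -> tuple[str, str] | None:
    # Accept either a letter key ("B") or a numeric index.
    if isinstance(answer, str) and len(answer) == 1 and answer.isalpha():
        idx = ord(answer.upper()) - ord("A")
    else:
        idx = int(answer)
    if not choices or not 0 <= idx < len(choices):
        return None
    correct = choices[idx]
    incorrect = choices[(idx + 1) % len(choices)]  # any other option serves as the negative
    return correct, incorrect

So pick_pair(["Paris", "Rome", "Madrid", "Berlin"], "A") returns ("Paris", "Rome").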