PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl - Mend

wisent 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (725) hide show

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py CHANGED Viewed

@@ -6,45 +6,52 @@ from wisent.core.cli_logger import setup_logger
 from wisent.core.contrastive_pairs.core.pair import ContrastivePair
 from wisent.core.contrastive_pairs.huggingface_pairs.atoms import HuggingFaceBenchmarkExtractor
-__all__ = ["HumanEvalExtractor"]
+__all__ = [
+    "HumanEvalUnifiedExtractor",
+    "HumanEvalExtractor",
+    "HumanEval64Extractor",
+    "HumanEvalPlusExtractor",
+    "HumanEvalInstructExtractor",
+    "HumanEval64InstructExtractor",
+]
 log = setup_logger(__name__)
-task_names = ("humaneval_64_instruct", "humaneval_instruct", "humaneval_plus")
+# Tasks supported by this extractor
+task_names = (
+    "humaneval",
+    "humaneval_64",
+    "humaneval_plus",
+    "humaneval_instruct",
+    "humaneval_64_instruct",
+)
-class HumanEvalExtractor(HuggingFaceBenchmarkExtractor):
-    evaluator_name = "coding"
+class HumanEvalUnifiedExtractor(HuggingFaceBenchmarkExtractor):
     """
-    Extractor for HumanEval and HumanEval+ coding benchmarks.
-    HumanEval schema (openai_humaneval):
-        - task_id: str (e.g., "HumanEval/0")
-        - prompt: str (function signature + docstring)
-        - canonical_solution: str (correct implementation)
-        - test: str (unit tests)
-        - entry_point: str (function name)
+    Unified extractor for all HumanEval variants.
+    Supports:
+        - humaneval, humaneval_64, humaneval_plus: raw prompt format
+        - humaneval_instruct, humaneval_64_instruct: instruction format
+    Dataset: openai_humaneval (164 Python problems)
     """
+    evaluator_name = "coding"
+    # Override in subclasses
+    task_name = "humaneval"
+    is_instruct = False
     def extract_contrastive_pairs(
         self,
         limit: int | None = None,
+        **kwargs,
     ) -> list[ContrastivePair]:
-        """
-        Build contrastive pairs from HumanEval examples.
-        For coding tasks, we create pairs where:
-        - Positive: Correct implementation
-        - Negative: Incorrect implementation (generated or placeholder)
-        Args:
-            limit: Optional maximum number of pairs to produce.
-        Returns:
-            A list of ContrastivePair objects.
-        """
+        """Build contrastive pairs from HumanEval examples."""
         max_items = self._normalize_limit(limit)
-        # Load HumanEval dataset
         docs = self.load_dataset(
             dataset_name="openai_humaneval",
             split="test",
@@ -52,8 +59,7 @@ class HumanEvalExtractor(HuggingFaceBenchmarkExtractor):
         )
         pairs: list[ContrastivePair] = []
-        log.info(f"Extracting contrastive pairs from {len(docs)} HumanEval examples")
+        log.info(f"Extracting {self.task_name} pairs from {len(docs)} examples")
         for doc in docs:
             pair = self._extract_pair_from_doc(doc)
@@ -63,53 +69,41 @@ class HumanEvalExtractor(HuggingFaceBenchmarkExtractor):
                     break
         if not pairs:
-            log.warning("No valid HumanEval pairs extracted")
+            log.warning(f"No valid {self.task_name} pairs extracted")
         return pairs
     def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
-        """
-        Convert a single HumanEval doc into a ContrastivePair.
-        Returns None when required fields are missing or malformed.
-        """
+        """Convert a single HumanEval doc into a ContrastivePair."""
         try:
-            task_id = doc.get("task_id", "")
-            prompt = doc.get("prompt", "").strip()
-            canonical_solution = doc.get("canonical_solution", "").strip()
+            prompt = doc.get("prompt", "")
+            body = doc.get("canonical_solution", "")
             entry_point = doc.get("entry_point", "")
             test_code = doc.get("test", "").strip()
-            if not prompt or not canonical_solution:
-                log.debug(f"Skipping {task_id}: missing prompt or solution")
+            if not prompt or not body:
+                log.debug("Skipping: missing prompt or solution")
                 return None
-            # Construct the full correct implementation
-            # In HumanEval, prompt contains signature+docstring, canonical_solution is body at column 0
-            # We need to indent the body by 4 spaces to be inside the function
-            lines = canonical_solution.split('\n')
-            indented_lines = ['    ' + line if line and not line[0].isspace() else line for line in lines]
-            indented_solution = '\n'.join(indented_lines)
-            correct_code = prompt + "\n" + indented_solution
+            # Build complete function: prompt (signature + docstring) + body
+            correct = prompt + body
-            # Create an incorrect implementation (return incorrect value/type)
-            # For coding benchmarks, we create a simple buggy version
-            incorrect_code = prompt + "\n    pass  # Incorrect: empty implementation"
+            # Build incorrect answer: prompt + pass
+            incorrect = self._create_incorrect_answer(prompt)
-            # Format the question to include the task
-            question = f"Complete the following Python function:\n\n{prompt}"
+            # Format question based on task type just like lm eval harness does it
+            formatted_question = self._format_question(prompt)
             metadata = {
-                "label": "humaneval",
-                "task_id": task_id,
+                "label": self.task_name,
                 "entry_point": entry_point,
-                "test_code": test_code,  # Include test code for execution
+                "test_code": test_code,
             }
             return self._build_pair(
-                question=question,
-                correct=correct_code,
-                incorrect=incorrect_code,
+                question=formatted_question,
+                correct=correct,
+                incorrect=incorrect,
                 metadata=metadata,
             )
@@ -117,3 +111,50 @@ class HumanEvalExtractor(HuggingFaceBenchmarkExtractor):
             log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
             return None
+    def _format_question(self, prompt: str) -> str:
+        """Format the question based on task type."""
+        if self.is_instruct:
+            # lm_eval instruction format
+            return (
+                "Write me a solution to the following problem and make sure "
+                "that it passes the tests:\n"
+                f"```python\n{prompt}\n```"
+            )
+        else:
+            # Raw prompt for base models
+            return prompt
+    def _create_incorrect_answer(self, correct: str) -> str:
+        """Create an incorrect answer by modifying the correct one."""
+        # For code, corrupt it slightly
+        if len(correct) > 10:
+            return correct[:len(correct)//2] + "# CORRUPTED" + correct[len(correct)//2:]
+        return f"{correct} # INCORRECT"
+# Subclasses for each task variant
+class HumanEvalExtractor(HumanEvalUnifiedExtractor):
+    task_name = "humaneval"
+    is_instruct = False
+class HumanEval64Extractor(HumanEvalUnifiedExtractor):
+    task_name = "humaneval_64"
+    is_instruct = False
+class HumanEvalPlusExtractor(HumanEvalUnifiedExtractor):
+    task_name = "humaneval_plus"
+    is_instruct = False
+class HumanEvalInstructExtractor(HumanEvalUnifiedExtractor):
+    task_name = "humaneval_instruct"
+    is_instruct = True
+class HumanEval64InstructExtractor(HumanEvalUnifiedExtractor):
+    task_name = "humaneval_64_instruct"
+    is_instruct = True

wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py CHANGED Viewed

@@ -1,4 +1,4 @@
-"""Code x glue group task manifest."""
+"""Code2Text group task manifest."""
 from __future__ import annotations
@@ -6,11 +6,11 @@ BASE_IMPORT = "wisent.core.contrastive_pairs.lm_eval_pairs.lm_task_extractors."
 # Code2Text tasks in lm-eval-harness
 CODE_X_GLUE_TASKS = {
-    "code_x_glue": f"{BASE_IMPORT}code_x_glue:CodeXGlueExtractor",
-    "code2text_go": f"{BASE_IMPORT}code_x_glue:CodeXGlueExtractor",
-    "code2text_java": f"{BASE_IMPORT}code_x_glue:CodeXGlueExtractor",
-    "code2text_javascript": f"{BASE_IMPORT}code_x_glue:CodeXGlueExtractor",
-    "code2text_php": f"{BASE_IMPORT}code_x_glue:CodeXGlueExtractor",
-    "code2text_python": f"{BASE_IMPORT}code_x_glue:CodeXGlueExtractor",
-    "code2text_ruby": f"{BASE_IMPORT}code_x_glue:CodeXGlueExtractor",
+    "code2text": f"{BASE_IMPORT}code_x_glue:Code2TextExtractor",
+    "code2text_go": f"{BASE_IMPORT}code_x_glue:Code2TextExtractor",
+    "code2text_java": f"{BASE_IMPORT}code_x_glue:Code2TextExtractor",
+    "code2text_javascript": f"{BASE_IMPORT}code_x_glue:Code2TextExtractor",
+    "code2text_php": f"{BASE_IMPORT}code_x_glue:Code2TextExtractor",
+    "code2text_python": f"{BASE_IMPORT}code_x_glue:Code2TextExtractor",
+    "code2text_ruby": f"{BASE_IMPORT}code_x_glue:Code2TextExtractor",
 }

wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py CHANGED Viewed

@@ -5,5 +5,5 @@ from __future__ import annotations
 BASE_IMPORT = "wisent.core.contrastive_pairs.lm_eval_pairs.lm_task_extractors."
 FREEBASE_TASKS = {
-    "freebase": f"{BASE_IMPORT}freebase:FreebaseExtractor",
+    "webqs": f"{BASE_IMPORT}webqs:WebQSExtractor",
 }

wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py CHANGED Viewed

@@ -116,7 +116,6 @@ EXTRACTORS.update({
     # acp_bench subtasks (bool and mcq use log_likelihoods)
     # acp_bench_hard _gen subtasks (use generation evaluator)
     "aexams": f"{base_import}aexams:AexamsExtractor",
-    "agieval": f"{base_import}agieval:AgievalExtractor",
     "arabic_exams": f"{base_import}arabic_exams:ArabicExamsExtractor",
     "arabic_leaderboard_complete": f"{base_import}arabic_leaderboard_complete:ArabicLeaderboardCompleteExtractor",
     "arabic_leaderboard_light": f"{base_import}arabic_leaderboard_light:ArabicLeaderboardLightExtractor",
@@ -140,8 +139,6 @@ EXTRACTORS.update({
     "chartqa": f"{base_import}chartqa:ChartqaExtractor",
     "click": f"{base_import}click:ClickExtractor",
     "cmmlu": f"{base_import}cmmlu:CmmluExtractor",
-    "code_x_glue": f"{base_import}code_x_glue:CodeXGlueExtractor",
-    "codexglue_code2text": f"{base_import}code_x_glue:CodeXGlueExtractor",
     "commonsense_qa": f"{base_import}commonsense_qa:CommonsenseQAExtractor",
     "copa": f"{base_import}copa:COPAExtractor",
     "copal_id": f"{base_import}copal_id:CopalIdExtractor",
@@ -164,6 +161,7 @@ EXTRACTORS.update({
     "evalita-sp_sum_task_fp_p2": f"{base_import}evalita_sp:EvalitaSpExtractor",
     "fda": f"{base_import}fda:FdaExtractor",
     "fld": f"{base_import}fld:FldExtractor",
+    "freebase": f"{base_import}webqs:WebQSExtractor",
     "french_bench": f"{base_import}french_bench:FrenchBenchExtractor",
     "galician_bench": f"{base_import}galician_bench:GalicianBenchExtractor",
     "global_mmlu": f"{base_import}global_mmlu:GlobalMmluExtractor",
@@ -211,6 +209,9 @@ EXTRACTORS.update({
     "mastermind_46_easy": f"{base_import}mastermind:MastermindExtractor",
     "mastermind_46_hard": f"{base_import}mastermind:MastermindExtractor",
     "mbpp": f"{base_import}mbpp:MBPPExtractor",
+    "mbpp_instruct": f"{base_import}mbpp:MBPPExtractor",
+    "mbpp_plus": f"{base_import}mbpp:MBPPExtractor",
+    "mbpp_plus_instruct": f"{base_import}mbpp:MBPPExtractor",
     "mc-taco": f"{base_import}mc-taco:MCTACOExtractor",
     "meddialog": f"{base_import}meddialog:MeddialogExtractor",
     "meddialog_qsumm": f"{base_import}meddialog:MeddialogExtractor",
@@ -265,7 +266,9 @@ EXTRACTORS.update({
     "niah_multiquery": f"{base_import}ruler:RulerExtractor",
     "niah_multivalue": f"{base_import}ruler:RulerExtractor",
     "score": f"{base_import}score:ScoreExtractor",
-    "option_order_robustness_agieval_aqua_rat": f"{base_import}score:ScoreExtractor",
+    "prompt_robustness_agieval_aqua_rat": f"{base_import}agieval_aqua_rat:AgievalAquaRatExtractor",
+    "option_order_robustness_agieval_aqua_rat": f"{base_import}agieval_aqua_rat:AgievalAquaRatExtractor",
+    "non_greedy_robustness_agieval_aqua_rat": f"{base_import}agieval_aqua_rat:AgievalAquaRatExtractor",
     "option_order_robustness_agieval_logiqa_en": f"{base_import}score:ScoreExtractor",
     "option_order_robustness_agieval_lsat_ar": f"{base_import}score:ScoreExtractor",
     "option_order_robustness_agieval_lsat_lr": f"{base_import}score:ScoreExtractor",
@@ -380,7 +383,7 @@ EXTRACTORS.update({
     "phrases_va": f"{base_import}phrases:PhrasesExtractor",
     "phrases_va-ca": f"{base_import}phrases:PhrasesExtractor",
     "phrases_va-es": f"{base_import}phrases:PhrasesExtractor",
-    "code2text": f"{base_import}code2text:Code2textExtractor",
+    "code2text": f"{base_import}code_x_glue:Code2TextExtractor",
     "ethics": f"{base_import}ethics:EthicsExtractor",
     "cabreu": f"{base_import}cabreu:CabreuExtractor",
     "sycophancy": f"{base_import}sycophancy:SycophancyExtractor",

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval_aqua_rat.py ADDED Viewed

@@ -0,0 +1,129 @@
+from __future__ import annotations
+from typing import Any, TYPE_CHECKING
+from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
+from wisent.core.contrastive_pairs.lm_eval_pairs.atoms import LMEvalBenchmarkExtractor
+from wisent.core.cli_logger import setup_logger, bind
+if TYPE_CHECKING:
+    from lm_eval.api.task import ConfigurableTask
+__all__ = ["AgievalAquaRatExtractor"]
+_LOG = setup_logger(__name__)
+task_names = ("prompt_robustness_agieval_aqua_rat", "option_order_robustness_agieval_aqua_rat", "non_greedy_robustness_agieval_aqua_rat")
+class AgievalAquaRatExtractor(LMEvalBenchmarkExtractor):
+    """Extractor for AGIEval AQUA-RAT robustness benchmarks (prompt, option order, non-greedy)."""
+    evaluator_name = "generation"
+    def extract_contrastive_pairs(
+        self,
+        lm_eval_task_data: ConfigurableTask,
+        limit: int | None = None,
+    ) -> list[ContrastivePair]:
+        """
+        Build contrastive pairs from PIQA docs.
+        PIQA schema:
+            - question: str
+            - choices: list
+            - gold: list
+        Args:
+            lm_eval_task_data: lm-eval task instance for prompt_robustness_agieval_aqua_rat.
+            limit: Optional maximum number of pairs to produce.
+        Returns:
+            A list of ContrastivePair objects.
+        """
+        log = bind(_LOG, task=getattr(lm_eval_task_data, "NAME", "unknown"))
+        max_items = self._normalize_limit(limit)
+        docs = self.load_docs(lm_eval_task_data, max_items)
+        pairs: list[ContrastivePair] = []
+        log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
+        for doc in docs:
+            pair = self._extract_pair_from_doc(doc)
+            if pair is not None:
+                pairs.append(pair)
+                if max_items is not None and len(pairs) >= max_items:
+                    break
+        if not pairs:
+            task_name = getattr(lm_eval_task_data, "NAME", type(lm_eval_task_data).__name__)
+            log.warning("No valid prompt_robustness_agieval_aqua_rat pairs extracted", extra={"task": task_name})
+        return pairs
+    def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
+        """
+        Convert a single prompt_robustness_agieval_aqua_rat doc into a ContrastivePair, if possible.
+        Returns None when required fields are missing or malformed.
+        """
+        log = bind(_LOG, doc_id=doc.get("id", "unknown"))
+        try:
+            question = doc.get("question", "").strip()
+            choices = doc.get("choices", [])
+            options = doc.get("options", [])
+            gold = doc.get("gold", [])
+            if not question or not choices or not options or not gold:
+                log.debug(
+                    "Skipping doc due to missing/invalid fields",
+                    extra={"doc": doc},
+                )
+                return None
+            # Use letter answers (A, B, C, D, E)
+            incorrect_map = {"A": "B", "B": "C", "C": "D", "D": "E", "E": "A"}
+            correct_letter = doc.get("answer", "")
+            incorrect_letter = incorrect_map.get(correct_letter, "B")
+            correct = f"The best answer is {correct_letter}"
+            incorrect = f"The best answer is {incorrect_letter}"
+            choices_str = "\n".join(choices)
+            formatted_question = f"""{question}
+{choices_str}
+Examine the question and choose the correct answer from the options 'A', 'B', 'C', 'D' or 'E'. End your answer with:
+The best answer is [the_answer_letter].
+where the [the_answer_letter] is a letter from A to E."""
+            metadata = {
+                "label": "agieval_aqua_rat",
+            }
+            return self._build_pair(
+                question=formatted_question,
+                correct=correct,
+                incorrect=incorrect,
+                metadata=metadata,
+            )
+        except Exception as exc:
+            log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
+            return None
+    @staticmethod
+    def _build_pair(
+        question: str,
+        correct: str,
+        incorrect: str,
+        metadata: dict[str, Any] | None = None,
+    ) -> ContrastivePair:
+        positive_response = PositiveResponse(model_response=correct)
+        negative_response = NegativeResponse(model_response=incorrect)
+        return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py CHANGED Viewed

@@ -11,16 +11,21 @@ if TYPE_CHECKING:
     from lm_eval.api.task import ConfigurableTask
-__all__ = ["CodeXGlueExtractor"]
+__all__ = ["Code2TextExtractor"]
 _LOG = setup_logger(__name__)
 task_names = (
-    "code_x_glue",
-    "codexglue_code2text",
+    "code2text",
+    "code2text_go",
+    "code2text_java",
+    "code2text_javascript",
+    "code2text_php",
+    "code2text_python",
+    "code2text_ruby"
 )
-class CodeXGlueExtractor(LMEvalBenchmarkExtractor):
-    """Extractor for the Code X Glue benchmark - parent task for code2text subtasks."""
+class Code2TextExtractor(LMEvalBenchmarkExtractor):
+    """Extractor for code2text tasks - generates documentation from code."""
     evaluator_name = "generation"
@@ -87,7 +92,7 @@ class CodeXGlueExtractor(LMEvalBenchmarkExtractor):
                 prompt = f"Generate documentation for this code:\n\n{code}\n\nDocumentation:"
-                metadata = {"label": "code_x_glue"}
+                metadata = {"label": "code2text"}
                 return self._build_pair(
                     question=prompt,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py CHANGED Viewed

@@ -33,7 +33,7 @@ class GSM8KExtractor(LMEvalBenchmarkExtractor):
     """Extractor for the GSM8K benchmark - grade school math word problems."""
-    evaluator_name = "log_likelihoods"
+    evaluator_name = "generation"
     def extract_contrastive_pairs(
         self,
         lm_eval_task_data: ConfigurableTask,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import re
 from typing import Any, TYPE_CHECKING
 from wisent.core.contrastive_pairs.core.pair import ContrastivePair
@@ -16,14 +17,16 @@ _LOG = setup_logger(__name__)
 task_names = (
     "mbpp",
+    "mbpp_instruct",
     "mbpp_plus",
+    "mbpp_plus_instruct",
 )
 class MBPPExtractor(LMEvalBenchmarkExtractor):
     """Extractor for the MBPP (Mostly Basic Python Problems) benchmark."""
-    evaluator_name = "exact_match"
+    evaluator_name = "coding"
     def extract_contrastive_pairs(
         self,
         lm_eval_task_data: ConfigurableTask,
@@ -57,8 +60,9 @@ class MBPPExtractor(LMEvalBenchmarkExtractor):
         log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
+        task_name = getattr(lm_eval_task_data, "NAME", "mbpp")
         for doc in docs:
-            pair = self._extract_pair_from_doc(doc)
+            pair = self._extract_pair_from_doc(doc, task_name)
             if pair is not None:
                 pairs.append(pair)
                 if max_items is not None and len(pairs) >= max_items:
@@ -70,7 +74,7 @@ class MBPPExtractor(LMEvalBenchmarkExtractor):
         return pairs
-    def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
+    def _extract_pair_from_doc(self, doc: dict[str, Any], task_name: str) -> ContrastivePair | None:
         """
         Convert a single MBPP doc into a ContrastivePair, if possible.
         Returns None when required fields are missing or malformed.
@@ -80,6 +84,7 @@ class MBPPExtractor(LMEvalBenchmarkExtractor):
         try:
             text = str(doc.get("text", "")).strip()
             code = str(doc.get("code", "")).strip()
+            test_list = doc.get("test_list", [])
             if not text or not code:
                 log.debug(
@@ -94,10 +99,40 @@ class MBPPExtractor(LMEvalBenchmarkExtractor):
             # Incorrect solution: return a placeholder or buggy implementation
             incorrect = "    return None  # Incomplete implementation"
-            formatted_question = f"Write a Python function to solve this problem:\n\n{text}"
+            # Format tests (use first 3 if available)
+            tests_str = "\n".join(test_list[:3]) if test_list else ""
+            # Different prompt format for instruct vs base
+            is_instruct = "instruct" in task_name.lower()
+            if is_instruct:
+                formatted_question = f"You are an expert Python programmer, and here is your task:\n{text}\nYour code should pass these tests:\n{tests_str}"
+            else:
+                formatted_question = f"You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n{tests_str}\n[BEGIN]\n"
+            # Extract entry_point (function name) from first test assertion
+            entry_point = None
+            if test_list:
+                match = re.search(r'assert\s+(\w+)\(', test_list[0])
+                entry_point = match.group(1) if match else None
+            # Format test_code with check() function (like HumanEval format)
+            # Replace function name with 'candidate' in assertions
+            if test_list and entry_point:
+                # Convert "assert func_name(...)" to "assert candidate(...)"
+                converted_tests = [
+                    re.sub(rf'\b{entry_point}\b', 'candidate', test)
+                    for test in test_list
+                ]
+                test_code = f"def check(candidate):\n    " + "\n    ".join(converted_tests)
+            else:
+                test_code = ""
             metadata = {
-                "label": "mbpp",
+                "label": task_name,
+                "entry_point": entry_point,
+                "test_code": test_code,
+                "language": "python",
+                "task_name": task_name,
             }
             return self._build_pair(
@@ -120,4 +155,10 @@ class MBPPExtractor(LMEvalBenchmarkExtractor):
     ) -> ContrastivePair:
         positive_response = PositiveResponse(model_response=correct)
         negative_response = NegativeResponse(model_response=incorrect)
-        return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))
+        return ContrastivePair(
+            prompt=question,
+            positive_response=positive_response,
+            negative_response=negative_response,
+            label=metadata.get("label") if metadata else None,
+            metadata=metadata,
+        )

wisent 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl

wisent 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl