PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl - Mend

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1020) hide show

wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py CHANGED Viewed

@@ -116,7 +116,9 @@ EXTRACTORS.update({
     # acp_bench subtasks (bool and mcq use log_likelihoods)
     # acp_bench_hard _gen subtasks (use generation evaluator)
     "aexams": f"{base_import}aexams:AexamsExtractor",
-    "agieval": f"{base_import}agieval:AgievalExtractor",
+    # AfroBench multiple-choice benchmarks
+    "afrobench": f"{base_import}afrobench_mc:AfroBenchMultipleChoiceExtractor",
+    "afridiacritics": f"{base_import}afrobench_mc:AfroBenchMultipleChoiceExtractor",
     "arabic_exams": f"{base_import}arabic_exams:ArabicExamsExtractor",
     "arabic_leaderboard_complete": f"{base_import}arabic_leaderboard_complete:ArabicLeaderboardCompleteExtractor",
     "arabic_leaderboard_light": f"{base_import}arabic_leaderboard_light:ArabicLeaderboardLightExtractor",
@@ -140,8 +142,6 @@ EXTRACTORS.update({
     "chartqa": f"{base_import}chartqa:ChartqaExtractor",
     "click": f"{base_import}click:ClickExtractor",
     "cmmlu": f"{base_import}cmmlu:CmmluExtractor",
-    "code_x_glue": f"{base_import}code_x_glue:CodeXGlueExtractor",
-    "codexglue_code2text": f"{base_import}code_x_glue:CodeXGlueExtractor",
     "commonsense_qa": f"{base_import}commonsense_qa:CommonsenseQAExtractor",
     "copa": f"{base_import}copa:COPAExtractor",
     "copal_id": f"{base_import}copal_id:CopalIdExtractor",
@@ -164,6 +164,7 @@ EXTRACTORS.update({
     "evalita-sp_sum_task_fp_p2": f"{base_import}evalita_sp:EvalitaSpExtractor",
     "fda": f"{base_import}fda:FdaExtractor",
     "fld": f"{base_import}fld:FldExtractor",
+    "freebase": f"{base_import}webqs:WebQSExtractor",
     "french_bench": f"{base_import}french_bench:FrenchBenchExtractor",
     "galician_bench": f"{base_import}galician_bench:GalicianBenchExtractor",
     "global_mmlu": f"{base_import}global_mmlu:GlobalMmluExtractor",
@@ -211,6 +212,9 @@ EXTRACTORS.update({
     "mastermind_46_easy": f"{base_import}mastermind:MastermindExtractor",
     "mastermind_46_hard": f"{base_import}mastermind:MastermindExtractor",
     "mbpp": f"{base_import}mbpp:MBPPExtractor",
+    "mbpp_instruct": f"{base_import}mbpp:MBPPExtractor",
+    "mbpp_plus": f"{base_import}mbpp:MBPPExtractor",
+    "mbpp_plus_instruct": f"{base_import}mbpp:MBPPExtractor",
     "mc-taco": f"{base_import}mc-taco:MCTACOExtractor",
     "meddialog": f"{base_import}meddialog:MeddialogExtractor",
     "meddialog_qsumm": f"{base_import}meddialog:MeddialogExtractor",
@@ -265,7 +269,9 @@ EXTRACTORS.update({
     "niah_multiquery": f"{base_import}ruler:RulerExtractor",
     "niah_multivalue": f"{base_import}ruler:RulerExtractor",
     "score": f"{base_import}score:ScoreExtractor",
-    "option_order_robustness_agieval_aqua_rat": f"{base_import}score:ScoreExtractor",
+    "prompt_robustness_agieval_aqua_rat": f"{base_import}agieval_aqua_rat:AgievalAquaRatExtractor",
+    "option_order_robustness_agieval_aqua_rat": f"{base_import}agieval_aqua_rat:AgievalAquaRatExtractor",
+    "non_greedy_robustness_agieval_aqua_rat": f"{base_import}agieval_aqua_rat:AgievalAquaRatExtractor",
     "option_order_robustness_agieval_logiqa_en": f"{base_import}score:ScoreExtractor",
     "option_order_robustness_agieval_lsat_ar": f"{base_import}score:ScoreExtractor",
     "option_order_robustness_agieval_lsat_lr": f"{base_import}score:ScoreExtractor",
@@ -380,7 +386,7 @@ EXTRACTORS.update({
     "phrases_va": f"{base_import}phrases:PhrasesExtractor",
     "phrases_va-ca": f"{base_import}phrases:PhrasesExtractor",
     "phrases_va-es": f"{base_import}phrases:PhrasesExtractor",
-    "code2text": f"{base_import}code2text:Code2textExtractor",
+    "code2text": f"{base_import}code_x_glue:Code2TextExtractor",
     "ethics": f"{base_import}ethics:EthicsExtractor",
     "cabreu": f"{base_import}cabreu:CabreuExtractor",
     "sycophancy": f"{base_import}sycophancy:SycophancyExtractor",

wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py CHANGED Viewed

@@ -90,11 +90,29 @@ def get_extractor(task_name: str) -> LMEvalBenchmarkExtractor:
     if not key:
         raise UnsupportedLMEvalBenchmarkError("Empty task name is not supported.")
-    # Exact match only - no prefix matching
+    # Try exact match first
     ref = _REGISTRY.get(key)
     if ref:
         return _instantiate(ref)
+    # Try prefix matching for hierarchical task names
+    # This handles cases like AraDiCE_ArabicMMLU_high_humanities_history_lev -> aradice
+    # Sort prefixes by length descending to match longest prefix first
+    PREFIX_FALLBACKS = {
+        "aradice_": "aradice",
+        "aexams_": "aexams",
+        "afrimgsm_": "afrimgsm",
+        "afrimmlu_": "afrimmlu",
+        "afrobench_": "afrobench",
+        "afridiacritics_": "afrobench",
+        "mmlu_": "mmlu",
+        "bigbench_": "bigbench",
+    }
+    for prefix, fallback_key in PREFIX_FALLBACKS.items():
+        if key.startswith(prefix) and fallback_key in _REGISTRY:
+            LOG.info(f"Using prefix fallback: '{task_name}' -> '{fallback_key}'")
+            return _instantiate(_REGISTRY[fallback_key])
     raise UnsupportedLMEvalBenchmarkError(
         f"No extractor registered for task '{task_name}'. "
         f"Known: {', '.join(sorted(_REGISTRY)) or '(none)'}"

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py CHANGED Viewed

@@ -142,14 +142,12 @@ class AclueExtractor(LMEvalBenchmarkExtractor):
             incorrect_idx = (answer_idx + 1) % len(choices)
             incorrect = choices[incorrect_idx]
-            formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
             metadata = {
                 "label": "aclue",
             }
             return self._build_pair(
-                question=formatted_question,
+                question=question,
                 correct=correct,
                 incorrect=incorrect,
                 metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py CHANGED Viewed

@@ -178,14 +178,12 @@ class AcpBenchExtractor(LMEvalBenchmarkExtractor):
             incorrect_idx = (answer_idx + 1) % len(choices)
             incorrect = choices[incorrect_idx]
-            formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
             metadata = {
                 "label": "acp_bench",
             }
             return self._build_pair(
-                question=formatted_question,
+                question=question,
                 correct=correct,
                 incorrect=incorrect,
                 metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py CHANGED Viewed

@@ -156,14 +156,12 @@ class AcpBenchHardExtractor(LMEvalBenchmarkExtractor):
             incorrect_idx = (answer_idx + 1) % len(choices)
             incorrect = choices[incorrect_idx]
-            formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
             metadata = {
                 "label": "acp_bench_hard",
             }
             return self._build_pair(
-                question=formatted_question,
+                question=question,
                 correct=correct,
                 incorrect=incorrect,
                 metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py CHANGED Viewed

@@ -66,7 +66,7 @@ class AdvancedExtractor(LMEvalBenchmarkExtractor):
                 metadata = {"label": "advanced_ai_risk"}
                 return self._build_pair(
-                    question=formatted_question,
+                    question=question,
                     correct=correct,
                     incorrect=incorrect,
                     metadata=metadata,
@@ -103,12 +103,10 @@ class AdvancedExtractor(LMEvalBenchmarkExtractor):
             correct = str(choices[answer_idx]).strip()
             incorrect_idx = (answer_idx + 1) % len(choices)
             incorrect = str(choices[incorrect_idx]).strip()
-            formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
             metadata = {"label": "advanced"}
             return self._build_pair(
-                question=formatted_question,
+                question=question,
                 correct=correct,
                 incorrect=incorrect,
                 metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py CHANGED Viewed

@@ -155,14 +155,12 @@ class AexamsExtractor(LMEvalBenchmarkExtractor):
                 )
                 return None
-            formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
             metadata = {
                 "label": "aexams",
             }
             return self._build_pair(
-                question=formatted_question,
+                question=question,
                 correct=correct,
                 incorrect=incorrect,
                 metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py CHANGED Viewed

@@ -86,12 +86,10 @@ class AfrimmluExtractor(LMEvalBenchmarkExtractor):
             correct = str(choices[answer_idx]).strip()
             incorrect_idx = (answer_idx + 1) % len(choices)
             incorrect = str(choices[incorrect_idx]).strip()
-            formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
             metadata = {"label": "afrimmlu"}
             return self._build_pair(
-                question=formatted_question,
+                question=question,
                 correct=correct,
                 incorrect=incorrect,
                 metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py CHANGED Viewed

@@ -101,8 +101,8 @@ class AfrixnliExtractor(LMEvalBenchmarkExtractor):
                 return None
             incorrect = label_map[incorrect_labels[0]]
-            # Format the NLI prompt
-            prompt = f"Premise: {premise}\nHypothesis: {hypothesis}.\nA. {incorrect}\nB. {correct}"
+            # Raw prompt without A./B. formatting
+            prompt = f"Premise: {premise}\nHypothesis: {hypothesis}"
             metadata = {"label": "afrixnli"}

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval_aqua_rat.py ADDED Viewed

@@ -0,0 +1,129 @@
+from __future__ import annotations
+from typing import Any, TYPE_CHECKING
+from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
+from wisent.core.contrastive_pairs.lm_eval_pairs.atoms import LMEvalBenchmarkExtractor
+from wisent.core.cli_logger import setup_logger, bind
+if TYPE_CHECKING:
+    from lm_eval.api.task import ConfigurableTask
+__all__ = ["AgievalAquaRatExtractor"]
+_LOG = setup_logger(__name__)
+task_names = ("prompt_robustness_agieval_aqua_rat", "option_order_robustness_agieval_aqua_rat", "non_greedy_robustness_agieval_aqua_rat")
+class AgievalAquaRatExtractor(LMEvalBenchmarkExtractor):
+    """Extractor for AGIEval AQUA-RAT robustness benchmarks (prompt, option order, non-greedy).
+    Each usable doc becomes one ContrastivePair: the prompt is the question, its
+    answer choices and a fixed answer-format instruction; the positive response
+    names the gold answer letter and the negative response names a different letter.
+    """
+    evaluator_name = "generation"
+    def extract_contrastive_pairs(
+        self,
+        lm_eval_task_data: ConfigurableTask,
+        limit: int | None = None,
+    ) -> list[ContrastivePair]:
+        """
+        Build contrastive pairs from AGIEval AQUA-RAT robustness docs.
+        Doc fields read by this extractor:
+            - question: str
+            - choices: list of option strings, joined one per line into the prompt
+            - options: list (only checked for presence)
+            - gold: list (only checked for presence)
+            - answer: gold answer letter, used verbatim in the responses
+        Args:
+            lm_eval_task_data: lm-eval task instance for one of the
+                *_robustness_agieval_aqua_rat tasks.
+            limit: Optional maximum number of pairs to produce.
+        Returns:
+            A list of ContrastivePair objects (empty when no doc is usable).
+        """
+        log = bind(_LOG, task=getattr(lm_eval_task_data, "NAME", "unknown"))
+        max_items = self._normalize_limit(limit)
+        docs = self.load_docs(lm_eval_task_data, max_items)
+        pairs: list[ContrastivePair] = []
+        log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
+        for doc in docs:
+            pair = self._extract_pair_from_doc(doc)
+            if pair is not None:
+                pairs.append(pair)
+                # Stop as soon as the requested number of pairs has been collected.
+                if max_items is not None and len(pairs) >= max_items:
+                    break
+        if not pairs:
+            task_name = getattr(lm_eval_task_data, "NAME", type(lm_eval_task_data).__name__)
+            log.warning("No valid prompt_robustness_agieval_aqua_rat pairs extracted", extra={"task": task_name})
+        return pairs
+    def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
+        """
+        Convert a single AGIEval AQUA-RAT robustness doc into a ContrastivePair, if possible.
+        Returns None when required fields (question, choices, options, gold, answer)
+        are missing or malformed, or when building the pair raises.
+        """
+        log = bind(_LOG, doc_id=doc.get("id", "unknown"))
+        try:
+            question = doc.get("question", "").strip()
+            choices = doc.get("choices", [])
+            options = doc.get("options", [])
+            gold = doc.get("gold", [])
+            # The answer letter is what the responses are built from, so it is
+            # validated together with the other required fields; without it the
+            # positive response would be the truncated "The best answer is ".
+            raw_answer = doc.get("answer")
+            correct_letter = "" if raw_answer is None else str(raw_answer).strip()
+            if not question or not choices or not options or not gold or not correct_letter:
+                log.debug(
+                    "Skipping doc due to missing/invalid fields",
+                    extra={"doc": doc},
+                )
+                return None
+            # Use letter answers (A, B, C, D, E): the wrong letter is the next
+            # option, wrapping E back to A; any unexpected letter falls back to "B".
+            incorrect_map = {"A": "B", "B": "C", "C": "D", "D": "E", "E": "A"}
+            incorrect_letter = incorrect_map.get(correct_letter, "B")
+            correct = f"The best answer is {correct_letter}"
+            incorrect = f"The best answer is {incorrect_letter}"
+            choices_str = "\n".join(choices)
+            # Prompt layout: question, one choice per line, then the fixed
+            # instruction telling the model how to phrase its final answer.
+            formatted_question = (
+                f"{question}\n"
+                f"{choices_str}\n"
+                "Examine the question and choose the correct answer from the options 'A', 'B', 'C', 'D' or 'E'. End your answer with:\n"
+                "The best answer is [the_answer_letter].\n"
+                "where the [the_answer_letter] is a letter from A to E."
+            )
+            metadata = {
+                "label": "agieval_aqua_rat",
+            }
+            return self._build_pair(
+                question=formatted_question,
+                correct=correct,
+                incorrect=incorrect,
+                metadata=metadata,
+            )
+        except Exception as exc:
+            # Best-effort extraction: one malformed doc is logged and skipped
+            # instead of aborting the whole benchmark.
+            log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
+            return None
+    @staticmethod
+    def _build_pair(
+        question: str,
+        correct: str,
+        incorrect: str,
+        metadata: dict[str, Any] | None = None,
+    ) -> ContrastivePair:
+        """
+        Wrap a prompt and its correct/incorrect answers into a ContrastivePair.
+        Args:
+            question: Full prompt text.
+            correct: Text of the positive (correct) response.
+            incorrect: Text of the negative (incorrect) response.
+            metadata: Optional dict; only its "label" entry is used.
+        Returns:
+            The assembled ContrastivePair; its label is None when no metadata
+            (or no "label" key) is supplied.
+        """
+        # metadata defaults to None, so guard before calling .get() on it.
+        label = metadata.get("label") if metadata else None
+        positive_response = PositiveResponse(model_response=correct)
+        negative_response = NegativeResponse(model_response=incorrect)
+        return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=label)

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py CHANGED Viewed

@@ -151,14 +151,12 @@ class ArabcultureExtractor(LMEvalBenchmarkExtractor):
             incorrect_idx = (answer_idx + 1) % len(choices)
             incorrect = choices[incorrect_idx]
-            formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
             metadata = {
                 "label": "arabculture",
             }
             return self._build_pair(
-                question=formatted_question,
+                question=question,
                 correct=correct,
                 incorrect=incorrect,
                 metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py CHANGED Viewed

@@ -71,12 +71,10 @@ class ArabicExtractor(LMEvalBenchmarkExtractor):
             correct = choices[answer_idx]
             incorrect_idx = (answer_idx + 1) % len(choices)
             incorrect = choices[incorrect_idx]
-            formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
             metadata = {"label": "arabic"}
             return self._build_pair(
-                question=formatted_question,
+                question=question,
                 correct=correct,
                 incorrect=incorrect,
                 metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py CHANGED Viewed

@@ -77,12 +77,10 @@ class ArabicExamsExtractor(LMEvalBenchmarkExtractor):
             correct = str(choices[answer_idx]).strip()
             incorrect_idx = (answer_idx + 1) % len(choices)
             incorrect = str(choices[incorrect_idx]).strip()
-            formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
             metadata = {"label": "arabic_exams"}
             return self._build_pair(
-                question=formatted_question,
+                question=question,
                 correct=correct,
                 incorrect=incorrect,
                 metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py CHANGED Viewed

@@ -139,14 +139,12 @@ class ArabicLeaderboardCompleteExtractor(LMEvalBenchmarkExtractor):
             incorrect_idx = (answer_idx + 1) % len(choices)
             incorrect = choices[incorrect_idx]
-            formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
             metadata = {
                 "label": "arabic_leaderboard_complete",
             }
             return self._build_pair(
-                question=formatted_question,
+                question=question,
                 correct=correct,
                 incorrect=incorrect,
                 metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py CHANGED Viewed

@@ -139,14 +139,12 @@ class ArabicLeaderboardLightExtractor(LMEvalBenchmarkExtractor):
             incorrect_idx = (answer_idx + 1) % len(choices)
             incorrect = choices[incorrect_idx]
-            formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
             metadata = {
                 "label": "arabic_leaderboard_light",
             }
             return self._build_pair(
-                question=formatted_question,
+                question=question,
                 correct=correct,
                 incorrect=incorrect,
                 metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py CHANGED Viewed

@@ -138,14 +138,12 @@ class ArabicmmluExtractor(LMEvalBenchmarkExtractor):
             incorrect_idx = (answer_idx + 1) % len(choices)
             incorrect = choices[incorrect_idx]
-            formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
             metadata = {
                 "label": "arabicmmlu",
             }
             return self._build_pair(
-                question=formatted_question,
+                question=question,
                 correct=correct,
                 incorrect=incorrect,
                 metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py CHANGED Viewed

@@ -241,12 +241,10 @@ class AradiceExtractor(LMEvalBenchmarkExtractor):
             correct = str(choices[answer_idx]).strip()
             incorrect_idx = (answer_idx + 1) % len(choices)
             incorrect = str(choices[incorrect_idx]).strip()
-            formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
             metadata = {"label": "aradice"}
             return self._build_pair(
-                question=formatted_question,
+                question=question,
                 correct=correct,
                 incorrect=incorrect,
                 metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py CHANGED Viewed

@@ -106,12 +106,10 @@ class ArcExtractor(LMEvalBenchmarkExtractor):
                     extra={"doc": doc},
                 )
                 return None
-            formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
             metadata = {"label": "arc"}
             return self._build_pair(
-                question=formatted_question,
+                question=question,
                 correct=correct,
                 incorrect=incorrect,
                 metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py CHANGED Viewed

@@ -89,14 +89,13 @@ class ArcChallengeExtractor(LMEvalBenchmarkExtractor):
             incorrect = choices[(answer_idx+1)%len(choices)]
             question = f"{question}"
-            formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
             metadata = {
                 "label": "arc_easy",
             }
             return self._build_pair(
-                question=formatted_question,
+                question=question,
                 correct=correct,
                 incorrect=incorrect,
                 metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py CHANGED Viewed

@@ -89,14 +89,13 @@ class ArcEasyExtractor(LMEvalBenchmarkExtractor):
             incorrect = choices[(answer_idx+1)%len(choices)]
             question = f"{question}"
-            formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
             metadata = {
                 "label": "arc_easy",
             }
             return self._build_pair(
-                question=formatted_question,
+                question=question,
                 correct=correct,
                 incorrect=incorrect,
                 metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py CHANGED Viewed

@@ -85,14 +85,14 @@ class ArithmeticExtractor(LMEvalBenchmarkExtractor):
             incorrect_val = float(completion) + 1
             incorrect = str(int(incorrect_val)) if incorrect_val == int(incorrect_val) else str(incorrect_val)
-            formatted_question = f"{context}\nA. {incorrect}\nB. {correct}"
+            prompt = f"{context}"
             metadata = {
                 "label": "arithmetic",
             }
             return self._build_pair(
-                question=formatted_question,
+                question=prompt,
                 correct=correct,
                 incorrect=incorrect,
                 metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py CHANGED Viewed

@@ -93,14 +93,14 @@ class ASDivExtractor(LMEvalBenchmarkExtractor):
             incorrect_val = float(numerical_answer) + 1
             incorrect = str(int(incorrect_val)) if incorrect_val == int(incorrect_val) else str(incorrect_val)
-            formatted_question = f"{body}\nQuestion:{question}\nA. {incorrect}\nB. {correct}"
+            prompt = f"{body}\nQuestion:{question}"
             metadata = {
                 "label": "asdiv",
             }
             return self._build_pair(
-                question=formatted_question,
+                question=prompt,
                 correct=correct,
                 incorrect=incorrect,
                 metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py CHANGED Viewed

@@ -82,8 +82,42 @@ class BabiExtractor(LMEvalBenchmarkExtractor):
                 log.debug("Skipping doc due to missing/invalid fields", extra={"doc": doc})
                 return None
-            # Create an incorrect answer by appending "incorrect" or using a generic wrong answer
-            incorrect = f"not {correct}"
+            # Create an incorrect answer using plausible alternatives from babi vocabulary
+            import random
+            random.seed(hash(correct + passage) % (2**32))
+            # Common babi answer categories
+            locations = ['bathroom', 'bedroom', 'kitchen', 'garden', 'hallway', 'office', 'park']
+            people = ['Mary', 'John', 'Sandra', 'Daniel', 'Bill', 'Fred', 'Julie', 'Emily']
+            objects = ['football', 'apple', 'milk', 'keys', 'box', 'ball']
+            directions = ['north', 'south', 'east', 'west']
+            animals = ['cat', 'dog', 'mouse', 'wolf', 'sheep', 'lion']
+            yes_no = ['yes', 'no']
+            # Determine answer type and pick a wrong alternative
+            correct_lower = correct.lower()
+            if correct_lower in [l.lower() for l in locations]:
+                incorrect = random.choice([l for l in locations if l.lower() != correct_lower])
+            elif correct_lower in [p.lower() for p in people]:
+                incorrect = random.choice([p for p in people if p.lower() != correct_lower])
+            elif correct_lower in [o.lower() for o in objects]:
+                incorrect = random.choice([o for o in objects if o.lower() != correct_lower])
+            elif correct_lower in [d.lower() for d in directions]:
+                incorrect = random.choice([d for d in directions if d.lower() != correct_lower])
+            elif correct_lower in [a.lower() for a in animals]:
+                incorrect = random.choice([a for a in animals if a.lower() != correct_lower])
+            elif correct_lower in yes_no:
+                incorrect = 'no' if correct_lower == 'yes' else 'yes'
+            elif correct.isdigit():
+                num = int(correct)
+                incorrect = str(random.choice([n for n in [num-1, num+1, num*2] if n != num and n >= 0]))
+            else:
+                # Fallback: use a generic wrong answer from the passage words
+                passage_words = [w for w in passage.split() if len(w) > 3 and w.isalpha() and w.lower() != correct_lower]
+                if passage_words:
+                    incorrect = random.choice(passage_words)
+                else:
+                    incorrect = "unknown"
             # Format the prompt with passage and question
             prompt = f"Passage: {passage}\n\nQuestion: {question}"

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py CHANGED Viewed

@@ -126,14 +126,12 @@ class BasqueBenchExtractor(LMEvalBenchmarkExtractor):
             incorrect_idx = (answer_idx + 1) % len(choices)
             incorrect = choices[incorrect_idx]
-            formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
             metadata = {
                 "label": "basque_bench",
             }
             return self._build_pair(
-                question=formatted_question,
+                question=question,
                 correct=correct,
                 incorrect=incorrect,
                 metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py CHANGED Viewed

@@ -140,14 +140,12 @@ class BbqExtractor(LMEvalBenchmarkExtractor):
             incorrect_idx = (answer_idx + 1) % len(choices)
             incorrect = choices[incorrect_idx]
-            formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
             metadata = {
                 "label": "bbq",
             }
             return self._build_pair(
-                question=formatted_question,
+                question=question,
                 correct=correct,
                 incorrect=incorrect,
                 metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py CHANGED Viewed

@@ -152,14 +152,12 @@ class BelebeleExtractor(LMEvalBenchmarkExtractor):
                 )
                 return None
-            formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
             metadata = {
                 "label": "belebele",
             }
             return self._build_pair(
-                question=formatted_question,
+                question=question,
                 correct=correct,
                 incorrect=incorrect,
                 metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py CHANGED Viewed

@@ -126,14 +126,12 @@ class BenchmarksExtractor(LMEvalBenchmarkExtractor):
             incorrect_idx = (answer_idx + 1) % len(choices)
             incorrect = choices[incorrect_idx]
-            formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
             metadata = {
                 "label": "benchmarks",
             }
             return self._build_pair(
-                question=formatted_question,
+                question=question,
                 correct=correct,
                 incorrect=incorrect,
                 metadata=metadata,

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl