wisent 0.7.701__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/activation_cache.py +393 -0
  3. wisent/core/activations/activations.py +3 -3
  4. wisent/core/activations/activations_collector.py +9 -5
  5. wisent/core/activations/classifier_inference_strategy.py +12 -11
  6. wisent/core/activations/extraction_strategy.py +256 -84
  7. wisent/core/classifiers/classifiers/core/atoms.py +3 -2
  8. wisent/core/cli/__init__.py +2 -1
  9. wisent/core/cli/agent/apply_steering.py +5 -7
  10. wisent/core/cli/agent/train_classifier.py +19 -7
  11. wisent/core/cli/check_linearity.py +35 -3
  12. wisent/core/cli/cluster_benchmarks.py +4 -6
  13. wisent/core/cli/create_steering_vector.py +6 -4
  14. wisent/core/cli/diagnose_vectors.py +7 -4
  15. wisent/core/cli/estimate_unified_goodness_time.py +6 -4
  16. wisent/core/cli/generate_pairs_from_task.py +9 -56
  17. wisent/core/cli/geometry_search.py +137 -0
  18. wisent/core/cli/get_activations.py +1 -1
  19. wisent/core/cli/method_optimizer.py +4 -3
  20. wisent/core/cli/modify_weights.py +3 -2
  21. wisent/core/cli/optimize_sample_size.py +1 -1
  22. wisent/core/cli/optimize_steering.py +14 -16
  23. wisent/core/cli/optimize_weights.py +2 -1
  24. wisent/core/cli/preview_pairs.py +203 -0
  25. wisent/core/cli/steering_method_trainer.py +3 -3
  26. wisent/core/cli/tasks.py +19 -76
  27. wisent/core/cli/train_unified_goodness.py +3 -3
  28. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
  29. wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
  30. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
  31. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
  32. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
  33. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
  34. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
  35. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
  36. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
  37. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
  38. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
  39. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
  40. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
  41. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
  42. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
  43. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
  44. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
  45. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
  46. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
  47. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
  48. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
  49. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
  50. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
  51. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
  52. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
  53. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
  54. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
  55. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
  56. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
  57. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
  58. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
  59. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
  60. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
  61. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
  84. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
  85. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
  86. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
  87. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
  88. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
  89. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
  90. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
  96. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
  102. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
  103. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
  104. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
  105. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
  106. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
  107. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
  108. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
  109. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
  110. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
  111. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
  112. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
  113. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
  114. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
  115. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
  116. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
  117. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
  118. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
  119. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
  120. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
  121. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
  122. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
  123. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
  124. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
  125. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
  126. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
  127. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
  128. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
  129. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
  130. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
  131. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
  132. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
  133. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
  134. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
  135. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
  136. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
  137. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
  138. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
  139. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
  140. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
  141. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
  142. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
  143. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
  144. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
  145. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
  146. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
  147. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
  148. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
  149. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
  150. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
  151. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
  152. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
  153. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
  154. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
  155. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
  156. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
  157. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
  158. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
  159. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
  160. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
  161. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
  162. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
  163. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
  164. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
  165. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
  166. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
  167. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
  168. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
  169. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
  170. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
  171. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
  172. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
  173. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
  174. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
  175. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
  176. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
  177. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
  178. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
  179. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
  180. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
  181. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
  182. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
  183. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
  184. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
  185. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
  186. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
  187. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
  188. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
  189. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
  190. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
  191. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
  192. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
  193. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
  194. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
  195. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
  196. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
  197. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
  198. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
  199. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
  200. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
  201. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
  202. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
  203. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
  204. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
  205. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
  206. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
  207. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
  208. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
  209. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
  210. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
  211. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
  212. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
  213. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
  214. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
  215. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
  216. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
  217. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
  218. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
  219. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
  220. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
  221. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
  222. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
  223. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
  224. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
  225. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
  226. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
  227. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
  228. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
  229. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
  230. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
  231. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
  232. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
  233. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
  234. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
  235. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
  236. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
  237. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
  238. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
  239. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
  240. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
  241. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
  242. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
  243. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
  244. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
  245. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
  246. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
  247. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
  248. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
  249. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
  250. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
  251. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
  252. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
  253. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
  254. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
  255. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
  256. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
  257. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
  258. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
  259. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
  260. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
  261. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
  262. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
  263. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
  264. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
  265. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
  266. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
  267. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
  268. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
  269. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
  270. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
  271. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
  272. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
  273. wisent/core/data_loaders/loaders/lm_loader.py +12 -1
  274. wisent/core/geometry_runner.py +995 -0
  275. wisent/core/geometry_search_space.py +237 -0
  276. wisent/core/hyperparameter_optimizer.py +1 -1
  277. wisent/core/main.py +3 -0
  278. wisent/core/models/core/atoms.py +5 -3
  279. wisent/core/models/wisent_model.py +1 -1
  280. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  281. wisent/core/parser_arguments/check_linearity_parser.py +12 -2
  282. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
  283. wisent/core/parser_arguments/generate_vector_from_task_parser.py +2 -2
  284. wisent/core/parser_arguments/geometry_search_parser.py +61 -0
  285. wisent/core/parser_arguments/main_parser.py +8 -0
  286. wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
  287. wisent/core/steering.py +5 -3
  288. wisent/core/steering_methods/methods/hyperplane.py +2 -1
  289. wisent/core/synthetic/generators/nonsense_generator.py +30 -18
  290. wisent/core/trainers/steering_trainer.py +2 -2
  291. wisent/core/utils/device.py +27 -27
  292. wisent/core/utils/layer_combinations.py +70 -0
  293. wisent/examples/__init__.py +1 -0
  294. wisent/examples/scripts/__init__.py +1 -0
  295. wisent/examples/scripts/count_all_benchmarks.py +121 -0
  296. wisent/examples/scripts/discover_directions.py +469 -0
  297. wisent/examples/scripts/extract_benchmark_info.py +71 -0
  298. wisent/examples/scripts/generate_paper_data.py +384 -0
  299. wisent/examples/scripts/intervention_validation.py +626 -0
  300. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
  301. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
  302. wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
  303. wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
  304. wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
  305. wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
  306. wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
  307. wisent/examples/scripts/search_all_short_names.py +31 -0
  308. wisent/examples/scripts/test_all_benchmarks.py +138 -0
  309. wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
  310. wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
  311. wisent/examples/scripts/test_nonsense_baseline.py +261 -0
  312. wisent/examples/scripts/test_one_benchmark.py +324 -0
  313. wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
  314. wisent/examples/scripts/threshold_analysis.py +434 -0
  315. wisent/examples/scripts/visualization_gallery.py +582 -0
  316. wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
  317. wisent/parameters/lm_eval/category_directions.json +137 -0
  318. wisent/parameters/lm_eval/repair_plan.json +282 -0
  319. wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
  320. wisent/parameters/lm_eval/working_benchmarks.json +206 -0
  321. wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
  322. wisent/tests/test_detector_accuracy.py +1 -1
  323. wisent/tests/visualize_geometry.py +1 -1
  324. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
  325. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/RECORD +329 -295
  326. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
  327. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
  328. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
  329. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
  330. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
@@ -168,14 +168,12 @@ class XcopaExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "xcopa",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -120,12 +120,10 @@ class XlsumExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "xlsum"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -102,14 +102,14 @@ class XNLIExtractor(LMEvalBenchmarkExtractor):
         correct = labels[label]
         incorrect = labels[(label+1)%3]
 
-        formatted_question = f"Decide the relationship of the hypothesis '{hypothesis}' to the premise '{premise}\nA. {incorrect}\nB. {correct}"
+        prompt = f"Decide the relationship of the hypothesis '{hypothesis}' to the premise '{premise}"
 
         metadata = {
             "label": "xnli",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -110,7 +110,7 @@ class XquadExtractor(LMEvalBenchmarkExtractor):
 
         metadata = {"label": "xquad"}
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct_answer,
             incorrect=incorrect_answer,
             metadata=metadata,
@@ -174,14 +174,12 @@ class XquadExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "xquad",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -99,15 +99,14 @@ class XStoryClozeExtractor(LMEvalBenchmarkExtractor):
         correct = endings[answer]
         incorrect = endings[(answer+1)%len(endings)]
 
-        formatted_question = " ".join(s.strip() for s in inputs if s)
-        formatted_question = f"{formatted_question}\n \nA. {incorrect}\nB. {correct}"
+        prompt = " ".join(s.strip() for s in inputs if s)
 
         metadata = {
             "label": "xstorycloze",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -95,14 +95,14 @@ class XWinogradExtractor(LMEvalBenchmarkExtractor):
         correct = options[answer]
         incorrect = options[(answer+1)%len(options)]
 
-        formatted_question = f"Fill in the blank: {sentence}\nA. {incorrect}\nB. {correct}"
+        prompt = f"Fill in the blank: {sentence}"
 
         metadata = {
             "label": "xwinograd",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -126,14 +126,12 @@ class ZhoblimpExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "zhoblimp",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
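
The eight extractor hunks above make the same change: the A/B answer options are no longer baked into the question string handed to _build_pair; the raw question and the two completions stay separate. A minimal sketch of the before/after shape, using invented example data and a plain dict rather than wisent's actual pair objects:

    # Illustration only: shows how the prompt passed downstream changes.
    # The dict stands in for wisent's real ContrastivePair / _build_pair API.
    question = "The man broke his toe. What was the CAUSE of this?"
    correct = "He dropped a hammer on his foot."
    incorrect = "He got a hole in his sock."

    # Old behaviour: answer options were embedded in the question text.
    old_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"

    # New behaviour: the raw question is passed through and the completions
    # stay separate, leaving any A/B formatting to downstream code.
    new_pair = {"question": question, "correct": correct, "incorrect": incorrect}

    print(old_question)
    print(new_pair)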
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import random
 from typing import TYPE_CHECKING
 
 from wisent.core.contrastive_pairs.lm_eval_pairs.lm_extractor_registry import get_extractor
@@ -10,17 +11,178 @@ if TYPE_CHECKING:
     from lm_eval.api.task import ConfigurableTask
     from wisent.core.contrastive_pairs.core.pair import ContrastivePair
 
-__all__ = ["build_contrastive_pairs"]
+__all__ = ["build_contrastive_pairs", "lm_build_contrastive_pairs"]
 
 _LOG = setup_logger(__name__)
 
 
+def _flatten_task_dict(task_dict: dict, prefix: str = "") -> list[tuple[str, "ConfigurableTask"]]:
+    """
+    Recursively flatten nested group tasks into a list of (name, ConfigurableTask) tuples.
+
+    arguments:
+        task_dict: Dict of task_name -> ConfigurableTask or nested dict
+        prefix: Prefix for nested task names
+
+    returns:
+        List of (full_task_name, ConfigurableTask) tuples (leaf tasks only)
+    """
+    from lm_eval.api.task import ConfigurableTask
+
+    result = []
+    for name, task in task_dict.items():
+        full_name = f"{prefix}/{name}" if prefix else name
+        if isinstance(task, ConfigurableTask):
+            result.append((full_name, task))
+        elif isinstance(task, dict):
+            # Nested group - recurse
+            result.extend(_flatten_task_dict(task, full_name))
+    return result
+
+
+def _add_evaluator_to_pairs(
+    pairs: list["ContrastivePair"],
+    evaluator_name: str | None,
+    task_name: str,
+) -> list["ContrastivePair"]:
+    """Add evaluator_name and task_name to each pair's metadata."""
+    from dataclasses import replace
+
+    result = []
+    for pair in pairs:
+        metadata = dict(pair.metadata) if pair.metadata else {}
+        metadata["evaluator_name"] = evaluator_name
+        metadata["source_task"] = task_name
+        result.append(replace(pair, metadata=metadata))
+    return result
+
+
+def build_contrastive_pairs(
+    task_name: str,
+    limit: int | None = None,
+) -> list["ContrastivePair"]:
+    """
+    Unified loader for contrastive pairs - handles both HuggingFace and lm-eval tasks.
+
+    Automatically:
+    - Detects if task is HF or lm-eval
+    - Handles group tasks (including nested groups) by sampling from all subtasks
+    - Adds evaluator_name to each pair's metadata
+
+    arguments:
+        task_name:
+            Name of the benchmark/task (e.g., "winogrande", "mmlu", "humaneval").
+        limit:
+            Optional upper bound on the number of pairs to return.
+            Values <= 0 are treated as "no limit".
+
+    returns:
+        A list of ContrastivePair objects, each with metadata containing
+        'evaluator_name' and 'source_task'.
+    """
+    log = bind(_LOG, task=task_name or "unknown")
+    log.info("Building contrastive pairs (unified)", extra={"limit": limit})
+
+    # Normalize limit
+    max_items = None if (limit is None or limit <= 0) else int(limit)
+
+    # Get extractor
+    extractor = get_extractor(task_name)
+    log.info("Using extractor", extra={"extractor": extractor.__class__.__name__})
+
+    # Get evaluator_name from extractor
+    evaluator_name = getattr(extractor, 'evaluator_name', None)
+
+    # HuggingFace extractor - load directly
+    if isinstance(extractor, HuggingFaceBenchmarkExtractor):
+        log.info("HuggingFace task - loading directly")
+        pairs = extractor.extract_contrastive_pairs(limit=max_items)
+        return _add_evaluator_to_pairs(pairs, evaluator_name, task_name)
+
+    # lm-eval extractor - need to load task
+    log.info("lm-eval task - loading via LMEvalDataLoader")
+    from wisent.core.data_loaders.loaders.lm_loader import LMEvalDataLoader
+
+    loader = LMEvalDataLoader()
+    try:
+        task_obj = loader.load_lm_eval_task(task_name)
+    except Exception as e:
+        log.error(f"Failed to load lm-eval task: {e}")
+        raise
+
+    # Single task (ConfigurableTask)
+    from lm_eval.api.task import ConfigurableTask
+    if isinstance(task_obj, ConfigurableTask):
+        log.info("Single task")
+        pairs = extractor.extract_contrastive_pairs(task_obj, limit=max_items)
+        return _add_evaluator_to_pairs(pairs, evaluator_name, task_name)
+
+    # Group task (dict) - flatten and sample from all subtasks
+    if isinstance(task_obj, dict):
+        leaf_tasks = _flatten_task_dict(task_obj)
+        log.info(f"Group task with {len(leaf_tasks)} leaf subtasks")
+
+        if not leaf_tasks:
+            log.warning("No leaf tasks found in group")
+            return []
+
+        # Shuffle to get random sampling across subtasks
+        random.shuffle(leaf_tasks)
+
+        # Calculate pairs per subtask
+        if max_items is None:
+            pairs_per_task = None
+        else:
+            # Distribute limit across subtasks, minimum 1 per task
+            pairs_per_task = max(1, max_items // len(leaf_tasks))
+
+        all_pairs = []
+        for subtask_name, subtask in leaf_tasks:
+            try:
+                # Get the leaf task name (last part after /)
+                leaf_name = subtask_name.split("/")[-1] if "/" in subtask_name else subtask_name
+
+                # Try to get extractor for the specific subtask first
+                try:
+                    subtask_extractor = get_extractor(leaf_name)
+                except:
+                    # Fall back to parent extractor
+                    subtask_extractor = extractor
+
+                subtask_evaluator = getattr(subtask_extractor, 'evaluator_name', evaluator_name)
+
+                subtask_pairs = subtask_extractor.extract_contrastive_pairs(subtask, limit=pairs_per_task)
+                subtask_pairs = _add_evaluator_to_pairs(subtask_pairs, subtask_evaluator, subtask_name)
+                all_pairs.extend(subtask_pairs)
+
+                # Stop if we have enough
+                if max_items is not None and len(all_pairs) >= max_items:
+                    break
+            except Exception as e:
+                log.warning(f"Failed to extract from subtask {subtask_name}: {e}")
+                continue
+
+        # Shuffle final result and trim to limit
+        random.shuffle(all_pairs)
+        if max_items is not None:
+            all_pairs = all_pairs[:max_items]
+
+        log.info(f"Extracted {len(all_pairs)} pairs from group task")
+        return all_pairs
+
+    log.error(f"Unexpected task_obj type: {type(task_obj)}")
+    return []
+
+
 def lm_build_contrastive_pairs(
     task_name: str,
-    lm_eval_task: ConfigurableTask | None,
+    lm_eval_task: "ConfigurableTask | None",
     limit: int | None = None,
-) -> list[ContrastivePair]:
+) -> list["ContrastivePair"]:
     """
-    Resolve the task's extractor (lazy-loaded) and return contrastive pairs.
+    Legacy function - resolve the task's extractor and return contrastive pairs.
+
+    For new code, prefer using build_contrastive_pairs() which handles
+    task loading automatically.
 
     arguments:
         task_name:
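
For orientation, a toy sketch of what the new _flatten_task_dict helper produces for a nested lm-eval group dict. DummyTask here is a stand-in for lm_eval's ConfigurableTask, which the real helper detects with isinstance:

    # Sketch of the flattening behaviour; DummyTask replaces ConfigurableTask.
    class DummyTask:
        def __init__(self, name: str):
            self.name = name

    def flatten(task_dict: dict, prefix: str = "") -> list[tuple[str, object]]:
        result = []
        for name, task in task_dict.items():
            full_name = f"{prefix}/{name}" if prefix else name
            if isinstance(task, dict):          # nested group -> recurse
                result.extend(flatten(task, full_name))
            else:                               # leaf task -> keep (name, task)
                result.append((full_name, task))
        return result

    nested = {
        "mmlu": {
            "stem": {"physics": DummyTask("physics"), "chemistry": DummyTask("chemistry")},
            "law": DummyTask("law"),
        }
    }
    print([name for name, _ in flatten(nested)])
    # ['mmlu/stem/physics', 'mmlu/stem/chemistry', 'mmlu/law']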
@@ -47,10 +209,15 @@ def lm_build_contrastive_pairs(
     max_items = None if (limit is None or limit <= 0) else int(limit)
 
     log.info("Extracting contrastive pairs", extra={"max_items": max_items})
+
+    # Get evaluator_name from extractor
+    evaluator_name = getattr(extractor, 'evaluator_name', None)
 
     # 3) Delegate: extractor loads docs and builds pairs
     # HuggingFace extractors don't need lm_eval_task - they load data directly from HuggingFace
     if isinstance(extractor, HuggingFaceBenchmarkExtractor):
-        return extractor.extract_contrastive_pairs(limit=max_items)
+        pairs = extractor.extract_contrastive_pairs(limit=max_items)
     else:
-        return extractor.extract_contrastive_pairs(lm_eval_task, limit=max_items)
+        pairs = extractor.extract_contrastive_pairs(lm_eval_task, limit=max_items)
+
+    return _add_evaluator_to_pairs(pairs, evaluator_name, task_name)
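
Taken together, these hunks give lm_task_pairs_generation.py a single unified entry point. A hypothetical usage sketch, assuming wisent and its lm-eval dependencies are installed; the task names and limits are examples only:

    from wisent.core.contrastive_pairs.lm_eval_pairs.lm_task_pairs_generation import (
        build_contrastive_pairs,
    )

    # Single lm-eval task: the loader resolves the extractor and the task itself.
    pairs = build_contrastive_pairs("winogrande", limit=10)

    # Each returned pair now carries provenance in its metadata.
    for pair in pairs[:3]:
        print(pair.metadata["source_task"], pair.metadata["evaluator_name"])

    # Group tasks (e.g. "mmlu") are flattened to leaf subtasks, the limit is
    # split across them (at least one pair per subtask), and the result is
    # shuffled before being trimmed to the limit.
    group_pairs = build_contrastive_pairs("mmlu", limit=50)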
@@ -10,6 +10,10 @@ os.environ['TF_NUM_INTEROP_THREADS'] = '1'
 os.environ['TF_NUM_INTRAOP_THREADS'] = '1'
 os.environ['OMP_NUM_THREADS'] = '1'
 
+# Allow code evaluation for code-related tasks (humaneval, etc.)
+# Required by HuggingFace evaluate library for code_eval metric
+os.environ['HF_ALLOW_CODE_EVAL'] = '1'
+
 # Enable trust_remote_code for all datasets (required for meddialog and others)
 # This uses lm-eval's recommended approach from PR #1998
 import datasets.config
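
The new HF_ALLOW_CODE_EVAL export matters because the HuggingFace evaluate library's code_eval metric refuses to execute model-generated code unless that variable is set before the metric runs. A standalone sketch of the requirement (needs the evaluate package; the toy prediction and test case are illustrative only):

    import os

    # Must be set before code_eval runs, otherwise it raises an error by design.
    os.environ["HF_ALLOW_CODE_EVAL"] = "1"

    import evaluate

    code_eval = evaluate.load("code_eval")
    pass_at_k, details = code_eval.compute(
        predictions=[["def add(a, b):\n    return a + b"]],  # candidate solutions
        references=["assert add(1, 2) == 3"],                # test case per problem
        k=[1],
    )
    print(pass_at_k)  # e.g. {'pass@1': 1.0}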
@@ -294,6 +298,8 @@ class LMEvalDataLoader(BaseDataLoader):
             "tinytruthfulqa": "tinyTruthfulQA",
             "tinywinogrande": "tinyWinogrande",
             "paws-x": "pawsx",
+            # afrobench subtasks
+            "afrobench_adr": "adr",
         }
 
         # Use mapped name if available, otherwise use original
@@ -302,7 +308,9 @@ class LMEvalDataLoader(BaseDataLoader):
             log.info(f"Mapping task '{task_name}' to lm-eval task '{lm_eval_task_name}'")
 
         # Tasks that require case-sensitive names (don't lowercase these)
-        case_sensitive_prefixes = {"tinyBenchmarks"}
+        # AraDiCE tasks have mixed case (e.g., AraDiCE_ArabicMMLU_lev)
+        # aexams tasks have mixed case (e.g., aexams_IslamicStudies)
+        case_sensitive_prefixes = {"tinyBenchmarks", "AraDiCE", "aexams_"}
 
         # Normalize task name to lowercase for lm-eval-harness compatibility
         # Many lm-eval tasks use lowercase names (e.g., "aradice" not "AraDICE")
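
A compact sketch of the task-name normalization these two loader hunks imply: apply the alias map first, then lowercase unless the name starts with a case-sensitive prefix. This is simplified stand-in logic, not the actual LMEvalDataLoader implementation:

    # Simplified stand-in for the loader's normalization; values taken from the diff.
    TASK_NAME_MAP = {"paws-x": "pawsx", "afrobench_adr": "adr"}
    CASE_SENSITIVE_PREFIXES = ("tinyBenchmarks", "AraDiCE", "aexams_")

    def normalize_task_name(task_name: str) -> str:
        name = TASK_NAME_MAP.get(task_name, task_name)
        if name.startswith(CASE_SENSITIVE_PREFIXES):
            return name          # e.g. AraDiCE_ArabicMMLU_lev keeps its casing
        return name.lower()      # most lm-eval tasks use lowercase names

    print(normalize_task_name("AraDiCE_ArabicMMLU_lev"))  # AraDiCE_ArabicMMLU_lev
    print(normalize_task_name("paws-x"))                  # pawsx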
@@ -379,6 +387,9 @@ class LMEvalDataLoader(BaseDataLoader):
             "noreval": ["ask_gec_p0", "ask_gec_p1", "ask_gec_p2", "ask_gec_p3", "ask_gec_p4", "ncb", "norbelebele_p0", "norbelebele_p1", "norbelebele_p2", "norbelebele_p3", "norbelebele_p4", "norcommonsenseqa_nno_p0", "norcommonsenseqa_nno_p1", "norcommonsenseqa_nno_p2", "norcommonsenseqa_nno_p3", "norcommonsenseqa_nno_p4", "norcommonsenseqa_nob_p0", "norcommonsenseqa_nob_p1", "norcommonsenseqa_nob_p2", "norcommonsenseqa_nob_p3", "norcommonsenseqa_nob_p4", "norec_document_p0", "norec_document_p1", "norec_document_p2", "norec_document_p3", "norec_document_p4", "norec_sentence_p0", "norec_sentence_p1", "norec_sentence_p2", "norec_sentence_p3", "norec_sentence_p4", "noridiom_nno_p0", "noridiom_nno_p1", "noridiom_nno_p2", "noridiom_nno_p3", "noridiom_nno_p4", "noridiom_nob_p0", "noridiom_nob_p1", "noridiom_nob_p2", "noridiom_nob_p3", "noridiom_nob_p4", "noropenbookqa_nno_p0", "noropenbookqa_nno_p1", "noropenbookqa_nno_p2", "noropenbookqa_nno_p3", "noropenbookqa_nno_p4", "noropenbookqa_nob_p0", "noropenbookqa_nob_p1", "noropenbookqa_nob_p2", "noropenbookqa_nob_p3", "noropenbookqa_nob_p4", "norquad_p0", "norquad_p1", "norquad_p2", "norquad_p3", "norquad_p4", "norrewrite_instruct", "norsumm_nno_p0", "norsumm_nno_p1", "norsumm_nno_p2", "norsumm_nno_p3", "norsumm_nno_p4", "norsumm_nno_p5", "norsumm_nob_p0", "norsumm_nob_p1", "norsumm_nob_p2", "norsumm_nob_p3", "norsumm_nob_p4", "norsumm_nob_p5", "norsummarize_instruct", "nortruthfulqa_gen_nno_p0", "nortruthfulqa_gen_nno_p1", "nortruthfulqa_gen_nno_p2", "nortruthfulqa_gen_nno_p3", "nortruthfulqa_gen_nno_p4", "nortruthfulqa_gen_nob_p0", "nortruthfulqa_gen_nob_p1", "nortruthfulqa_gen_nob_p2", "nortruthfulqa_gen_nob_p3", "nortruthfulqa_gen_nob_p4", "nortruthfulqa_mc_nno_p0", "nortruthfulqa_mc_nno_p1", "nortruthfulqa_mc_nno_p2", "nortruthfulqa_mc_nno_p3", "nortruthfulqa_mc_nno_p4", "nortruthfulqa_mc_nob_p0", "nortruthfulqa_mc_nob_p1", "nortruthfulqa_mc_nob_p2", "nortruthfulqa_mc_nob_p3", "nortruthfulqa_mc_nob_p4", "nrk_quiz_qa_nno_p0", "nrk_quiz_qa_nno_p1", "nrk_quiz_qa_nno_p2", "nrk_quiz_qa_nno_p3", "nrk_quiz_qa_nno_p4", "nrk_quiz_qa_nob_p0", "nrk_quiz_qa_nob_p1", "nrk_quiz_qa_nob_p2", "nrk_quiz_qa_nob_p3", "nrk_quiz_qa_nob_p4", "tatoeba_eng_nno_p0", "tatoeba_eng_nno_p1", "tatoeba_eng_nno_p2", "tatoeba_eng_nno_p3", "tatoeba_eng_nob_p0", "tatoeba_eng_nob_p1", "tatoeba_eng_nob_p2", "tatoeba_eng_nob_p3", "tatoeba_nno_eng_p0", "tatoeba_nno_eng_p1", "tatoeba_nno_eng_p2", "tatoeba_nno_eng_p3", "tatoeba_nob_eng_p0", "tatoeba_nob_eng_p1", "tatoeba_nob_eng_p2", "tatoeba_nob_eng_p3"],
             "storycloze": ["xstorycloze_en"],
             "instructhumaneval": ["humaneval_instruct"],
+            # African language benchmarks
+            "afrimgsm": ["afrimgsm_amh_prompt_1", "afrimgsm_eng_prompt_1", "afrimgsm_fra_prompt_1", "afrimgsm_hau_prompt_1", "afrimgsm_ibo_prompt_1", "afrimgsm_kin_prompt_1", "afrimgsm_swa_prompt_1", "afrimgsm_yor_prompt_1"],
+            "afrimmlu": ["afrimmlu_direct_amh_prompt_1", "afrimmlu_direct_eng_prompt_1", "afrimmlu_direct_fra_prompt_1", "afrimmlu_direct_hau_prompt_1", "afrimmlu_direct_ibo_prompt_1", "afrimmlu_direct_kin_prompt_1", "afrimmlu_direct_swa_prompt_1", "afrimmlu_direct_yor_prompt_1"],
         }
 
         # Check if task is explicitly disabled