PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl - Mend

wisent 0.7.379__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1720) hide show

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noticia.py ADDED Viewed

@@ -0,0 +1,105 @@
+from __future__ import annotations
+from typing import Any, TYPE_CHECKING
+from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
+from wisent.core.contrastive_pairs.lm_eval_pairs.atoms import LMEvalBenchmarkExtractor
+from wisent.core.cli_logger import setup_logger, bind
+if TYPE_CHECKING:
+    from lm_eval.api.task import ConfigurableTask
+__all__ = ["NoticiaExtractor"]  # the only name exported by `from <module> import *`
+_LOG = setup_logger(__name__)  # module-level logger; per-call context is attached with bind()
+task_names = ("noticia",)  # task names tied to this extractor; presumably read by an extractor registry (verify against caller)
+class NoticiaExtractor(LMEvalBenchmarkExtractor):
+    """Extractor for Noticia benchmark."""
+    evaluator_name = "generation"
+    def extract_contrastive_pairs(
+        self,
+        lm_eval_task_data: ConfigurableTask,
+        limit: int | None = None,
+    ) -> list[ContrastivePair]:
+        log = bind(_LOG, task=getattr(lm_eval_task_data, "NAME", "unknown"))
+        max_items = self._normalize_limit(limit)
+        docs = self.load_docs(lm_eval_task_data, max_items)
+        pairs: list[ContrastivePair] = []
+        log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
+        for doc in docs:
+            pair = self._extract_pair_from_doc(doc)
+            if pair is not None:
+                pairs.append(pair)
+                if max_items is not None and len(pairs) >= max_items:
+                    break
+        if not pairs:
+            task_name = getattr(lm_eval_task_data, "NAME", type(lm_eval_task_data).__name__)
+            log.warning("No valid pairs extracted", extra={"task": task_name})
+        return pairs
+    def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
+        """
+        Convert a single Noticia doc into a ContrastivePair, if possible.
+        Returns None when required fields are missing or malformed.
+        Noticia format (Spanish clickbait summarization):
+        - web_headline: the sensationalist/clickbait headline
+        - web_text: the actual news article body
+        - summary: the target one-sentence summary revealing the truth
+        """
+        log = bind(_LOG, doc_id=doc.get("id", "unknown"))
+        try:
+            # Noticia format
+            if "web_headline" in doc and "web_text" in doc and "summary" in doc:
+                headline = str(doc.get("web_headline", "")).strip()
+                text = str(doc.get("web_text", "")).strip()
+                summary = str(doc.get("summary", "")).strip()
+                if not headline or not text or not summary:
+                    log.debug("Skipping doc with missing headline/text/summary", extra={"doc": doc})
+                    return None
+                # Prompt is the instruction + headline + text (as in lm-eval)
+                prompt = f"Ahora eres una Inteligencia Artificial experta en desmontar titulares sensacionalistas o clickbait. Tu tarea consiste en analizar noticias con titulares sensacionalistas y generar un resumen de una sola frase que revele la verdad detrás del titular.\nEste es el titular de la noticia: {headline}\nEl titular plantea una pregunta o proporciona información incompleta. Debes buscar en el cuerpo de la noticia una frase que responda lo que se sugiere en el título. Siempre que puedas cita el texto original, especialmente si se trata de una frase que alguien ha dicho. Si citas una frase que alguien ha dicho, usa comillas para indicar que es una cita. Usa siempre las mínimas palabras posibles. No es necesario que la respuesta sea una oración completa, puede ser sólo el foco de la pregunta. Recuerda responder siempre en Español.\nEste es el cuerpo de la noticia:\n{text}"
+                # Positive: the actual summary
+                correct = summary
+                # Negative: generic refusal (similar to other summarization tasks)
+                incorrect = "No puedo proporcionar un resumen de esta noticia."
+                metadata = {"label": "noticia"}
+                return self._build_pair(
+                    question=prompt,
+                    correct=correct,
+                    incorrect=incorrect,
+                    metadata=metadata,
+                )
+            log.debug("Skipping doc without web_headline/web_text/summary fields", extra={"doc": doc})
+            return None
+        except Exception as exc:
+            log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
+            return None
+    @staticmethod
+    def _build_pair(
+        question: str,
+        correct: str,
+        incorrect: str,
+        metadata: dict[str, Any] | None = None,
+    ) -> ContrastivePair:
+        positive_response = PositiveResponse(model_response=correct)
+        negative_response = NegativeResponse(model_response=incorrect)
+        return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py ADDED Viewed

@@ -0,0 +1,135 @@
+from __future__ import annotations
+import random
+from typing import Any, TYPE_CHECKING
+from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
+from wisent.core.contrastive_pairs.lm_eval_pairs.atoms import LMEvalBenchmarkExtractor
+from wisent.core.cli_logger import setup_logger, bind
+if TYPE_CHECKING:
+    from lm_eval.api.task import ConfigurableTask
+__all__ = ["NQOpenExtractor"]  # the only name exported by `from <module> import *`
+_LOG = setup_logger(__name__)  # module-level logger; per-call context is attached with bind()
+task_names = ("nq_open",)  # task names tied to this extractor; presumably read by an extractor registry (verify against caller)
+class NQOpenExtractor(LMEvalBenchmarkExtractor):
+    """Extractor for the Natural Questions Open benchmark."""
+    evaluator_name = "exact_match"
+    def extract_contrastive_pairs(
+        self,
+        lm_eval_task_data: ConfigurableTask,
+        limit: int | None = None,
+    ) -> list[ContrastivePair]:
+        """
+        Build contrastive pairs from Natural Questions docs.
+        Natural Questions schema:
+            - question: str
+            - answer: list[str] (multiple acceptable answers)
+        Args:
+            lm_eval_task_data: lm-eval task instance for NQ Open.
+            limit: Optional maximum number of pairs to produce.
+        Returns:
+            A list of ContrastivePair objects.
+        """
+        log = bind(_LOG, task=getattr(lm_eval_task_data, "NAME", "unknown"))
+        max_items = self._normalize_limit(limit)
+        docs = self.load_docs(lm_eval_task_data, max_items)
+        pairs: list[ContrastivePair] = []
+        log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
+        for doc in docs:
+            pair = self._extract_pair_from_doc(doc)
+            if pair is not None:
+                pairs.append(pair)
+                if max_items is not None and len(pairs) >= max_items:
+                    break
+        if not pairs:
+            task_name = getattr(lm_eval_task_data, "NAME", type(lm_eval_task_data).__name__)
+            log.warning("No valid NQ Open pairs extracted", extra={"task": task_name})
+        return pairs
+    def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
+        """
+        Convert a single Natural Questions doc into a ContrastivePair, if possible.
+        Returns None when required fields are missing or malformed.
+        """
+        log = bind(_LOG, doc_id=doc.get("id", "unknown"))
+        try:
+            question = str(doc.get("question", "")).strip()
+            answer = doc.get("answer", [])
+            if not question or not answer:
+                log.debug(
+                    "Skipping doc due to missing/invalid fields",
+                    extra={"doc": doc},
+                )
+                return None
+            # Natural Questions has multiple acceptable answers, use the first one
+            if isinstance(answer, list):
+                correct = answer[0]
+            else:
+                correct = str(answer)
+            # Generate incorrect answer
+            incorrect = None
+            try:
+                # Try to convert to number
+                num = float(correct)
+                # Check if it's an integer
+                if num.is_integer():
+                    incorrect = str(int(num) + 1)
+                else:
+                    incorrect = str(num + 1)
+            except ValueError:
+                # It's a string, shuffle the letters until different
+                letters = list(correct)
+                incorrect = correct
+                random.shuffle(letters)
+                incorrect = ''.join(letters)
+                if incorrect == correct:
+                    incorrect += "k"
+            formatted_question = f"Question: {question}\nAnswer:\nA. {incorrect}\nB. {correct}"
+            metadata = {
+                "label": "nq_open",
+            }
+            return self._build_pair(
+                question=formatted_question,
+                correct=correct,
+                incorrect=incorrect,
+                metadata=metadata,
+            )
+        except Exception as exc:
+            log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
+            return None
+    @staticmethod
+    def _build_pair(
+        question: str,
+        correct: str,
+        incorrect: str,
+        metadata: dict[str, Any] | None = None,
+    ) -> ContrastivePair:
+        positive_response = PositiveResponse(model_response=correct)
+        negative_response = NegativeResponse(model_response=incorrect)
+        return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi.py ADDED Viewed

@@ -0,0 +1,27 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+from wisent.core.contrastive_pairs.lm_eval_pairs.lm_task_extractors.super_glue import SuperGlueExtractor
+if TYPE_CHECKING:
+    pass  # NOTE(review): empty type-checking guard; it imports nothing
+__all__ = ["OkapiExtractor"]  # the only name exported by `from <module> import *`
+# All okapi tasks are now managed by their specific extractors:
+# - arc_* tasks: OkapiArcMultilingualExtractor
+# - hellaswag_* tasks: OkapiHellaswagMultilingualExtractor
+# - m_mmlu_* tasks: OkapiMmluMultilingualExtractor
+# - truthfulqa_* tasks: OkapiTruthfulqaMultilingualExtractor
+task_names = ()  # intentionally empty: this module claims no task names itself
+class OkapiExtractor(SuperGlueExtractor):
+    """Extractor for Okapi multilingual benchmarks (arc, hellaswag, mmlu, truthfulqa).
+    Okapi benchmarks use the same multiple-choice format as SuperGlue, so we inherit
+    the extraction logic directly from SuperGlueExtractor.
+    """
+    evaluator_name = "log_likelihoods"
+    pass

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py ADDED Viewed

@@ -0,0 +1,167 @@
+from __future__ import annotations
+from typing import Any, TYPE_CHECKING
+from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
+from wisent.core.contrastive_pairs.lm_eval_pairs.atoms import LMEvalBenchmarkExtractor
+from wisent.core.cli_logger import setup_logger, bind
+if TYPE_CHECKING:
+    from lm_eval.api.task import ConfigurableTask
+__all__ = ["OkapiArcMultilingualExtractor"]  # the only name exported by `from <module> import *`
+_LOG = setup_logger(__name__)  # module-level logger; per-call context is attached with bind()
+task_names = (  # "arc_<language code>" task names tied to this extractor; presumably read by an extractor registry (verify against caller)
+    "arc_ar", "arc_bn", "arc_ca", "arc_da", "arc_de", "arc_es", "arc_eu", "arc_fr",
+    "arc_gu", "arc_hi", "arc_hr", "arc_hu", "arc_hy", "arc_id", "arc_it", "arc_kn",
+    "arc_ml", "arc_mr", "arc_ne", "arc_nl", "arc_pt", "arc_ro", "arc_ru", "arc_sk",
+    "arc_sr", "arc_sv", "arc_ta", "arc_te", "arc_uk", "arc_vi", "arc_zh"
+)
+class OkapiArcMultilingualExtractor(LMEvalBenchmarkExtractor):
+    """Extractor for the Okapi/Arc Multilingual benchmark."""
+    evaluator_name = "log_likelihoods"
+    def extract_contrastive_pairs(
+        self,
+        lm_eval_task_data: ConfigurableTask,
+        limit: int | None = None,
+        preferred_doc: str | None = None,
+    ) -> list[ContrastivePair]:
+        """
+        Build contrastive pairs from Okapi/Arc Multilingual docs.
+        Args:
+            lm_eval_task_data: lm-eval task instance for Okapi/Arc Multilingual.
+            limit: Optional maximum number of pairs to produce.
+            preferred_doc: Optional preferred document source.
+        Returns:
+            A list of ContrastivePair objects.
+        """
+        log = bind(_LOG, task=getattr(lm_eval_task_data, "NAME", "unknown"))
+        max_items = self._normalize_limit(limit)
+        docs = self.load_docs(lm_eval_task_data, max_items, preferred_doc=preferred_doc)
+        pairs: list[ContrastivePair] = []
+        log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
+        for doc in docs:
+            pair = self._extract_pair_from_doc(doc)
+            if pair is not None:
+                pairs.append(pair)
+                if max_items is not None and len(pairs) >= max_items:
+                    break
+        if not pairs:
+            task_name = getattr(lm_eval_task_data, "NAME", type(lm_eval_task_data).__name__)
+            log.warning("No valid Okapi/Arc Multilingual pairs extracted", extra={"task": task_name})
+        return pairs
+    def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
+        """
+        Convert a single Okapi/Arc Multilingual doc into a ContrastivePair, if possible.
+        Returns None when required fields are missing or malformed.
+        """
+        log = bind(_LOG, doc_id=doc.get("id", "unknown"))
+        try:
+            # Try multiple possible schema formats
+            question = None
+            choices = None
+            answer_idx = None
+            # Format 1: question + choices + answer
+            if "question" in doc and "choices" in doc:
+                question = str(doc.get("question", "")).strip()
+                choices_data = doc.get("choices", {})
+                if isinstance(choices_data, dict):
+                    choices = choices_data.get("text", [])
+                elif isinstance(choices_data, list):
+                    choices = choices_data
+                answer = doc.get("answer", doc.get("answerKey", ""))
+                if isinstance(answer, str) and len(answer) == 1 and answer.isalpha():
+                    answer_idx = ord(answer.upper()) - ord('A')
+                else:
+                    answer_idx = int(answer) if answer else 0
+            # Format 2: instruction + option_a/b/c/d + answer (MMMLU style)
+            elif "instruction" in doc and "option_a" in doc:
+                question = str(doc.get("instruction", "")).strip()
+                choices = [
+                    str(doc.get("option_a", "")).strip(),
+                    str(doc.get("option_b", "")).strip(),
+                    str(doc.get("option_c", "")).strip(),
+                    str(doc.get("option_d", "")).strip(),
+                ]
+                choices = [c for c in choices if c]
+                answer = doc.get("answer", "A")
+                answer_idx = ord(str(answer).upper()) - ord('A')
+            # Format 3: Okapi format (query + gold + choices)
+            elif "query" in doc and "gold" in doc and "choices" in doc:
+                question = str(doc.get("query", "")).strip()
+                choices = doc.get("choices", [])
+                answer_idx = int(doc.get("gold", 0))
+                # choices is a list, gold is the index
+            # Format 4: query/prompt + answer (open-ended)
+            elif "query" in doc or "prompt" in doc:
+                question = str(doc.get("query", doc.get("prompt", ""))).strip()
+                # For open-ended questions, use target as correct answer
+                correct_answer = str(doc.get("target", doc.get("answer", ""))).strip()
+                if correct_answer:
+                    metadata = {"label": "okapi/arc_multilingual"}
+                    return self._build_pair(
+                        question=f"Question: {question}",
+                        correct=correct_answer,
+                        incorrect="incorrect answer",
+                        metadata=metadata,
+                    )
+                return None
+            if not question or not choices or answer_idx is None or not (0 <= answer_idx < len(choices)):
+                log.debug(
+                    "Skipping doc due to missing/invalid fields",
+                    extra={"doc": doc},
+                )
+                return None
+            correct = choices[answer_idx]
+            incorrect_idx = (answer_idx + 1) % len(choices)
+            incorrect = choices[incorrect_idx]
+            formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
+            metadata = {
+                "label": "okapi/arc_multilingual",
+            }
+            return self._build_pair(
+                question=formatted_question,
+                correct=correct,
+                incorrect=incorrect,
+                metadata=metadata,
+            )
+        except Exception as exc:
+            log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
+            return None
+    @staticmethod
+    def _build_pair(
+        question: str,
+        correct: str,
+        incorrect: str,
+        metadata: dict[str, Any] | None = None,
+    ) -> ContrastivePair:
+        positive_response = PositiveResponse(model_response=correct)
+        negative_response = NegativeResponse(model_response=incorrect)
+        return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py ADDED Viewed

@@ -0,0 +1,174 @@
+from __future__ import annotations
+from typing import Any, TYPE_CHECKING
+from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+from wisent.core.contrastive_pairs.core.response import NegativeResponse, PositiveResponse
+from wisent.core.contrastive_pairs.lm_eval_pairs.atoms import LMEvalBenchmarkExtractor
+from wisent.core.cli_logger import setup_logger, bind
+if TYPE_CHECKING:
+    from lm_eval.api.task import ConfigurableTask
+__all__ = ["OkapiHellaswagMultilingualExtractor"]  # the only name exported by `from <module> import *`
+_LOG = setup_logger(__name__)  # module-level logger; per-call context is attached with bind()
+task_names = (  # "hellaswag_<language code>" task names tied to this extractor; presumably read by an extractor registry (verify against caller)
+    "hellaswag_ar", "hellaswag_bn", "hellaswag_ca", "hellaswag_da", "hellaswag_de",
+    "hellaswag_es", "hellaswag_eu", "hellaswag_fr", "hellaswag_gu", "hellaswag_hi",
+    "hellaswag_hr", "hellaswag_hu", "hellaswag_hy", "hellaswag_id", "hellaswag_it",
+    "hellaswag_kn", "hellaswag_ml", "hellaswag_mr", "hellaswag_ne", "hellaswag_nl",
+    "hellaswag_pt", "hellaswag_ro", "hellaswag_ru", "hellaswag_sk", "hellaswag_sr",
+    "hellaswag_sv", "hellaswag_ta", "hellaswag_te", "hellaswag_uk", "hellaswag_vi"
+)
+class OkapiHellaswagMultilingualExtractor(LMEvalBenchmarkExtractor):
+    """Extractor for the Okapi/Hellaswag Multilingual benchmark."""
+    evaluator_name = "log_likelihoods"
+    def extract_contrastive_pairs(
+        self,
+        lm_eval_task_data: ConfigurableTask,
+        limit: int | None = None,
+        preferred_doc: str | None = None,
+    ) -> list[ContrastivePair]:
+        """
+        Build contrastive pairs from Okapi/Hellaswag Multilingual docs.
+        Args:
+            lm_eval_task_data: lm-eval task instance for Okapi/Hellaswag Multilingual.
+            limit: Optional maximum number of pairs to produce.
+            preferred_doc: Optional preferred document source.
+        Returns:
+            A list of ContrastivePair objects.
+        """
+        log = bind(_LOG, task=getattr(lm_eval_task_data, "NAME", "unknown"))
+        max_items = self._normalize_limit(limit)
+        docs = self.load_docs(lm_eval_task_data, max_items, preferred_doc=preferred_doc)
+        pairs: list[ContrastivePair] = []
+        log.info("Extracting contrastive pairs", extra={"doc_count": len(docs)})
+        for doc in docs:
+            pair = self._extract_pair_from_doc(doc)
+            if pair is not None:
+                pairs.append(pair)
+                if max_items is not None and len(pairs) >= max_items:
+                    break
+        if not pairs:
+            task_name = getattr(lm_eval_task_data, "NAME", type(lm_eval_task_data).__name__)
+            log.warning("No valid Okapi/Hellaswag Multilingual pairs extracted", extra={"task": task_name})
+        return pairs
+    def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
+        """
+        Convert a single Okapi/Hellaswag Multilingual doc into a ContrastivePair, if possible.
+        Returns None when required fields are missing or malformed.
+        """
+        log = bind(_LOG, doc_id=doc.get("id", "unknown"))
+        try:
+            # Try multiple possible schema formats
+            question = None
+            choices = None
+            answer_idx = None
+            # Format 1: query + choices + gold/label (Hellaswag style)
+            if ("query" in doc or "prompt" in doc) and "choices" in doc:
+                question = str(doc.get("query", doc.get("prompt", ""))).strip()
+                choices = doc.get("choices", [])
+                if not isinstance(choices, list):
+                    choices = []
+                # Try gold, label, or answer field
+                answer = doc.get("gold", doc.get("label", doc.get("answer", "")))
+                if isinstance(answer, str):
+                    if len(answer) == 1 and answer.isalpha():
+                        answer_idx = ord(answer.upper()) - ord('A')
+                    else:
+                        try:
+                            answer_idx = int(answer)
+                        except (ValueError, TypeError):
+                            answer_idx = 0
+                else:
+                    answer_idx = int(answer) if answer else 0
+            # Format 2: question + choices + answer
+            elif "question" in doc and "choices" in doc:
+                question = str(doc.get("question", "")).strip()
+                choices_data = doc.get("choices", {})
+                if isinstance(choices_data, dict):
+                    choices = choices_data.get("text", [])
+                elif isinstance(choices_data, list):
+                    choices = choices_data
+                answer = doc.get("answer", doc.get("answerKey", ""))
+                if isinstance(answer, str) and len(answer) == 1 and answer.isalpha():
+                    answer_idx = ord(answer.upper()) - ord('A')
+                else:
+                    answer_idx = int(answer) if answer else 0
+            # Format 3: instruction + option_a/b/c/d + answer (MMMLU style)
+            elif "instruction" in doc and "option_a" in doc:
+                question = str(doc.get("instruction", "")).strip()
+                choices = [
+                    str(doc.get("option_a", "")).strip(),
+                    str(doc.get("option_b", "")).strip(),
+                    str(doc.get("option_c", "")).strip(),
+                    str(doc.get("option_d", "")).strip(),
+                ]
+                choices = [c for c in choices if c]
+                answer = doc.get("answer", "A")
+                answer_idx = ord(str(answer).upper()) - ord('A')
+            if not question or not choices or answer_idx is None or not (0 <= answer_idx < len(choices)):
+                log.debug(
+                    "Skipping doc due to missing/invalid fields",
+                    extra={"doc": doc},
+                )
+                return None
+            correct = choices[answer_idx].strip() if isinstance(choices[answer_idx], str) else str(choices[answer_idx])
+            incorrect_idx = (answer_idx + 1) % len(choices)
+            incorrect = choices[incorrect_idx].strip() if isinstance(choices[incorrect_idx], str) else str(choices[incorrect_idx])
+            # Validate that both correct and incorrect are non-empty
+            if not correct or not incorrect:
+                log.debug(
+                    "Skipping doc due to empty correct or incorrect answer",
+                    extra={"doc": doc, "correct": correct, "incorrect": incorrect},
+                )
+                return None
+            formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
+            metadata = {
+                "label": "okapi/hellaswag_multilingual",
+            }
+            return self._build_pair(
+                question=formatted_question,
+                correct=correct,
+                incorrect=incorrect,
+                metadata=metadata,
+            )
+        except Exception as exc:
+            log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
+            return None
+    @staticmethod
+    def _build_pair(
+        question: str,
+        correct: str,
+        incorrect: str,
+        metadata: dict[str, Any] | None = None,
+    ) -> ContrastivePair:
+        positive_response = PositiveResponse(model_response=correct)
+        negative_response = NegativeResponse(model_response=incorrect)
+        return ContrastivePair(prompt=question, positive_response=positive_response, negative_response=negative_response, label=metadata.get("label"))