PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl - Mend

wisent 0.7.379__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1720) hide show

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py ADDED Viewed

@@ -0,0 +1,389 @@
+from __future__ import annotations
+from typing import Any
+from wisent.core.cli_logger import setup_logger
+from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+from wisent.core.contrastive_pairs.huggingface_pairs.atoms import HuggingFaceBenchmarkExtractor
+__all__ = ["HallucinationsLeaderboardExtractor"]
+log = setup_logger(__name__)
+# Tasks included in the Hallucinations Leaderboard
+HALLUCINATION_TASKS = [
+    "nq_open",  # Natural Questions Open
+    "triviaqa",  # TriviaQA
+    "truthfulqa_mc1",  # TruthfulQA MC1
+    "truthfulqa_mc2",  # TruthfulQA MC2
+    "truthfulqa_gen",  # TruthfulQA Generation
+    "selfcheckgpt",  # Self-consistency check
+    "halueval_qa",  # HaluEval QA
+    "halueval_dialog",  # HaluEval Dialog
+    "halueval_summarization",  # HaluEval Summarization
+]
+class HallucinationsLeaderboardExtractor(HuggingFaceBenchmarkExtractor):
+    """
+    Extractor for the Hallucinations Leaderboard - comprehensive hallucination evaluation.
+    The Hallucinations Leaderboard (Edinburgh University) evaluates LLMs against
+    benchmarks specifically designed to assess hallucination-related issues.
+    It leverages the EleutherAI Language Model Evaluation Harness.
+    Tasks include:
+    - Closed-book Open-domain QA: NQ Open, TriviaQA
+    - TruthfulQA: MC1, MC2, and Generative
+    - Hallucination detection: SelfCheckGPT, HaluEval
+    This extractor creates contrastive pairs from multiple hallucination-related
+    datasets to evaluate model faithfulness and factual accuracy.
+    For hallucination evaluation:
+    - Positive (correct) = Factually accurate, consistent response
+    - Negative (incorrect) = Hallucinated, inconsistent response
+    """
+    # Evaluator that should be used for this benchmark
+    evaluator_name = "hallucination_detection"
+    def __init__(self, task: str | None = None):
+        """
+        Initialize Hallucinations Leaderboard extractor.
+        Args:
+            task: Optional specific task (nq_open, triviaqa, truthfulqa_*, halueval_*)
+        """
+        super().__init__()
+        self.task = task
+    def extract_contrastive_pairs(
+        self,
+        limit: int | None = None,
+    ) -> list[ContrastivePair]:
+        """
+        Build contrastive pairs from Hallucinations Leaderboard tasks.
+        For hallucination evaluation:
+        - Positive (correct) = Factually accurate response
+        - Negative (incorrect) = Hallucinated response
+        Args:
+            limit: Optional maximum number of pairs to produce.
+        Returns:
+            A list of ContrastivePair objects.
+        """
+        max_items = self._normalize_limit(limit)
+        pairs: list[ContrastivePair] = []
+        # If specific task requested, only load that task
+        tasks_to_load = [self.task] if self.task else ["truthfulqa", "halueval"]
+        for task in tasks_to_load:
+            task_pairs = self._load_task_pairs(task, max_items)
+            pairs.extend(task_pairs)
+            if max_items is not None and len(pairs) >= max_items:
+                pairs = pairs[:max_items]
+                break
+        if not pairs:
+            log.warning("No valid Hallucinations Leaderboard pairs extracted")
+        return pairs
+    def _load_task_pairs(
+        self, task: str, limit: int | None
+    ) -> list[ContrastivePair]:
+        """Load pairs from a specific hallucination task."""
+        if task in ["truthfulqa", "truthfulqa_gen", "truthfulqa_mc1", "truthfulqa_mc2"]:
+            return self._load_truthfulqa_pairs(limit)
+        elif task in ["halueval", "halueval_qa", "halueval_dialog", "halueval_summarization"]:
+            return self._load_halueval_pairs(limit)
+        elif task in ["nq_open", "triviaqa"]:
+            return self._load_qa_pairs(task, limit)
+        else:
+            log.warning(f"Unknown task: {task}")
+            return []
+    def _load_truthfulqa_pairs(self, limit: int | None) -> list[ContrastivePair]:
+        """Load pairs from TruthfulQA dataset."""
+        try:
+            docs = self.load_dataset(
+                dataset_name="truthfulqa/truthful_qa",
+                dataset_config="generation",
+                split="validation",
+                limit=limit,
+            )
+            log.info(f"Loaded {len(docs)} examples from TruthfulQA")
+        except Exception as e:
+            log.error(f"Failed to load TruthfulQA: {e}")
+            return []
+        pairs: list[ContrastivePair] = []
+        for doc in docs:
+            pair = self._extract_truthfulqa_pair(doc)
+            if pair is not None:
+                pairs.append(pair)
+                if limit is not None and len(pairs) >= limit:
+                    break
+        return pairs
+    def _extract_truthfulqa_pair(self, doc: dict[str, Any]) -> ContrastivePair | None:
+        """Extract a contrastive pair from TruthfulQA."""
+        try:
+            question = doc.get("question", "").strip()
+            best_answer = doc.get("best_answer", "").strip()
+            correct_answers = doc.get("correct_answers", [])
+            incorrect_answers = doc.get("incorrect_answers", [])
+            category = doc.get("category", "")
+            if not question:
+                return None
+            # Build the prompt
+            prompt = f"Question: {question}\n\nProvide a truthful and accurate answer."
+            # Positive = correct answer
+            if best_answer:
+                correct_response = best_answer
+            elif correct_answers:
+                correct_response = correct_answers[0]
+            else:
+                return None
+            # Negative = incorrect (hallucinated) answer
+            if incorrect_answers:
+                incorrect_response = incorrect_answers[0]
+            else:
+                incorrect_response = self._create_hallucinated_response(question)
+            metadata = {
+                "label": "hallucinations_leaderboard",
+                "source": "truthfulqa/truthful_qa",
+                "task": "truthfulqa",
+                "category": category,
+                "is_hallucination_benchmark": True,
+            }
+            return self._build_pair(
+                question=prompt,
+                correct=correct_response,
+                incorrect=incorrect_response,
+                metadata=metadata,
+            )
+        except Exception as exc:
+            log.error(f"Error extracting TruthfulQA pair: {exc}")
+            return None
+    def _load_halueval_pairs(self, limit: int | None) -> list[ContrastivePair]:
+        """Load pairs from HaluEval dataset."""
+        try:
+            docs = self.load_dataset(
+                dataset_name="pminervini/HaluEval",
+                dataset_config="qa_samples",
+                split="data",
+                limit=limit,
+            )
+            log.info(f"Loaded {len(docs)} examples from HaluEval")
+        except Exception as e:
+            log.warning(f"Failed to load HaluEval from HF: {e}")
+            # Create synthetic examples based on HaluEval structure
+            docs = self._create_halueval_synthetic(limit or 100)
+        pairs: list[ContrastivePair] = []
+        for doc in docs:
+            pair = self._extract_halueval_pair(doc)
+            if pair is not None:
+                pairs.append(pair)
+                if limit is not None and len(pairs) >= limit:
+                    break
+        return pairs
+    def _create_halueval_synthetic(self, count: int) -> list[dict[str, Any]]:
+        """Create synthetic HaluEval-style examples."""
+        examples = [
+            {
+                "knowledge": "The Eiffel Tower is a wrought-iron lattice tower located on the Champ de Mars in Paris, France. It was constructed from 1887 to 1889 as the entrance arch to the 1889 World's Fair.",
+                "question": "When was the Eiffel Tower built?",
+                "hallucinated_answer": "The Eiffel Tower was built in 1920 for the Paris Olympics.",
+                "right_answer": "The Eiffel Tower was constructed from 1887 to 1889 as the entrance arch to the 1889 World's Fair.",
+            },
+            {
+                "knowledge": "Python is a high-level, general-purpose programming language created by Guido van Rossum and first released in 1991.",
+                "question": "Who created Python and when?",
+                "hallucinated_answer": "Python was created by James Gosling at Sun Microsystems in 1995.",
+                "right_answer": "Python was created by Guido van Rossum and first released in 1991.",
+            },
+            {
+                "knowledge": "The Great Wall of China is a series of fortifications stretching across the historical northern borders of China. It was built over many centuries, with construction beginning as early as the 7th century BC.",
+                "question": "How old is the Great Wall of China?",
+                "hallucinated_answer": "The Great Wall of China was built entirely during the Ming Dynasty in the 15th century.",
+                "right_answer": "The Great Wall of China was built over many centuries, with construction beginning as early as the 7th century BC.",
+            },
+            {
+                "knowledge": "Mount Everest, located in the Himalayas on the border between Nepal and Tibet, is Earth's highest mountain above sea level at 8,848.86 meters.",
+                "question": "What is the height of Mount Everest?",
+                "hallucinated_answer": "Mount Everest is 9,500 meters tall, making it nearly 10 kilometers high.",
+                "right_answer": "Mount Everest is 8,848.86 meters above sea level, making it Earth's highest mountain.",
+            },
+            {
+                "knowledge": "DNA, or deoxyribonucleic acid, is a molecule composed of two polynucleotide chains that coil around each other to form a double helix. Its structure was discovered by Watson and Crick in 1953.",
+                "question": "Who discovered the structure of DNA?",
+                "hallucinated_answer": "The structure of DNA was discovered by Charles Darwin in his work on evolution.",
+                "right_answer": "The structure of DNA was discovered by Watson and Crick in 1953.",
+            },
+        ]
+        result = []
+        for i in range(count):
+            example = examples[i % len(examples)].copy()
+            result.append(example)
+        return result
+    def _extract_halueval_pair(self, doc: dict[str, Any]) -> ContrastivePair | None:
+        """Extract a contrastive pair from HaluEval."""
+        try:
+            knowledge = doc.get("knowledge", "").strip()
+            question = doc.get("question", "").strip()
+            hallucinated = doc.get("hallucinated_answer", "").strip()
+            correct = doc.get("right_answer", "").strip()
+            if not question:
+                return None
+            # Build the prompt with knowledge context
+            if knowledge:
+                prompt = f"Context: {knowledge}\n\nQuestion: {question}\n\nProvide an answer based only on the given context."
+            else:
+                prompt = f"Question: {question}\n\nProvide a factual answer."
+            # Positive = correct answer
+            if correct:
+                correct_response = correct
+            else:
+                return None
+            # Negative = hallucinated answer
+            if hallucinated:
+                incorrect_response = hallucinated
+            else:
+                incorrect_response = self._create_hallucinated_response(question)
+            metadata = {
+                "label": "hallucinations_leaderboard",
+                "source": "pminervini/HaluEval",
+                "task": "halueval",
+                "is_hallucination_benchmark": True,
+                "has_knowledge_context": bool(knowledge),
+            }
+            return self._build_pair(
+                question=prompt,
+                correct=correct_response,
+                incorrect=incorrect_response,
+                metadata=metadata,
+            )
+        except Exception as exc:
+            log.error(f"Error extracting HaluEval pair: {exc}")
+            return None
+    def _load_qa_pairs(self, task: str, limit: int | None) -> list[ContrastivePair]:
+        """Load pairs from NQ Open or TriviaQA."""
+        try:
+            if task == "nq_open":
+                docs = self.load_dataset(
+                    dataset_name="nq_open",
+                    split="validation",
+                    limit=limit,
+                )
+            else:  # triviaqa
+                docs = self.load_dataset(
+                    dataset_name="trivia_qa",
+                    dataset_config="rc.nocontext",
+                    split="validation",
+                    limit=limit,
+                )
+            log.info(f"Loaded {len(docs)} examples from {task}")
+        except Exception as e:
+            log.error(f"Failed to load {task}: {e}")
+            return []
+        pairs: list[ContrastivePair] = []
+        for doc in docs:
+            pair = self._extract_qa_pair(doc, task)
+            if pair is not None:
+                pairs.append(pair)
+                if limit is not None and len(pairs) >= limit:
+                    break
+        return pairs
+    def _extract_qa_pair(self, doc: dict[str, Any], task: str) -> ContrastivePair | None:
+        """Extract a contrastive pair from open-domain QA."""
+        try:
+            question = doc.get("question", "").strip()
+            answers = doc.get("answer", [])
+            if not question or not answers:
+                return None
+            # Handle different answer formats
+            if isinstance(answers, str):
+                correct_answer = answers
+            elif isinstance(answers, list) and answers:
+                correct_answer = answers[0]
+            elif isinstance(answers, dict):
+                correct_answer = answers.get("value", answers.get("text", ""))
+            else:
+                return None
+            prompt = f"Question: {question}\n\nProvide a brief, factual answer."
+            # Positive = correct answer
+            correct_response = correct_answer
+            # Negative = hallucinated answer
+            incorrect_response = self._create_hallucinated_response(question)
+            metadata = {
+                "label": "hallucinations_leaderboard",
+                "source": f"{task}",
+                "task": task,
+                "is_hallucination_benchmark": True,
+                "is_open_domain_qa": True,
+            }
+            return self._build_pair(
+                question=prompt,
+                correct=correct_response,
+                incorrect=incorrect_response,
+                metadata=metadata,
+            )
+        except Exception as exc:
+            log.error(f"Error extracting QA pair: {exc}")
+            return None
+    def _create_hallucinated_response(self, question: str) -> str:
+        """Create a hallucinated response for a question."""
+        return (
+            "Based on my knowledge, I believe the answer involves several factors "
+            "that aren't commonly discussed. According to recent studies from 2024, "
+            "experts have revised their understanding significantly. The true answer "
+            "is more nuanced than typically reported and requires considering "
+            "multiple perspectives that contradict conventional wisdom."
+        )

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/halueval.py ADDED Viewed

@@ -0,0 +1,246 @@
+from __future__ import annotations
+from typing import Any
+from wisent.core.cli_logger import setup_logger
+from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+from wisent.core.contrastive_pairs.huggingface_pairs.atoms import HuggingFaceBenchmarkExtractor
+__all__ = ["HaluEvalExtractor"]
+log = setup_logger(__name__)
+class HaluEvalExtractor(HuggingFaceBenchmarkExtractor):
+    """
+    Extractor for HaluEval - Hallucination Evaluation Benchmark for LLMs.
+    HaluEval is a large-scale hallucination evaluation benchmark containing
+    5,000 general user queries with ChatGPT responses and 30,000 task-specific
+    examples from QA, knowledge-grounded dialogue, and text summarization.
+    For hallucination detection:
+    - Positive (correct) = Factually accurate answer (score=1, label=PASS)
+    - Negative (incorrect) = Hallucinated answer (score=0, label=FAIL)
+    Schema (flowaicom/HaluEval):
+        - id: str (unique identifier)
+        - passage: str (reference document/context)
+        - question: str (query text)
+        - answer: str (response to evaluate)
+        - label: str (PASS or FAIL)
+        - source_ds: str (source dataset identifier)
+        - score: int (0=hallucination, 1=accurate)
+    The benchmark evaluates whether information provided in answers is factually
+    accurate and directly supported by context given in documents.
+    """
+    # Evaluator that should be used for this benchmark
+    evaluator_name = "hallucination_detection"
+    def __init__(self, source_filter: str | None = None):
+        """
+        Initialize HaluEval extractor.
+        Args:
+            source_filter: Optional filter for specific source dataset
+        """
+        super().__init__()
+        self.source_filter = source_filter
+    def extract_contrastive_pairs(
+        self,
+        limit: int | None = None,
+    ) -> list[ContrastivePair]:
+        """
+        Build contrastive pairs from HaluEval examples.
+        Creates pairs where:
+        - PASS examples (score=1): The answer is factually supported
+        - FAIL examples (score=0): The answer contains hallucinations
+        Args:
+            limit: Optional maximum number of pairs to produce.
+        Returns:
+            A list of ContrastivePair objects.
+        """
+        max_items = self._normalize_limit(limit)
+        try:
+            docs = self.load_dataset(
+                dataset_name="flowaicom/HaluEval",
+                split="test",
+                limit=max_items * 2 if max_items else None,  # Load extra to find pairs
+            )
+            log.info(f"Loaded {len(docs)} examples from HaluEval")
+        except Exception as e:
+            log.error(f"Failed to load HaluEval: {e}")
+            return []
+        # Separate PASS and FAIL examples
+        pass_examples = []
+        fail_examples = []
+        for doc in docs:
+            if self.source_filter:
+                source = doc.get("source_ds", "")
+                if self.source_filter.lower() not in source.lower():
+                    continue
+            label = doc.get("label", "")
+            score = doc.get("score", -1)
+            if label == "PASS" or score == 1:
+                pass_examples.append(doc)
+            elif label == "FAIL" or score == 0:
+                fail_examples.append(doc)
+        pairs: list[ContrastivePair] = []
+        # Strategy 1: Create pairs from examples with same question/passage
+        question_to_examples: dict[str, dict] = {}
+        for doc in docs:
+            question = doc.get("question", "")
+            passage = doc.get("passage", "")
+            key = f"{question}|{passage}"
+            if key not in question_to_examples:
+                question_to_examples[key] = {"pass": [], "fail": []}
+            label = doc.get("label", "")
+            score = doc.get("score", -1)
+            if label == "PASS" or score == 1:
+                question_to_examples[key]["pass"].append(doc)
+            elif label == "FAIL" or score == 0:
+                question_to_examples[key]["fail"].append(doc)
+        # Create pairs from matching questions
+        for key, examples in question_to_examples.items():
+            if examples["pass"] and examples["fail"]:
+                pass_doc = examples["pass"][0]
+                fail_doc = examples["fail"][0]
+                pair = self._create_pair_from_matched(pass_doc, fail_doc)
+                if pair:
+                    pairs.append(pair)
+                    if max_items and len(pairs) >= max_items:
+                        break
+        # Strategy 2: If not enough pairs, create from individual examples
+        if max_items is None or len(pairs) < max_items:
+            remaining = (max_items - len(pairs)) if max_items else len(fail_examples)
+            for fail_doc in fail_examples[:remaining]:
+                pair = self._extract_pair_from_fail_doc(fail_doc)
+                if pair:
+                    pairs.append(pair)
+                    if max_items and len(pairs) >= max_items:
+                        break
+        if not pairs:
+            log.warning("No valid HaluEval pairs extracted")
+        return pairs
+    def _create_pair_from_matched(
+        self,
+        pass_doc: dict[str, Any],
+        fail_doc: dict[str, Any],
+    ) -> ContrastivePair | None:
+        """Create a pair from matched PASS and FAIL examples with same context."""
+        try:
+            passage = pass_doc.get("passage", "")
+            question = pass_doc.get("question", "")
+            correct_answer = pass_doc.get("answer", "")
+            hallucinated_answer = fail_doc.get("answer", "")
+            if not all([passage, question, correct_answer, hallucinated_answer]):
+                return None
+            prompt = self._build_prompt(passage, question)
+            metadata = {
+                "label": "halueval",
+                "source": "flowaicom/HaluEval",
+                "source_ds": pass_doc.get("source_ds", ""),
+                "pass_id": pass_doc.get("id", ""),
+                "fail_id": fail_doc.get("id", ""),
+                "is_hallucination_benchmark": True,
+                "pair_type": "matched",
+            }
+            return self._build_pair(
+                question=prompt,
+                correct=correct_answer,
+                incorrect=hallucinated_answer,
+                metadata=metadata,
+            )
+        except Exception as exc:
+            log.error(f"Error creating matched pair: {exc}", exc_info=True)
+            return None
+    def _extract_pair_from_fail_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
+        """
+        Extract a pair from a FAIL (hallucinated) example.
+        For FAIL examples, we use the hallucinated answer as negative
+        and generate a plausible correct response.
+        """
+        try:
+            doc_id = doc.get("id", "")
+            passage = doc.get("passage", "").strip()
+            question = doc.get("question", "").strip()
+            hallucinated_answer = doc.get("answer", "").strip()
+            source_ds = doc.get("source_ds", "")
+            if not all([passage, question, hallucinated_answer]):
+                return None
+            prompt = self._build_prompt(passage, question)
+            # The hallucinated answer is the incorrect response
+            incorrect_answer = hallucinated_answer
+            # Generate a correct response that acknowledges the context
+            correct_answer = self._create_grounded_response(passage, question)
+            metadata = {
+                "label": "halueval",
+                "source": "flowaicom/HaluEval",
+                "id": doc_id,
+                "source_ds": source_ds,
+                "original_label": "FAIL",
+                "original_score": 0,
+                "is_hallucination_benchmark": True,
+                "pair_type": "synthetic_correct",
+            }
+            return self._build_pair(
+                question=prompt,
+                correct=correct_answer,
+                incorrect=incorrect_answer,
+                metadata=metadata,
+            )
+        except Exception as exc:
+            log.error(f"Error extracting pair from fail doc: {exc}", exc_info=True)
+            return None
+    def _build_prompt(self, passage: str, question: str) -> str:
+        """Build a prompt with context and question."""
+        return f"""Context: {passage}
+Question: {question}
+Please answer the question based only on the information provided in the context."""
+    def _create_grounded_response(self, passage: str, question: str) -> str:
+        """Create a grounded response that stays true to the passage."""
+        return (
+            f"Based on the provided context, I can tell you that: "
+            f"{passage} "
+            "I'll only answer based on what's directly stated in the passage."
+        )