PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl - Mend

wisent 0.7.379__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1720) hide show

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/cluewsc.py ADDED Viewed

@@ -0,0 +1,177 @@
+from __future__ import annotations
+from typing import Any
+from wisent.core.cli_logger import setup_logger
+from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+from wisent.core.contrastive_pairs.huggingface_pairs.atoms import HuggingFaceBenchmarkExtractor
+# Public API of this module: only the extractor class is exported.
+__all__ = ["CLUEWSCExtractor"]
+# Module-level logger, created through the project's setup_logger helper.
+log = setup_logger(__name__)
+class CLUEWSCExtractor(HuggingFaceBenchmarkExtractor):
+    """
+    Extractor for CLUEWSC2020 - Chinese Winograd Schema Challenge.
+    CLUEWSC is part of the CLUE (Chinese Language Understanding Evaluation) benchmark,
+    focusing on coreference resolution in Chinese text. The task is to determine
+    whether a pronoun (span2) refers to a specific entity (span1) in the given text.
+    For coreference resolution:
+    - Positive (correct) = Correct identification of coreference relationship
+    - Negative (incorrect) = Wrong identification
+    Schema (clue/clue - cluewsc2020):
+        - idx: int (record identifier)
+        - text: str (input sentence with target spans)
+        - label: int (0=true, 1=false - whether spans are coreferent)
+        - span1_text: str (first entity/reference text)
+        - span2_text: str (second entity/reference text - usually pronoun)
+        - span1_index: int (position of first span in text)
+        - span2_index: int (position of second span in text)
+    """
+    # Evaluator that should be used for this benchmark
+    evaluator_name = "coreference_resolution"
+    def __init__(self, split: str = "validation"):
+        """
+        Initialize CLUEWSC extractor.
+        Args:
+            split: Dataset split to use ("train", "validation", "test")
+        """
+        super().__init__()
+        self.split = split
+    def extract_contrastive_pairs(
+        self,
+        limit: int | None = None,
+    ) -> list[ContrastivePair]:
+        """
+        Build contrastive pairs from CLUEWSC examples.
+        For coreference resolution:
+        - Positive (correct) = Correct coreference judgment
+        - Negative (incorrect) = Wrong coreference judgment
+        Args:
+            limit: Optional maximum number of pairs to produce.
+        Returns:
+            A list of ContrastivePair objects.
+        """
+        max_items = self._normalize_limit(limit)
+        try:
+            docs = self.load_dataset(
+                dataset_name="clue",
+                config="cluewsc2020",
+                split=self.split,
+                limit=max_items,
+            )
+            log.info(f"Loaded {len(docs)} examples from CLUEWSC ({self.split})")
+        except Exception as e:
+            log.error(f"Failed to load CLUEWSC: {e}")
+            return []
+        pairs: list[ContrastivePair] = []
+        for doc in docs:
+            pair = self._extract_pair_from_doc(doc)
+            if pair is not None:
+                pairs.append(pair)
+                if max_items is not None and len(pairs) >= max_items:
+                    break
+        if not pairs:
+            log.warning("No valid CLUEWSC pairs extracted")
+        return pairs
+    def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
+        """
+        Convert a single doc into a ContrastivePair.
+        Returns None when required fields are missing or malformed.
+        """
+        try:
+            idx = doc.get("idx", 0)
+            text = doc.get("text", "").strip()
+            label = doc.get("label", -1)  # 0 = true (coreferent), 1 = false
+            span1_text = doc.get("span1_text", "")
+            span2_text = doc.get("span2_text", "")
+            span1_index = doc.get("span1_index", -1)
+            span2_index = doc.get("span2_index", -1)
+            if not text or not span1_text or not span2_text:
+                log.debug("Skipping: missing text or spans")
+                return None
+            # Build the prompt
+            prompt = self._build_prompt(text, span1_text, span2_text)
+            # Determine correct answer based on label
+            # label 0 = true (coreferent), label 1 = false (not coreferent)
+            is_coreferent = (label == 0)
+            # Build correct and incorrect responses
+            correct_response = self._create_response(is_coreferent, span1_text, span2_text)
+            incorrect_response = self._create_response(not is_coreferent, span1_text, span2_text)
+            metadata = {
+                "label": "cluewsc",
+                "source": "clue/clue:cluewsc2020",
+                "idx": idx,
+                "split": self.split,
+                "span1_text": span1_text,
+                "span2_text": span2_text,
+                "span1_index": span1_index,
+                "span2_index": span2_index,
+                "is_coreferent": is_coreferent,
+                "is_chinese_benchmark": True,
+                "language": "zh",
+            }
+            return self._build_pair(
+                question=prompt,
+                correct=correct_response,
+                incorrect=incorrect_response,
+                metadata=metadata,
+            )
+        except Exception as exc:
+            log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
+            return None
+    def _build_prompt(self, text: str, span1_text: str, span2_text: str) -> str:
+        """Build the coreference resolution prompt."""
+        return f"""请判断以下中文句子中，"{span2_text}"是否指代"{span1_text}"。
+句子：{text}
+问题：在上述句子中，"{span2_text}"是否指代"{span1_text}"？请回答"是"或"否"。
+(Please determine whether "{span2_text}" refers to "{span1_text}" in the following Chinese sentence.
+Sentence: {text}
+Question: Does "{span2_text}" refer to "{span1_text}" in the above sentence? Please answer "Yes" or "No".)"""
+    def _create_response(self, is_coreferent: bool, span1_text: str, span2_text: str) -> str:
+        """Create a response for the coreference judgment."""
+        if is_coreferent:
+            return f"""是的，在这个句子中，"{span2_text}"指代"{span1_text}"。
+(Yes, in this sentence, "{span2_text}" refers to "{span1_text}".)
+答案：是 (Yes)"""
+        else:
+            return f"""不是，在这个句子中，"{span2_text}"不指代"{span1_text}"。
+(No, in this sentence, "{span2_text}" does not refer to "{span1_text}".)
+答案：否 (No)"""

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/cnn_dailymail.py ADDED Viewed

@@ -0,0 +1,92 @@
+from __future__ import annotations
+from typing import Any
+from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+from wisent.core.contrastive_pairs.huggingface_pairs.atoms import HuggingFaceBenchmarkExtractor
+from wisent.core.cli_logger import setup_logger, bind
+# Public API of this module: only the extractor class is exported.
+__all__ = ["CnnDailymailExtractor"]
+# Base module logger; the methods below derive per-call loggers from it via bind().
+_LOG = setup_logger(__name__)
+# Task identifier(s) tied to this extractor - presumably read by a registry elsewhere; TODO confirm.
+task_names = ("cnn_dailymail",)
+class CnnDailymailExtractor(HuggingFaceBenchmarkExtractor):
+    """Extractor for CNN/DailyMail - summarization task."""
+    evaluator_name = "generation"
+    def extract_contrastive_pairs(
+        self,
+        limit: int | None = None,
+    ) -> list[ContrastivePair]:
+        log = bind(_LOG, task="cnn_dailymail")
+        max_items = self._normalize_limit(limit)
+        # Load dataset from HuggingFace
+        from datasets import load_dataset
+        try:
+            dataset = load_dataset("cnn_dailymail", "3.0.0", split="test")
+            if max_items:
+                dataset = dataset.select(range(min(max_items, len(dataset))))
+        except Exception as e:
+            log.error(f"Failed to load cnn_dailymail dataset: {e}")
+            return []
+        pairs: list[ContrastivePair] = []
+        log.info("Extracting contrastive pairs", extra={"doc_count": len(dataset)})
+        for doc in dataset:
+            pair = self._extract_pair_from_doc(doc)
+            if pair is not None:
+                pairs.append(pair)
+                if max_items is not None and len(pairs) >= max_items:
+                    break
+        if not pairs:
+            log.warning("No valid pairs extracted", extra={"task": "cnn_dailymail"})
+        return pairs
+    def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
+        log = bind(_LOG, doc_id=doc.get("id", "unknown"))
+        try:
+            article = doc.get("article", "").strip()
+            highlights = doc.get("highlights", "").strip()
+            if not article or not highlights:
+                log.debug("Skipping doc due to missing article or highlights", extra={"doc": doc})
+                return None
+            correct_summary = highlights
+            # Create synthetic negative by shuffling sentences
+            import random
+            sentences = [s.strip() for s in highlights.split('.') if s.strip()]
+            if len(sentences) > 1:
+                shuffled_sentences = sentences.copy()
+                random.shuffle(shuffled_sentences)
+                if shuffled_sentences == sentences:
+                    shuffled_sentences = list(reversed(sentences))
+                incorrect_summary = '. '.join(shuffled_sentences) + '.'
+            else:
+                incorrect_summary = "This is an incorrect summary."
+            question = f"Summarize the following article:\n\n{article}"
+            metadata = {"label": "cnn_dailymail"}
+            return self._build_pair(
+                question=question,
+                correct=correct_summary,
+                incorrect=incorrect_summary,
+                metadata=metadata,
+            )
+        except Exception as exc:
+            log.error("Error extracting pair from doc", exc_info=exc, extra={"doc": doc})
+            return None

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py ADDED Viewed

@@ -0,0 +1,378 @@
+from __future__ import annotations
+from typing import Any
+from wisent.core.cli_logger import setup_logger
+from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+from wisent.core.contrastive_pairs.huggingface_pairs.atoms import HuggingFaceBenchmarkExtractor
+# Public API of this module: only the extractor class is exported.
+__all__ = ["CodeforcesExtractor"]
+# Module-level logger, created through the project's setup_logger helper.
+log = setup_logger(__name__)
+class CodeforcesExtractor(HuggingFaceBenchmarkExtractor):
+    """
+    Extractor for Codeforces - Competitive Programming Benchmark.
+    Based on open-r1/codeforces dataset containing 10k+ competitive programming
+    problems from CodeForces with verified test cases and solutions.
+    Dataset Configurations:
+    - default: ~10k problems
+    - verifiable: 8,760 executable problems with complete/generated tests
+    - verifiable-prompts: Same with 2 generation prompts per problem
+    For code generation:
+    - Positive (correct) = Working solution that passes test cases
+    - Negative (incorrect) = Solution with bugs or wrong algorithm
+    Schema (open-r1/codeforces):
+        - id: str (unique problem identifier)
+        - title: str (problem title)
+        - description: str (problem statement)
+        - input_format: str (input description)
+        - output_format: str (output description)
+        - examples: list[dict] (example input/output pairs)
+        - official_tests: list[dict] (test cases)
+        - rating: int (problem difficulty rating)
+        - tags: list[str] (algorithm tags)
+        - time_limit: float (seconds)
+        - memory_limit: float (MB)
+        - editorial: str (solution explanation)
+    """
+    # Evaluator that should be used for this benchmark
+    evaluator_name = "code_generation"
+    def __init__(
+        self,
+        config: str = "verifiable",
+        max_rating: int | None = None,
+        min_rating: int | None = None,
+        language: str = "python",
+    ):
+        """
+        Initialize Codeforces extractor.
+        Args:
+            config: Dataset configuration ("default", "verifiable", "verifiable-prompts")
+            max_rating: Filter problems by maximum difficulty rating
+            min_rating: Filter problems by minimum difficulty rating
+            language: Target programming language
+        """
+        super().__init__()
+        self.config = config
+        self.max_rating = max_rating
+        self.min_rating = min_rating
+        self.language = language
+    def extract_contrastive_pairs(
+        self,
+        limit: int | None = None,
+    ) -> list[ContrastivePair]:
+        """
+        Build contrastive pairs from Codeforces examples.
+        For competitive programming:
+        - Positive (correct) = Working solution approach
+        - Negative (incorrect) = Wrong approach or buggy solution
+        Args:
+            limit: Optional maximum number of pairs to produce.
+        Returns:
+            A list of ContrastivePair objects.
+        """
+        max_items = self._normalize_limit(limit)
+        try:
+            docs = self.load_dataset(
+                dataset_name="open-r1/codeforces",
+                config=self.config,
+                split="train",
+                limit=max_items * 2 if max_items else None,  # Load extra for filtering
+            )
+            log.info(f"Loaded {len(docs)} problems from Codeforces ({self.config})")
+        except Exception as e:
+            log.warning(f"Failed to load open-r1/codeforces: {e}")
+            # Try alternative dataset
+            try:
+                docs = self.load_dataset(
+                    dataset_name="deepmind/code_contests",
+                    split="train",
+                    limit=max_items * 2 if max_items else None,
+                )
+                log.info(f"Loaded {len(docs)} problems from deepmind/code_contests")
+            except Exception as e2:
+                log.error(f"Failed to load any Codeforces dataset: {e2}")
+                return []
+        pairs: list[ContrastivePair] = []
+        for doc in docs:
+            # Filter by rating if specified
+            rating = doc.get("rating", 0)
+            if self.max_rating and rating > self.max_rating:
+                continue
+            if self.min_rating and rating < self.min_rating:
+                continue
+            pair = self._extract_pair_from_doc(doc)
+            if pair is not None:
+                pairs.append(pair)
+                if max_items is not None and len(pairs) >= max_items:
+                    break
+        if not pairs:
+            log.warning("No valid Codeforces pairs extracted")
+        return pairs
+    def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
+        """
+        Convert a single doc into a ContrastivePair.
+        Returns None when required fields are missing or malformed.
+        """
+        try:
+            problem_id = doc.get("id", "")
+            title = doc.get("title", "")
+            description = doc.get("description", "")
+            input_format = doc.get("input_format", "")
+            output_format = doc.get("output_format", "")
+            examples = doc.get("examples", [])
+            rating = doc.get("rating", 0)
+            tags = doc.get("tags", [])
+            time_limit = doc.get("time_limit", 1.0)
+            memory_limit = doc.get("memory_limit", 256.0)
+            editorial = doc.get("editorial", "")
+            note = doc.get("note", "")
+            if not description:
+                log.debug("Skipping: missing description")
+                return None
+            # Build the problem prompt
+            prompt = self._build_prompt(
+                title=title,
+                description=description,
+                input_format=input_format,
+                output_format=output_format,
+                examples=examples,
+                note=note,
+                time_limit=time_limit,
+                memory_limit=memory_limit,
+            )
+            # Build correct response (with proper approach)
+            correct_response = self._create_correct_response(
+                editorial=editorial,
+                tags=tags,
+                examples=examples,
+            )
+            # Build incorrect response (wrong approach)
+            incorrect_response = self._create_incorrect_response(tags)
+            metadata = {
+                "label": "codeforces",
+                "source": f"open-r1/codeforces:{self.config}",
+                "problem_id": problem_id,
+                "title": title,
+                "rating": rating,
+                "tags": tags if isinstance(tags, list) else [tags],
+                "time_limit": time_limit,
+                "memory_limit": memory_limit,
+                "has_editorial": bool(editorial),
+                "is_code_benchmark": True,
+            }
+            return self._build_pair(
+                question=prompt,
+                correct=correct_response,
+                incorrect=incorrect_response,
+                metadata=metadata,
+            )
+        except Exception as exc:
+            log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
+            return None
+    def _build_prompt(
+        self,
+        title: str,
+        description: str,
+        input_format: str,
+        output_format: str,
+        examples: list,
+        note: str,
+        time_limit: float,
+        memory_limit: float,
+    ) -> str:
+        """Build the problem prompt."""
+        parts = []
+        if title:
+            parts.append(f"# {title}")
+            parts.append("")
+        parts.append("## Problem Statement")
+        parts.append(description)
+        parts.append("")
+        if input_format:
+            parts.append("## Input Format")
+            parts.append(input_format)
+            parts.append("")
+        if output_format:
+            parts.append("## Output Format")
+            parts.append(output_format)
+            parts.append("")
+        if examples:
+            parts.append("## Examples")
+            for i, ex in enumerate(examples, 1):
+                inp = ex.get("input", "")
+                out = ex.get("output", "")
+                parts.append(f"### Example {i}")
+                parts.append(f"Input:\n```\n{inp}\n```")
+                parts.append(f"Output:\n```\n{out}\n```")
+                parts.append("")
+        if note:
+            parts.append("## Note")
+            parts.append(note)
+            parts.append("")
+        parts.append(f"Time Limit: {time_limit}s | Memory Limit: {memory_limit}MB")
+        parts.append("")
+        parts.append(f"Write a solution in {self.language}.")
+        return "\n".join(parts)
+    def _create_correct_response(
+        self,
+        editorial: str,
+        tags: list,
+        examples: list,
+    ) -> str:
+        """Create a correct response with proper approach."""
+        parts = []
+        # Add approach based on tags
+        if tags:
+            tag_list = ", ".join(tags) if isinstance(tags, list) else str(tags)
+            parts.append(f"## Approach")
+            parts.append(f"This problem involves: {tag_list}")
+            parts.append("")
+        # Add editorial if available
+        if editorial:
+            parts.append("## Solution Explanation")
+            parts.append(editorial)
+            parts.append("")
+        # Add solution structure
+        parts.append("## Solution")
+        parts.append(f"```{self.language}")
+        if self.language == "python":
+            parts.append(self._generate_python_template(tags))
+        else:
+            parts.append(self._generate_cpp_template(tags))
+        parts.append("```")
+        return "\n".join(parts)
+    def _generate_python_template(self, tags: list) -> str:
+        """Generate a Python solution template based on tags."""
+        tag_str = " ".join(tags) if isinstance(tags, list) else ""
+        if "dp" in tag_str or "dynamic programming" in tag_str:
+            return """def solve():
+    n = int(input())
+    # Initialize DP array
+    dp = [0] * (n + 1)
+    # Base case
+    dp[0] = 1
+    # Fill DP table
+    for i in range(1, n + 1):
+        # State transition
+        dp[i] = ...  # Fill based on problem logic
+    print(dp[n])
+solve()"""
+        elif "graph" in tag_str or "bfs" in tag_str or "dfs" in tag_str:
+            return """from collections import deque
+def solve():
+    n, m = map(int, input().split())
+    graph = [[] for _ in range(n + 1)]
+    for _ in range(m):
+        u, v = map(int, input().split())
+        graph[u].append(v)
+        graph[v].append(u)
+    # BFS/DFS traversal
+    visited = [False] * (n + 1)
+    # ... solution logic
+solve()"""
+        else:
+            return """def solve():
+    # Read input
+    n = int(input())
+    arr = list(map(int, input().split()))
+    # Process and compute answer
+    result = 0
+    # ... solution logic
+    print(result)
+solve()"""
+    def _generate_cpp_template(self, tags: list) -> str:
+        """Generate a C++ solution template."""
+        return """#include <bits/stdc++.h>
+using namespace std;
+int main() {
+    ios_base::sync_with_stdio(false);
+    cin.tie(NULL);
+    int n;
+    cin >> n;
+    // Solution logic here
+    return 0;
+}"""
+    def _create_incorrect_response(self, tags: list) -> str:
+        """Create an incorrect response with wrong approach."""
+        return f"""## Approach
+Let me try a brute force approach without considering the constraints.
+## Solution
+```{self.language}
+# WARNING: This solution is likely incorrect or will TLE
+def solve():
+    n = int(input())
+    # Naive O(n^2) or worse approach
+    result = 0
+    for i in range(n):
+        for j in range(n):
+            # This approach doesn't use the optimal algorithm
+            pass
+    print(result)
+solve()
+```
+Note: This solution does not use the optimal approach and may fail on large inputs or edge cases."""