PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl - Mend

wisent 0.7.379__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1720) hide show

wisent/core/tasks/__init__.py ADDED Viewed

@@ -0,0 +1,218 @@
+"""
+Task implementations for wisent.
+This package contains task-agnostic implementations for various benchmarks.
+"""
+from ..task_interface import register_task
+from .aime_task import AIMETask
+from .hle_task import HLEExactMatchTask, HLEMultipleChoiceTask, HLETask
+from .hmmt_task import HMMTTask
+from .livecodebench_task import LiveCodeBenchTask
+from .livemathbench_task import LiveMathBenchTask
+from .lm_eval_task import (
+    AppsTask,
+    ArcChallengeTask,
+    ArcEasyTask,
+    BoolqTask,
+    CodexglueCodeToTextGoTask,
+    CodexglueCodeToTextJavascriptTask,
+    CodexglueCodeToTextJavaTask,
+    CodexglueCodeToTextPhpTask,
+    CodexglueCodeToTextPythonTask,
+    CodexglueCodeToTextRubyTask,
+    ConalaTask,
+    ConcodeTask,
+    DS1000Task,
+    GSM8KTask,
+    HellaswagTask,
+    Arithmetic1dcTask,
+    Arithmetic2daTask,
+    Arithmetic2dmTask,
+    Arithmetic2dsTask,
+    Arithmetic3daTask,
+    Arithmetic3dsTask,
+    Arithmetic4daTask,
+    Arithmetic4dsTask,
+    Arithmetic5daTask,
+    Arithmetic5dsTask,
+    QA4MRETask,
+    MULTIRCTask,
+    HumanEvalPlusTask,
+    HumanEvalTask,
+    InstructHumanEvalTask,
+    MBPPPlusTask,
+    MBPPTask,
+    MercuryTask,
+    MMLUTask,
+    MultipleCppTask,
+    MultipleGoTask,
+    MultipleJavaTask,
+    MultipleJsTask,
+    MultiplePyTask,
+    MultipleRsTask,
+    OpenbookqaTask,
+    PiqaTask,
+    RecodeTask,
+    Squad2Task,
+    TruthfulQATask,
+    WinograndeTask,
+)
+from .math500_task import Math500Task
+from .polymath_task import PolyMathTask
+from .supergpqa_task import SuperGPQABiologyTask, SuperGPQAChemistryTask, SuperGPQAPhysicsTask, SuperGPQATask
+def register_all_tasks():
+    """Register every built-in task factory under its public task name.
+    Plain lm-eval style tasks use the task class itself as the factory. Tasks
+    that need constructor arguments are wrapped in a lambda that accepts an
+    optional ``limit`` and forwards it to the task constructor.
+    """
+    # (task name, factory) pairs; registration happens in the order listed.
+    task_factories = (
+        # LiveCodeBench, pinned to release_v1
+        ("livecodebench", lambda limit=None: LiveCodeBenchTask(release_version="release_v1", limit=limit)),
+        # Common lm-eval tasks
+        ("gsm8k", GSM8KTask),
+        ("truthfulqa_mc1", TruthfulQATask),
+        ("mmlu", MMLUTask),
+        ("arc_easy", ArcEasyTask),
+        ("arc_challenge", ArcChallengeTask),
+        ("hellaswag", HellaswagTask),
+        ("winogrande", WinograndeTask),
+        ("piqa", PiqaTask),
+        ("boolq", BoolqTask),
+        ("openbookqa", OpenbookqaTask),
+        ("arithmetic_1dc", Arithmetic1dcTask),
+        ("arithmetic_2da", Arithmetic2daTask),
+        ("arithmetic_2dm", Arithmetic2dmTask),
+        ("arithmetic_2ds", Arithmetic2dsTask),
+        ("arithmetic_3da", Arithmetic3daTask),
+        ("arithmetic_3ds", Arithmetic3dsTask),
+        ("arithmetic_4da", Arithmetic4daTask),
+        ("arithmetic_4ds", Arithmetic4dsTask),
+        ("arithmetic_5da", Arithmetic5daTask),
+        ("arithmetic_5ds", Arithmetic5dsTask),
+        ("qa4mre_2013", QA4MRETask),
+        ("multirc", MULTIRCTask),
+        # Coding tasks
+        ("mbpp", MBPPTask),
+        ("humaneval", HumanEvalTask),
+        ("mbpp_plus", MBPPPlusTask),
+        ("instructhumaneval", InstructHumanEvalTask),
+        ("humaneval_plus", HumanEvalPlusTask),
+        ("conala", ConalaTask),
+        ("concode", ConcodeTask),
+        ("mercury", MercuryTask),
+        ("apps", AppsTask),
+        ("ds1000", DS1000Task),
+        ("multiple_py", MultiplePyTask),
+        ("multiple_js", MultipleJsTask),
+        ("multiple_java", MultipleJavaTask),
+        ("multiple_cpp", MultipleCppTask),
+        ("multiple_rs", MultipleRsTask),
+        ("multiple_go", MultipleGoTask),
+        ("codexglue_code_to_text_python", CodexglueCodeToTextPythonTask),
+        ("codexglue_code_to_text_go", CodexglueCodeToTextGoTask),
+        ("codexglue_code_to_text_ruby", CodexglueCodeToTextRubyTask),
+        ("codexglue_code_to_text_java", CodexglueCodeToTextJavaTask),
+        ("codexglue_code_to_text_javascript", CodexglueCodeToTextJavascriptTask),
+        ("codexglue_code_to_text_php", CodexglueCodeToTextPhpTask),
+        ("recode", RecodeTask),
+        ("squad2", Squad2Task),
+        # HLE tasks
+        ("hle", lambda limit=None: HLETask(limit=limit)),
+        ("hle_exact_match", lambda limit=None: HLEExactMatchTask(limit=limit)),
+        ("hle_multiple_choice", lambda limit=None: HLEMultipleChoiceTask(limit=limit)),
+        # MATH-500 tasks ("math" and "hendrycks_math" are aliases of "math500")
+        ("math500", lambda limit=None: Math500Task(limit=limit)),
+        ("math", lambda limit=None: Math500Task(limit=limit)),
+        ("hendrycks_math", lambda limit=None: Math500Task(limit=limit)),
+        # AIME tasks; the bare "aime" name defaults to the 2025 set
+        ("aime", lambda limit=None: AIMETask(year="2025", limit=limit)),
+        ("aime2025", lambda limit=None: AIMETask(year="2025", limit=limit)),
+        ("aime2024", lambda limit=None: AIMETask(year="2024", limit=limit)),
+        # HMMT tasks; the bare "hmmt" name defaults to feb_2025
+        ("hmmt", lambda limit=None: HMMTTask(competition="feb_2025", limit=limit)),
+        ("hmmt_feb_2025", lambda limit=None: HMMTTask(competition="feb_2025", limit=limit)),
+        # PolyMath tasks; the bare "polymath" name defaults to English medium
+        ("polymath", lambda limit=None: PolyMathTask(language="en", difficulty="medium", limit=limit)),
+        ("polymath_en_medium", lambda limit=None: PolyMathTask(language="en", difficulty="medium", limit=limit)),
+        ("polymath_zh_medium", lambda limit=None: PolyMathTask(language="zh", difficulty="medium", limit=limit)),
+        ("polymath_en_high", lambda limit=None: PolyMathTask(language="en", difficulty="high", limit=limit)),
+        ("polymath_zh_high", lambda limit=None: PolyMathTask(language="zh", difficulty="high", limit=limit)),
+        # LiveMathBench tasks; the bare "livemathbench" name defaults to English
+        ("livemathbench", lambda limit=None: LiveMathBenchTask(language="en", limit=limit)),
+        ("livemathbench_cnmo_en", lambda limit=None: LiveMathBenchTask(language="en", limit=limit)),
+        ("livemathbench_cnmo_zh", lambda limit=None: LiveMathBenchTask(language="zh", limit=limit)),
+        # SuperGPQA tasks; the bare "supergpqa" name uses the general task class
+        ("supergpqa", lambda limit=None: SuperGPQATask(limit=limit)),
+        ("supergpqa_physics", lambda limit=None: SuperGPQAPhysicsTask(limit=limit)),
+        ("supergpqa_chemistry", lambda limit=None: SuperGPQAChemistryTask(limit=limit)),
+        ("supergpqa_biology", lambda limit=None: SuperGPQABiologyTask(limit=limit)),
+    )
+    for task_name, factory in task_factories:
+        register_task(task_name, factory)
+# Auto-register tasks when the module is imported
+register_all_tasks()
+# Names exported by ``from wisent.core.tasks import *``. Each entry must match
+# an imported name exactly (case-sensitive); a mismatched entry makes the
+# star-import fail with AttributeError.
+__all__ = [
+    "AIMETask",
+    "AppsTask",
+    "CodexglueCodeToTextGoTask",
+    "CodexglueCodeToTextJavaTask",
+    "CodexglueCodeToTextJavascriptTask",
+    "CodexglueCodeToTextPhpTask",
+    "CodexglueCodeToTextPythonTask",
+    "CodexglueCodeToTextRubyTask",
+    "ConalaTask",
+    "ConcodeTask",
+    "DS1000Task",
+    "GSM8KTask",
+    "HLEExactMatchTask",
+    "HLEMultipleChoiceTask",
+    "HLETask",
+    "HMMTTask",
+    "HumanEvalPlusTask",
+    "HumanEvalTask",
+    "InstructHumanEvalTask",
+    "LiveCodeBenchTask",
+    "LiveMathBenchTask",
+    "MBPPPlusTask",
+    "MBPPTask",
+    "MMLUTask",
+    "MULTIRCTask",
+    "Math500Task",
+    "MercuryTask",
+    "MultipleCppTask",
+    "MultipleGoTask",
+    "MultipleJavaTask",
+    "MultipleJsTask",
+    "MultiplePyTask",
+    "MultipleRsTask",
+    "PolyMathTask",
+    "QA4MRETask",
+    "RecodeTask",
+    "Squad2Task",
+    "SuperGPQABiologyTask",
+    "SuperGPQAChemistryTask",
+    "SuperGPQAPhysicsTask",
+    "SuperGPQATask",
+    "TruthfulQATask",
+    "register_all_tasks",
+]

wisent/core/tasks/aime_task.py ADDED Viewed

@@ -0,0 +1,142 @@
+"""
+AIME task implementation for task-agnostic architecture.
+"""
+from typing import Dict, Any, List, Optional
+from ..task_interface import TaskInterface
+from ..benchmark_extractors import GSM8KExtractor
+from wisent.core.errors import InvalidValueError
+import datasets
+class AIMETask(TaskInterface):
+    """AIME mathematical contest task, selectable by contest year."""
+    # Per-year dataset settings: dataset source, split, and the column names
+    # that hold the problem statement and the answer in that dataset.
+    DATASET_CONFIGS = {
+        "2024": {
+            "source": "Maxwell-Jia/AIME_2024",
+            "split": "train",
+            "fields": {"problem": "Problem", "answer": "Answer"},
+            "description": "30 high-difficulty AIME contest problems from 2024",
+        },
+        "2025": {
+            "source": "MathArena/aime_2025",
+            "split": "train",
+            "fields": {"problem": "problem", "answer": "answer"},
+            "description": "30 high-difficulty AIME contest problems from 2025 (MathArena)",
+        },
+    }
+    def __init__(self, year: str = "2025", limit: Optional[int] = None):
+        """
+        Set up the task for one contest year.
+        Args:
+            year: Key into DATASET_CONFIGS ("2024" or "2025"). Default: "2025"
+            limit: Default cap on the number of samples returned by load_data
+        Raises:
+            InvalidValueError: If ``year`` has no entry in DATASET_CONFIGS
+        """
+        if year not in self.DATASET_CONFIGS:
+            supported_years = list(self.DATASET_CONFIGS)
+            raise InvalidValueError(param="year", reason=f"'{year}' not supported", value=supported_years)
+        self._extractor = GSM8KExtractor()  # The GSM8K extractor is reused for AIME answers
+        self._data = None  # Filled lazily by test_docs()
+        self._limit = limit
+        self.config = self.DATASET_CONFIGS[year]
+        self.year = year
+    def load_data(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
+        """
+        Fetch the configured dataset and return its rows with normalized keys.
+        Every returned row keeps its original fields and, when the configured
+        source columns are present, additionally carries "Problem"/"question"
+        and "Answer"/"answer" keys.
+        Args:
+            limit: Cap on the number of rows; a falsy value falls back to the
+                limit given at construction time
+        """
+        rows = datasets.load_dataset(self.config["source"], split=self.config["split"])
+        # An explicit limit wins over the constructor default
+        cap = limit or self._limit
+        if cap:
+            rows = rows.select(range(min(cap, len(rows))))
+        raw_rows = list(map(dict, rows))
+        field_names = self.config["fields"]
+        problem_key = field_names["problem"]
+        answer_key = field_names["answer"]
+        records = []
+        for raw in raw_rows:
+            record = dict(raw)  # Start from a copy so all original fields survive
+            if problem_key in raw:
+                problem_text = raw[problem_key]
+                record["Problem"] = problem_text
+                record["question"] = problem_text  # Question/answer style alias
+            if answer_key in raw:
+                answer_value = raw[answer_key]
+                record["Answer"] = answer_value
+                record["answer"] = answer_value  # Question/answer style alias
+            records.append(record)
+        return records
+    def get_task_info(self) -> Dict[str, Any]:
+        """Return a metadata dictionary describing this task and its data source."""
+        # The 2025 set is exposed under the bare "aime" name
+        if self.year != "2025":
+            name = f"aime{self.year}"
+        else:
+            name = "aime"
+        return {
+            "task_name": name,
+            "year": self.year,
+            "description": self.config["description"],
+            "source": self.config["source"],
+            "task_type": "text_generation",
+            "evaluation_method": "mathematical_equivalence",
+        }
+    def validate_sample(self, sample: Dict[str, Any]) -> bool:
+        """Return True when the sample contains both configured source columns."""
+        field_names = self.config["fields"]
+        problem_key = field_names["problem"]
+        answer_key = field_names["answer"]
+        return problem_key in sample and answer_key in sample
+    def get_extractor(self) -> GSM8KExtractor:
+        """Return the extractor instance created in the constructor."""
+        return self._extractor
+    def get_name(self) -> str:
+        """Return "aime" for the 2025 set and "aime<year>" for any other year."""
+        if self.year != "2025":
+            return f"aime{self.year}"
+        return "aime"
+    def get_description(self) -> str:
+        """Return a one-line description that includes the contest year."""
+        return f"AIME {self.year} contest problems requiring advanced mathematical reasoning"
+    def get_categories(self) -> List[str]:
+        """Return the category tags for this task."""
+        return ["mathematics", "reasoning", "contest", "text_generation"]
+    # Methods to match lm-eval interface
+    def has_validation_docs(self) -> bool:
+        """Report that no separate validation set exists."""
+        return False
+    def has_test_docs(self) -> bool:
+        """Report that test documents exist; all samples count as test docs."""
+        return True
+    def test_docs(self) -> List[Dict[str, Any]]:
+        """Return the test documents, loading and caching them on first use."""
+        if self._data is not None:
+            return self._data
+        self._data = self.load_data()
+        return self._data
+    def validation_docs(self) -> List[Dict[str, Any]]:
+        """Return an empty list because there is no validation set."""
+        return []
+    def doc_to_text(self, doc: Dict[str, Any]) -> str:
+        """Return the document's "Problem" field, or an empty string if it is absent."""
+        return doc.get("Problem", "")

wisent/core/tasks/file_task.py ADDED Viewed

@@ -0,0 +1,212 @@
+"""
+File-based task implementation for loading custom datasets from JSON files.
+This allows users to easily test the optimization pipeline with their own datasets
+without needing to implement complex task classes or modify the core system.
+"""
+import json
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+from ..benchmark_extractors import GSM8KExtractor
+from ..task_interface import TaskInterface
+from wisent.core.errors import InvalidJSONError, FileLoadError, InvalidDataFormatError
+class FileTask(TaskInterface):
+    """Task backed by a JSON file that holds a list of question/answer samples."""
+    def __init__(self, file_path: str, task_name: Optional[str] = None, limit: Optional[int] = None):
+        """
+        Initialize a file-based task.
+        Args:
+            file_path: Path to JSON file containing the dataset
+            task_name: Optional custom name for the task (defaults to the lowercased file stem)
+            limit: Optional limit on number of samples to load
+        """
+        self.file_path = Path(file_path)
+        self._limit = limit
+        self._data = None  # Cache filled lazily by test_docs()
+        self._extractor = GSM8KExtractor()  # Reuse GSM8K extractor for QA format
+        # Fall back to the lowercased file stem when no non-empty name is given
+        if task_name:
+            self._task_name = task_name
+        else:
+            self._task_name = self.file_path.stem.lower()
+    def load_data(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
+        """
+        Load and validate the samples stored in the JSON file.
+        All samples are validated before the limit is applied.
+        Args:
+            limit: Maximum number of samples to return; a falsy value (None or 0)
+                falls back to the limit given at construction time
+        Returns:
+            The validated samples, truncated to the effective limit if one is set
+        Raises:
+            FileNotFoundError: If the file does not exist
+            InvalidJSONError: If the file content is not valid JSON
+            FileLoadError: If opening or reading the file fails for any other reason
+            InvalidDataFormatError: If the top-level JSON value is not a list or
+                a sample fails validate_sample
+        """
+        if not self.file_path.exists():
+            raise FileNotFoundError(f"Dataset file not found: {self.file_path}")
+        try:
+            with open(self.file_path, encoding="utf-8") as f:
+                data = json.load(f)
+        except json.JSONDecodeError as e:
+            # Chain explicitly so the traceback shows the parse error as the direct cause
+            raise InvalidJSONError(file_path=str(self.file_path), cause=e) from e
+        except Exception as e:
+            raise FileLoadError(file_path=str(self.file_path), cause=e) from e
+        # Ensure data is a list
+        if not isinstance(data, list):
+            raise InvalidDataFormatError(reason=f"JSON file must contain a list of objects, got {type(data).__name__}")
+        # Validate samples
+        for i, sample in enumerate(data):
+            if not self.validate_sample(sample):
+                raise InvalidDataFormatError(reason=f"Invalid sample at index {i}")
+        # Apply limit (an explicit argument wins over the constructor default)
+        effective_limit = limit or self._limit
+        if effective_limit:
+            data = data[: min(effective_limit, len(data))]
+        return data
+    def get_extractor(self) -> GSM8KExtractor:
+        """Get the benchmark extractor for this task."""
+        return self._extractor
+    def get_name(self) -> str:
+        """Get the task name."""
+        return self._task_name
+    def get_description(self) -> str:
+        """Get the task description, which includes the dataset file name."""
+        return f"Custom dataset loaded from {self.file_path.name}"
+    def get_categories(self) -> List[str]:
+        """Get the task categories."""
+        return ["custom", "file_based", "text_generation"]
+    def validate_sample(self, sample: Dict[str, Any]) -> bool:
+        """
+        Validate that a sample has the required format.
+        Expected format:
+        {
+            "question": "Question text",
+            "answer": "Expected answer"
+        }
+        Optional fields:
+        - "problem": Alternative to "question"
+        - Any other fields will be preserved but ignored
+        Returns:
+            True when the sample is a dict with a non-empty string question
+            (or problem) and an "answer" value that is not None
+        """
+        if not isinstance(sample, dict):
+            return False
+        # Check for question field (or alternative names)
+        question = sample.get("question") or sample.get("problem")
+        if not question or not isinstance(question, str):
+            return False
+        # Check for answer field; falsy answers such as 0 or "" are accepted
+        answer = sample.get("answer")
+        if answer is None:
+            return False
+        return True
+    # Methods to match lm-eval interface
+    def has_validation_docs(self) -> bool:
+        """Check if task has validation documents."""
+        return False  # File tasks don't have separate validation sets
+    def has_test_docs(self) -> bool:
+        """Check if task has test documents."""
+        return True  # All samples are considered test docs
+    def test_docs(self) -> List[Dict[str, Any]]:
+        """Get test documents, loading and caching them on first use."""
+        if self._data is None:
+            self._data = self.load_data()
+        return self._data
+    def validation_docs(self) -> List[Dict[str, Any]]:
+        """Get validation documents."""
+        return []  # No separate validation set
+    def doc_to_text(self, doc: Dict[str, Any]) -> str:
+        """Convert document to a "Question: ...\\nAnswer:" text prompt."""
+        question = doc.get("question") or doc.get("problem", "")
+        return f"Question: {question}\nAnswer:"
+    def get_task_info(self) -> Dict[str, Any]:
+        """
+        Get information about the file task.
+        "num_samples" is the cached sample count once data has been loaded and
+        "unknown" before that; calling this method never triggers a load.
+        """
+        return {
+            "task_name": self._task_name,
+            "description": self.get_description(),
+            "source": str(self.file_path),
+            "task_type": "text_generation",
+            "evaluation_method": "exact_match",
+            # Compare against None so a loaded but empty dataset reports 0, not "unknown"
+            "num_samples": len(self.test_docs()) if self._data is not None else "unknown",
+        }
+def create_file_task(file_path: str, task_name: Optional[str] = None) -> callable:
+    """
+    Build a factory that produces FileTask instances for one dataset file.
+    The returned callable takes only an optional ``limit``; the file path and
+    task name are captured from this call.
+    Args:
+        file_path: Path to the JSON dataset file
+        task_name: Optional custom name for the task
+    Returns:
+        A function ``factory(limit=None)`` that returns a new FileTask
+    """
+    def _build_task(limit: Optional[int] = None) -> FileTask:
+        return FileTask(limit=limit, task_name=task_name, file_path=file_path)
+    return _build_task
+def register_file_task(task_name: str, file_path: str, registry=None):
+    """
+    Register a JSON-file-backed task under ``task_name``.
+    Args:
+        task_name: Name to register the task under
+        file_path: Path to the JSON dataset file
+        registry: Currently ignored. NOTE(review): the body never reads this
+            argument; registration always goes through ``register_task``
+    """
+    # Imported inside the function rather than at module level
+    from ..task_interface import register_task
+    register_task(task_name, create_file_task(file_path, task_name))
+def load_tasks_from_directory(directory: str, pattern: str = "*.json", prefix: str = ""):
+    """
+    Register every file in a directory that matches ``pattern`` as a file task.
+    Task names are the lowercased ``prefix`` + file stem. A file whose
+    registration raises is skipped with a printed warning instead of aborting.
+    Args:
+        directory: Directory to search for JSON files
+        pattern: Glob pattern to match (default: "*.json")
+        prefix: Optional prefix to add to task names
+    Returns:
+        List of the task names that were registered
+    Raises:
+        FileNotFoundError: If the directory does not exist
+        InvalidDataFormatError: If the path exists but is not a directory
+    """
+    root = Path(directory)
+    if not root.exists():
+        raise FileNotFoundError(f"Directory not found: {directory}")
+    if not root.is_dir():
+        raise InvalidDataFormatError(reason=f"Path is not a directory: {directory}")
+    registered_names = []
+    for dataset_file in root.glob(pattern):
+        try:
+            name = f"{prefix}{dataset_file.stem}".lower()
+            register_file_task(name, str(dataset_file))
+        except Exception as exc:
+            # Best effort: report the failure and continue with the next file
+            print(f"Warning: Failed to load task from {dataset_file}: {exc}")
+        else:
+            registered_names.append(name)
+    return registered_names