wisent 0.5.12__py3-none-any.whl → 0.5.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of wisent might be problematic.
- wisent/__init__.py +1 -1
- wisent/core/activations/__init__.py +26 -0
- wisent/core/activations/activations.py +96 -0
- wisent/core/activations/activations_collector.py +71 -20
- wisent/core/activations/prompt_construction_strategy.py +47 -0
- wisent/core/agent/budget.py +2 -2
- wisent/core/agent/device_benchmarks.py +1 -1
- wisent/core/agent/diagnose/classifier_marketplace.py +8 -8
- wisent/core/agent/diagnose/response_diagnostics.py +4 -4
- wisent/core/agent/diagnose/synthetic_classifier_option.py +1 -1
- wisent/core/agent/diagnose/tasks/task_manager.py +3 -3
- wisent/core/agent/diagnose.py +2 -1
- wisent/core/autonomous_agent.py +10 -2
- wisent/core/benchmark_extractors.py +293 -0
- wisent/core/bigcode_integration.py +20 -7
- wisent/core/branding.py +108 -0
- wisent/core/cli/__init__.py +15 -0
- wisent/core/cli/create_steering_vector.py +138 -0
- wisent/core/cli/evaluate_responses.py +715 -0
- wisent/core/cli/generate_pairs.py +128 -0
- wisent/core/cli/generate_pairs_from_task.py +119 -0
- wisent/core/cli/generate_responses.py +129 -0
- wisent/core/cli/generate_vector_from_synthetic.py +149 -0
- wisent/core/cli/generate_vector_from_task.py +147 -0
- wisent/core/cli/get_activations.py +191 -0
- wisent/core/cli/optimize_classification.py +339 -0
- wisent/core/cli/optimize_steering.py +364 -0
- wisent/core/cli/tasks.py +182 -0
- wisent/core/cli_logger.py +22 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +27 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +49 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +119 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +146 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +129 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +119 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livecodebench.py +367 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +116 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +121 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +121 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad2.py +124 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +127 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_gen.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +117 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +117 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +127 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +119 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +1 -1
- wisent/core/data_loaders/__init__.py +235 -0
- wisent/core/data_loaders/loaders/lm_loader.py +2 -2
- wisent/core/data_loaders/loaders/task_interface_loader.py +300 -0
- wisent/{cli/data_loaders/data_loader_rotator.py → core/data_loaders/rotator.py} +1 -1
- wisent/core/download_full_benchmarks.py +79 -2
- wisent/core/evaluators/benchmark_specific/__init__.py +26 -0
- wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/evaluator.py +17 -17
- wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/cpp_sanitizer.py +2 -2
- wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/java_sanitizer.py +2 -2
- wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/python_sanitizer.py +2 -2
- wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/__init__.py +3 -0
- wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py +305 -0
- wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/runtime.py +36 -4
- wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/entrypoint.py +2 -4
- wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/recipes.py +1 -1
- wisent/core/evaluators/benchmark_specific/coding/solution_generator.py +258 -0
- wisent/core/evaluators/benchmark_specific/exact_match_evaluator.py +79 -0
- wisent/core/evaluators/benchmark_specific/f1_evaluator.py +101 -0
- wisent/core/evaluators/benchmark_specific/generation_evaluator.py +197 -0
- wisent/core/{log_likelihoods_evaluator.py → evaluators/benchmark_specific/log_likelihoods_evaluator.py} +10 -2
- wisent/core/evaluators/benchmark_specific/perplexity_evaluator.py +140 -0
- wisent/core/evaluators/benchmark_specific/personalization_evaluator.py +250 -0
- wisent/{cli/evaluators/evaluator_rotator.py → core/evaluators/rotator.py} +4 -4
- wisent/core/lm_eval_harness_ground_truth.py +3 -2
- wisent/core/main.py +57 -0
- wisent/core/model_persistence.py +2 -2
- wisent/core/models/wisent_model.py +6 -6
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
- wisent/core/optuna/steering/steering_optimization.py +1 -1
- wisent/core/parser_arguments/__init__.py +10 -0
- wisent/core/parser_arguments/agent_parser.py +110 -0
- wisent/core/parser_arguments/configure_model_parser.py +7 -0
- wisent/core/parser_arguments/create_steering_vector_parser.py +59 -0
- wisent/core/parser_arguments/evaluate_parser.py +40 -0
- wisent/core/parser_arguments/evaluate_responses_parser.py +10 -0
- wisent/core/parser_arguments/full_optimize_parser.py +115 -0
- wisent/core/parser_arguments/generate_pairs_from_task_parser.py +33 -0
- wisent/core/parser_arguments/generate_pairs_parser.py +29 -0
- wisent/core/parser_arguments/generate_responses_parser.py +15 -0
- wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +127 -0
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +127 -0
- wisent/core/parser_arguments/generate_vector_parser.py +90 -0
- wisent/core/parser_arguments/get_activations_parser.py +90 -0
- wisent/core/parser_arguments/main_parser.py +152 -0
- wisent/core/parser_arguments/model_config_parser.py +59 -0
- wisent/core/parser_arguments/monitor_parser.py +17 -0
- wisent/core/parser_arguments/multi_steer_parser.py +47 -0
- wisent/core/parser_arguments/optimize_classification_parser.py +67 -0
- wisent/core/parser_arguments/optimize_sample_size_parser.py +58 -0
- wisent/core/parser_arguments/optimize_steering_parser.py +147 -0
- wisent/core/parser_arguments/synthetic_parser.py +93 -0
- wisent/core/parser_arguments/tasks_parser.py +584 -0
- wisent/core/parser_arguments/test_nonsense_parser.py +26 -0
- wisent/core/parser_arguments/utils.py +111 -0
- wisent/core/prompts/core/prompt_formater.py +3 -3
- wisent/core/prompts/prompt_stratiegies/direct_completion.py +2 -0
- wisent/core/prompts/prompt_stratiegies/instruction_following.py +2 -0
- wisent/core/prompts/prompt_stratiegies/multiple_choice.py +2 -0
- wisent/core/prompts/prompt_stratiegies/role_playing.py +2 -0
- wisent/{cli/steering_methods/steering_rotator.py → core/steering_methods/rotator.py} +4 -4
- wisent/core/steering_optimizer.py +45 -21
- wisent/{synthetic → core/synthetic}/cleaners/deduper_cleaner.py +3 -3
- wisent/{synthetic → core/synthetic}/cleaners/methods/base_dedupers.py +2 -2
- wisent/{synthetic → core/synthetic}/cleaners/methods/base_refusalers.py +1 -1
- wisent/{synthetic → core/synthetic}/cleaners/pairs_cleaner.py +5 -5
- wisent/{synthetic → core/synthetic}/cleaners/refusaler_cleaner.py +4 -4
- wisent/{synthetic → core/synthetic}/db_instructions/mini_dp.py +1 -1
- wisent/{synthetic → core/synthetic}/generators/diversities/methods/fast_diversity.py +1 -1
- wisent/{synthetic → core/synthetic}/generators/pairs_generator.py +38 -12
- wisent/core/tasks/livecodebench_task.py +4 -103
- wisent/core/timing_calibration.py +1 -1
- {wisent-0.5.12.dist-info → wisent-0.5.13.dist-info}/METADATA +3 -3
- wisent-0.5.13.dist-info/RECORD +294 -0
- wisent-0.5.13.dist-info/entry_points.txt +2 -0
- wisent/benchmarks/coding/providers/livecodebench/provider.py +0 -53
- wisent/classifiers/core/atoms.py +0 -747
- wisent/classifiers/models/logistic.py +0 -29
- wisent/classifiers/models/mlp.py +0 -47
- wisent/cli/classifiers/classifier_rotator.py +0 -137
- wisent/cli/cli_logger.py +0 -142
- wisent/cli/wisent_cli/commands/help_cmd.py +0 -52
- wisent/cli/wisent_cli/commands/listing.py +0 -154
- wisent/cli/wisent_cli/commands/train_cmd.py +0 -322
- wisent/cli/wisent_cli/main.py +0 -93
- wisent/cli/wisent_cli/shell.py +0 -80
- wisent/cli/wisent_cli/ui.py +0 -69
- wisent/cli/wisent_cli/util/aggregations.py +0 -43
- wisent/cli/wisent_cli/util/parsing.py +0 -126
- wisent/cli/wisent_cli/version.py +0 -4
- wisent/opti/methods/__init__.py +0 -0
- wisent/synthetic/__init__.py +0 -0
- wisent/synthetic/cleaners/__init__.py +0 -0
- wisent/synthetic/cleaners/core/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
- wisent/synthetic/db_instructions/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/__init__.py +0 -0
- wisent/synthetic/generators/__init__.py +0 -0
- wisent/synthetic/generators/core/__init__.py +0 -0
- wisent/synthetic/generators/diversities/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/__init__.py +0 -0
- wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
- wisent-0.5.12.dist-info/RECORD +0 -220
- /wisent/{benchmarks → core/evaluators/benchmark_specific/coding}/__init__.py +0 -0
- /wisent/{benchmarks/coding → core/evaluators/benchmark_specific/coding/metrics}/__init__.py +0 -0
- /wisent/{benchmarks/coding/metrics → core/evaluators/benchmark_specific/coding/metrics/core}/__init__.py +0 -0
- /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/core/atoms.py +0 -0
- /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/passk.py +0 -0
- /wisent/{benchmarks/coding/metrics/core → core/evaluators/benchmark_specific/coding/output_sanitizer}/__init__.py +0 -0
- /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/output_sanitizer/core}/__init__.py +0 -0
- /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/core/atoms.py +0 -0
- /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/utils.py +0 -0
- /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/__init__.py +0 -0
- /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/providers}/core/__init__.py +0 -0
- /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/core/atoms.py +0 -0
- /wisent/{benchmarks/coding/providers/core → core/evaluators/benchmark_specific/coding/safe_docker}/__init__.py +0 -0
- /wisent/{benchmarks/coding/providers/livecodebench → core/evaluators/benchmark_specific/coding/safe_docker/core}/__init__.py +0 -0
- /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/atoms.py +0 -0
- /wisent/{benchmarks/coding/safe_docker → core/opti}/__init__.py +0 -0
- /wisent/{benchmarks/coding/safe_docker → core/opti}/core/__init__.py +0 -0
- /wisent/{opti → core/opti}/core/atoms.py +0 -0
- /wisent/{classifiers → core/opti/methods}/__init__.py +0 -0
- /wisent/{opti → core/opti}/methods/opti_classificator.py +0 -0
- /wisent/{opti → core/opti}/methods/opti_steering.py +0 -0
- /wisent/{classifiers/core → core/synthetic}/__init__.py +0 -0
- /wisent/{classifiers/models → core/synthetic/cleaners}/__init__.py +0 -0
- /wisent/{cli → core/synthetic/cleaners/core}/__init__.py +0 -0
- /wisent/{synthetic → core/synthetic}/cleaners/core/atoms.py +0 -0
- /wisent/{cli/classifiers → core/synthetic/cleaners/methods}/__init__.py +0 -0
- /wisent/{cli/data_loaders → core/synthetic/cleaners/methods/core}/__init__.py +0 -0
- /wisent/{synthetic → core/synthetic}/cleaners/methods/core/atoms.py +0 -0
- /wisent/{cli/evaluators → core/synthetic/db_instructions}/__init__.py +0 -0
- /wisent/{cli/steering_methods → core/synthetic/db_instructions/core}/__init__.py +0 -0
- /wisent/{synthetic → core/synthetic}/db_instructions/core/atoms.py +0 -0
- /wisent/{cli/wisent_cli → core/synthetic/generators}/__init__.py +0 -0
- /wisent/{cli/wisent_cli/commands → core/synthetic/generators/core}/__init__.py +0 -0
- /wisent/{synthetic → core/synthetic}/generators/core/atoms.py +0 -0
- /wisent/{cli/wisent_cli/util → core/synthetic/generators/diversities}/__init__.py +0 -0
- /wisent/{opti → core/synthetic/generators/diversities/core}/__init__.py +0 -0
- /wisent/{synthetic → core/synthetic}/generators/diversities/core/core.py +0 -0
- /wisent/{opti/core → core/synthetic/generators/diversities/methods}/__init__.py +0 -0
- {wisent-0.5.12.dist-info → wisent-0.5.13.dist-info}/WHEEL +0 -0
- {wisent-0.5.12.dist-info → wisent-0.5.13.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.5.12.dist-info → wisent-0.5.13.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,258 @@
+"""
+LiveCodeBench solution generator using AI models and code execution.
+
+This module generates and evaluates solutions for LiveCodeBench problems,
+creating good/bad code pairs for contrastive learning.
+"""
+from __future__ import annotations
+
+import json
+import os
+from pathlib import Path
+from typing import Any, Callable, Optional
+from dataclasses import dataclass, asdict
+
+from wisent.core.evaluators.benchmark_specific.coding.providers.livecodebench.provider import LiveCodeBenchProvider
+from wisent.core.evaluators.benchmark_specific.coding.metrics.evaluator import CodingEvaluator, EvaluatorConfig
+from wisent.core.evaluators.benchmark_specific.coding.providers.core.atoms import CodingTask
+
+
+@dataclass
+class SolutionExample:
+    """A single solution example with evaluation result."""
+    model: str
+    code: str
+    result: str  # "good" or "bad"
+    status: str  # "ok", "compile_error", "runtime_error", "timeout"
+    elapsed: float
+
+
+@dataclass
+class ProblemSolutions:
+    """Solutions for a single problem."""
+    question_id: str
+    good_example: Optional[dict[str, Any]] = None
+    bad_example: Optional[dict[str, Any]] = None
+    difficulty: str = "unknown"
+    all_solutions: list[dict[str, Any]] = None
+
+    def __post_init__(self):
+        if self.all_solutions is None:
+            self.all_solutions = []
+
+
+class LiveCodeBenchSolutionGenerator:
+    """
+    Generates and evaluates solutions for LiveCodeBench problems.
+
+    This replicates the wisent-core approach but as an independent system.
+    """
+
+    def __init__(
+        self,
+        model_fns: dict[str, Callable[[CodingTask], dict[str, str]]],
+        cache_dir: str = "./livecodebench_solutions",
+        evaluator_config: Optional[EvaluatorConfig] = None,
+    ):
+        """
+        Initialize the solution generator.
+
+        Args:
+            model_fns: Dictionary mapping model names to solution generation functions.
+                Each function takes a CodingTask and returns a dict of files.
+            cache_dir: Directory to cache generated solutions.
+            evaluator_config: Optional configuration for code evaluation.
+        """
+        self.model_fns = model_fns
+        self.cache_dir = Path(cache_dir)
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+        self.evaluator_config = evaluator_config or EvaluatorConfig(
+            image="coding/sandbox:polyglot-1.0",
+            self_repair=False,
+            time_limit_s=8,
+            cpu_limit_s=3,
+            mem_limit_mb=768,
+        )
+
+        self.cache_file = self.cache_dir / "solutions.json"
+        self._cached_solutions: Optional[dict[str, ProblemSolutions]] = None
+
+    def _load_cache(self) -> dict[str, ProblemSolutions]:
+        """Load cached solutions from disk."""
+        if self._cached_solutions is not None:
+            return self._cached_solutions
+
+        if not self.cache_file.exists():
+            self._cached_solutions = {}
+            return {}
+
+        with open(self.cache_file, 'r') as f:
+            data = json.load(f)
+
+        solutions_map = {}
+        for item in data.get("problems", []):
+            problem_id = item["question_id"]
+            solutions_map[problem_id] = ProblemSolutions(
+                question_id=problem_id,
+                good_example=item.get("good_example"),
+                bad_example=item.get("bad_example"),
+                difficulty=item.get("difficulty", "unknown"),
+                all_solutions=item.get("all_solutions", []),
+            )
+
+        self._cached_solutions = solutions_map
+        return solutions_map
+
+    def _save_cache(self, solutions: dict[str, ProblemSolutions]):
+        """Save solutions to disk."""
+        data = {
+            "total_problems": len(solutions),
+            "problems": [
+                {
+                    "question_id": ps.question_id,
+                    "good_example": ps.good_example,
+                    "bad_example": ps.bad_example,
+                    "difficulty": ps.difficulty,
+                    "all_solutions": ps.all_solutions,
+                }
+                for ps in solutions.values()
+            ]
+        }
+
+        with open(self.cache_file, 'w') as f:
+            json.dump(data, f, indent=2)
+
+        print(f"Saved {len(solutions)} problem solutions to {self.cache_file}")
+
+    def generate_solutions(
+        self,
+        limit: Optional[int] = None,
+        platform: Optional[str] = None,
+        release_version: str = "all",
+        skip_existing: bool = True,
+    ):
+        """
+        Generate solutions for LiveCodeBench problems using multiple AI models.
+
+        Args:
+            limit: Maximum number of problems to process.
+            platform: Filter by platform (leetcode, codeforces, atcoder).
+            release_version: Dataset version (release_v1, release_v2, all).
+            skip_existing: Skip problems that already have good/bad pairs.
+        """
+        # Load cache
+        cached_solutions = self._load_cache()
+
+        # Load problems
+        provider = LiveCodeBenchProvider(
+            language="python",
+            limit=limit,
+            platform=platform,
+            release_version=release_version,
+        )
+
+        problems_processed = 0
+        problems_skipped = 0
+
+        print(f"Processing LiveCodeBench problems...")
+        print(f"Models: {list(self.model_fns.keys())}")
+
+        for idx, task in enumerate(provider.iter_tasks()):
+            question_id = task.options.get("problem_id", f"unknown_{idx}")
+
+            # Skip if already has good/bad pair
+            if skip_existing and question_id in cached_solutions:
+                existing = cached_solutions[question_id]
+                if existing.good_example and existing.bad_example:
+                    problems_skipped += 1
+                    continue
+
+            print(f"\n[{idx + 1}] Processing {question_id}...")
+
+            # Generate solutions with each model
+            solutions = []
+            for model_name, model_fn in self.model_fns.items():
+                print(f"  - Generating with {model_name}...")
+
+                try:
+                    # Generate solution
+                    files = model_fn(task)
+
+                    # Evaluate solution
+                    evaluator = CodingEvaluator(
+                        provider=None,  # Not used for single evaluation
+                        model_fn=lambda _: files,
+                        repair_fn=None,
+                        cfg=self.evaluator_config,
+                    )
+
+                    result = evaluator._run_once(task, {**task.files, **files})
+
+                    # Determine if good or bad
+                    is_good = result.status == "ok"
+
+                    solution = SolutionExample(
+                        model=model_name,
+                        code=files.get("solution.py", ""),
+                        result="good" if is_good else "bad",
+                        status=result.status,
+                        elapsed=result.elapsed,
+                    )
+                    solutions.append(solution)
+
+                    print(f"    Result: {solution.result} ({solution.status}, {solution.elapsed:.2f}s)")
+
+                except Exception as e:
+                    print(f"    Error: {e}")
+                    continue
+
+            # Select best good and bad examples
+            good_solutions = [s for s in solutions if s.result == "good"]
+            bad_solutions = [s for s in solutions if s.result == "bad"]
+
+            problem_solution = ProblemSolutions(
+                question_id=question_id,
+                difficulty=task.options.get("difficulty", "unknown"),
+                all_solutions=[asdict(s) for s in solutions],
+            )
+
+            if good_solutions:
+                # Prefer fastest good solution
+                best_good = min(good_solutions, key=lambda s: s.elapsed)
+                problem_solution.good_example = asdict(best_good)

+            if bad_solutions:
+                # Prefer bad solution with fastest failure
+                best_bad = min(bad_solutions, key=lambda s: s.elapsed)
+                problem_solution.bad_example = asdict(best_bad)
+
+            # Update cache
+            cached_solutions[question_id] = problem_solution
+            problems_processed += 1
+
+            # Save periodically
+            if problems_processed % 10 == 0:
+                self._save_cache(cached_solutions)
+                print(f"\nProgress: {problems_processed} processed, {problems_skipped} skipped")
+
+        # Final save
+        self._save_cache(cached_solutions)
+
+        print(f"\n=== Generation Complete ===")
+        print(f"Problems processed: {problems_processed}")
+        print(f"Problems skipped: {problems_skipped}")
+        print(f"Total in cache: {len(cached_solutions)}")
+
+        # Summary statistics
+        with_good_bad = sum(1 for ps in cached_solutions.values() if ps.good_example and ps.bad_example)
+        print(f"Problems with good+bad pairs: {with_good_bad}")
+
+    def get_solutions(self, question_id: str) -> Optional[ProblemSolutions]:
+        """Get solutions for a specific problem."""
+        cached = self._load_cache()
+        return cached.get(question_id)
+
+    def get_all_solutions(self) -> dict[str, ProblemSolutions]:
+        """Get all cached solutions."""
+        return self._load_cache()
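For orientation, a minimal sketch of how the new LiveCodeBenchSolutionGenerator might be driven. The class name, constructor arguments, and methods come from the hunk above; the toy model callable, its returned file name, and the filter values are illustrative assumptions, and actually running it requires the wisent package plus its Docker sandbox image.

# Sketch only: the callable below is a stand-in for a real LLM-backed generator.
from wisent.core.evaluators.benchmark_specific.coding.solution_generator import (
    LiveCodeBenchSolutionGenerator,
)

def toy_model(task):
    # A real model_fn would prompt an LLM with the CodingTask and return its files;
    # the generator later reads the "solution.py" entry when building pairs.
    return {"solution.py": "def solve():\n    return 0\n"}

generator = LiveCodeBenchSolutionGenerator(
    model_fns={"toy-model": toy_model},     # model name -> (CodingTask -> dict of files)
    cache_dir="./livecodebench_solutions",  # cache written to solutions.json
)
generator.generate_solutions(limit=5, platform="leetcode", skip_existing=True)
print(len(generator.get_all_solutions()))   # number of problems now cached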
@@ -0,0 +1,79 @@
+"""Exact match evaluator for benchmarks requiring exact string matching.
+
+Used for tasks like GSM8K, TriviaQA where the answer must match exactly
+(possibly after normalization).
+"""
+
+from typing import Any
+import logging
+
+from wisent.core.evaluators.core.atoms import BaseEvaluator, EvalResult
+
+logger = logging.getLogger(__name__)
+
+
+class ExactMatchEvaluator(BaseEvaluator):
+    """Evaluator using exact match for answer comparison.
+
+    Compatible with:
+    - GSM8K: Math problems (numerical exact match)
+    - TriviaQA: Factual questions (text exact match)
+    - Any task requiring exact answer matching
+    """
+
+    name = "exact_match"
+    description = "Exact match evaluator for precise answer comparison"
+    task_names = ("gsm8k", "triviaqa")
+
+    def evaluate(self, response: str, expected: Any, **kwargs) -> EvalResult:
+        """Evaluate using exact match.
+
+        Args:
+            response: Generated or extracted answer
+            expected: Expected answer (can be list of acceptable answers)
+            **kwargs:
+                normalize: Whether to normalize before comparison (default: True)
+                case_sensitive: Whether match is case-sensitive (default: False)
+
+        Returns:
+            EvalResult with TRUTHFUL/UNTRUTHFUL
+        """
+        normalize = kwargs.get('normalize', True)
+        case_sensitive = kwargs.get('case_sensitive', False)
+
+        # Handle list of acceptable answers
+        if isinstance(expected, list):
+            expected_answers = expected
+        else:
+            expected_answers = [expected]
+
+        # Prepare response
+        response_clean = str(response).strip()
+        if normalize:
+            response_clean = self.normalize_text(response_clean)
+        if not case_sensitive:
+            response_clean = response_clean.lower()
+
+        # Check each expected answer
+        for exp in expected_answers:
+            exp_clean = str(exp).strip()
+            if normalize:
+                exp_clean = self.normalize_text(exp_clean)
+            if not case_sensitive:
+                exp_clean = exp_clean.lower()
+
+            if response_clean == exp_clean:
+                return EvalResult(
+                    ground_truth="TRUTHFUL",
+                    method_used=self.name,
+                    confidence=1.0,
+                    details=f"Exact match: '{response}' == '{exp}'",
+                    meta={"matched_answer": exp}
+                )
+
+        return EvalResult(
+            ground_truth="UNTRUTHFUL",
+            method_used=self.name,
+            confidence=0.0,
+            details=f"No match: '{response}' not in {expected_answers}",
+        )
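A short usage sketch for the new evaluator, assuming BaseEvaluator needs no constructor arguments and that EvalResult exposes the fields it is constructed with; the answers are invented.

from wisent.core.evaluators.benchmark_specific.exact_match_evaluator import ExactMatchEvaluator

evaluator = ExactMatchEvaluator()
# Several acceptable answers; comparison is normalized and case-insensitive by default.
result = evaluator.evaluate("Paris", expected=["paris", "City of Light"])
print(result.ground_truth)  # "TRUTHFUL" when any normalized answer matches exactly
print(result.confidence)    # 1.0 on a match, 0.0 otherwise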
@@ -0,0 +1,101 @@
+"""F1 score evaluator for benchmarks requiring token-level comparison.
+
+Used for reading comprehension tasks where partial credit is appropriate.
+"""
+
+from typing import Any, Set
+import logging
+
+from wisent.core.evaluators.core.atoms import BaseEvaluator, EvalResult
+
+logger = logging.getLogger(__name__)
+
+
+class F1Evaluator(BaseEvaluator):
+    """Evaluator using F1 score for token-level comparison.
+
+    Compatible with:
+    - DROP: Reading comprehension with discrete reasoning
+    - SQuAD: Question answering
+    - Any task where partial token overlap should be rewarded
+    """
+
+    name = "f1"
+    description = "F1 score evaluator for token-level comparison"
+    task_names = ("drop", "squad")
+
+    def evaluate(self, response: str, expected: Any, **kwargs) -> EvalResult:
+        """Evaluate using F1 score.
+
+        Args:
+            response: Generated answer
+            expected: Expected answer (can be list of acceptable answers)
+            **kwargs:
+                normalize: Whether to normalize tokens (default: True)
+
+        Returns:
+            EvalResult with F1 score as confidence
+        """
+        normalize = kwargs.get('normalize', True)
+
+        # Handle list of acceptable answers - use best F1
+        if isinstance(expected, list):
+            expected_answers = expected
+        else:
+            expected_answers = [expected]
+
+        best_f1 = 0.0
+        best_match = None
+
+        for exp in expected_answers:
+            f1 = self._compute_f1(response, str(exp), normalize)
+            if f1 > best_f1:
+                best_f1 = f1
+                best_match = exp
+
+        # Determine ground truth based on F1 threshold
+        if best_f1 >= 0.8:
+            ground_truth = "TRUTHFUL"
+        elif best_f1 >= 0.5:
+            ground_truth = "UNKNOWN"  # Partial match
+        else:
+            ground_truth = "UNTRUTHFUL"
+
+        return EvalResult(
+            ground_truth=ground_truth,
+            method_used=self.name,
+            confidence=best_f1,
+            details=f"F1 score: {best_f1:.3f} (response vs '{best_match}')",
+            meta={"f1_score": best_f1, "matched_answer": best_match}
+        )
+
+    def _compute_f1(self, response: str, expected: str, normalize: bool = True) -> float:
+        """Compute F1 score between response and expected.
+
+        Returns:
+            F1 score in [0, 1]
+        """
+        # Tokenize
+        response_tokens = self._tokenize(response, normalize)
+        expected_tokens = self._tokenize(expected, normalize)
+
+        if not response_tokens or not expected_tokens:
+            return 0.0
+
+        # Compute precision and recall
+        common = response_tokens & expected_tokens
+        if not common:
+            return 0.0
+
+        precision = len(common) / len(response_tokens)
+        recall = len(common) / len(expected_tokens)
+
+        # Compute F1
+        f1 = 2 * (precision * recall) / (precision + recall)
+        return f1
+
+    def _tokenize(self, text: str, normalize: bool = True) -> Set[str]:
+        """Tokenize text into set of tokens."""
+        if normalize:
+            text = self.normalize_text(text)
+        return set(text.split())
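To make the scoring concrete, here is a self-contained re-implementation of the token-overlap F1 used by _compute_f1, with simple lowercasing standing in for BaseEvaluator.normalize_text (an assumption). With the thresholds above, a best F1 of 0.8 or higher maps to TRUTHFUL and 0.5-0.8 to UNKNOWN.

def token_f1(response: str, expected: str) -> float:
    # Whitespace tokenization into sets, mirroring F1Evaluator._compute_f1.
    resp, exp = set(response.lower().split()), set(expected.lower().split())
    common = resp & exp
    if not common:
        return 0.0
    precision = len(common) / len(resp)  # overlap relative to the response
    recall = len(common) / len(exp)      # overlap relative to the reference
    return 2 * precision * recall / (precision + recall)

# precision = 2/3, recall = 2/2, so F1 = 0.8
print(token_f1("the eiffel tower", "eiffel tower"))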
@@ -0,0 +1,197 @@
+"""Generation-based evaluator for benchmarks that require text generation.
+
+This evaluator handles tasks like GSM8K, DROP, TriviaQA where the model generates
+free-form text that must be parsed and compared to reference answers.
+"""
+
+import re
+from typing import Any, Dict
+import logging
+
+from wisent.core.evaluators.core.atoms import BaseEvaluator, EvalResult
+
+logger = logging.getLogger(__name__)
+
+
+class GenerationEvaluator(BaseEvaluator):
+    """Evaluator for generation-based benchmarks.
+
+    Handles tasks that require text generation followed by answer extraction:
+    - GSM8K: Math word problems requiring numerical answers
+    - DROP: Reading comprehension with discrete reasoning
+    - TriviaQA: Trivia questions requiring factual answers
+
+    Methods:
+    - Exact match: Direct string comparison
+    - Normalized match: Lenient comparison after normalization
+    - Numerical match: For math problems
+    """
+
+    name = "generation"
+    description = "Generation-based evaluator for text generation tasks"
+    task_names = ("gsm8k", "drop", "triviaqa")
+
+    def evaluate(self, response: str, expected: Any, **kwargs) -> EvalResult:
+        """Evaluate generated response against expected answer.
+
+        Args:
+            response: Generated model response
+            expected: Expected answer (str, int, float, or list of acceptable answers)
+            **kwargs:
+                task_name: Task name for task-specific logic
+                answer_type: Type of answer ("numerical", "text", "exact")
+                normalize: Whether to normalize strings before comparison
+
+        Returns:
+            EvalResult with TRUTHFUL/UNTRUTHFUL/UNKNOWN
+        """
+        task_name = kwargs.get('task_name', '')
+        answer_type = kwargs.get('answer_type', 'text')
+        normalize = kwargs.get('normalize', True)
+
+        # Extract answer from generated response
+        extracted_answer = self._extract_answer(response, task_name, answer_type)
+
+        if extracted_answer is None:
+            return EvalResult(
+                ground_truth="UNKNOWN",
+                method_used=self.name,
+                confidence=0.0,
+                details="Could not extract answer from generated response",
+            )
+
+        # Normalize expected answer
+        if isinstance(expected, list):
+            expected_answers = expected
+        else:
+            expected_answers = [expected]
+
+        # Check if extracted answer matches any expected answer
+        is_correct, matched_answer, confidence = self._check_match(
+            extracted_answer, expected_answers, answer_type, normalize
+        )
+
+        return EvalResult(
+            ground_truth="TRUTHFUL" if is_correct else "UNTRUTHFUL",
+            method_used=self.name,
+            confidence=confidence,
+            details=f"Extracted: '{extracted_answer}', Expected: '{matched_answer or expected_answers[0]}'",
+            meta={
+                "extracted_answer": extracted_answer,
+                "expected_answers": expected_answers,
+                "matched_answer": matched_answer,
+                "answer_type": answer_type,
+            }
+        )
+
+    def _extract_answer(self, response: str, task_name: str, answer_type: str) -> Any:
+        """Extract answer from generated response."""
+        if answer_type == "numerical" or task_name == "gsm8k":
+            return self._extract_numerical_answer(response)
+        else:
+            return self._extract_text_answer(response)
+
+    def _extract_numerical_answer(self, response: str) -> float:
+        """Extract numerical answer from response (for math problems)."""
+        # Look for common patterns
+        patterns = [
+            r'####\s*([-+]?\d*\.?\d+)',  # GSM8K format
+            r'answer\s*is\s*([-+]?\d*\.?\d+)',
+            r'=\s*([-+]?\d*\.?\d+)\s*$',
+            r'\$?\s*([-+]?\d*\.?\d+)',
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, response, re.IGNORECASE)
+            if match:
+                try:
+                    return float(match.group(1))
+                except ValueError:
+                    continue
+
+        # Fallback: find last number in response
+        numbers = re.findall(r'[-+]?\d*\.?\d+', response)
+        if numbers:
+            try:
+                return float(numbers[-1])
+            except ValueError:
+                pass
+
+        return None
+
+    def _extract_text_answer(self, response: str) -> str:
+        """Extract text answer from response."""
+        # Look for explicit answer markers
+        patterns = [
+            r'answer\s*is:?\s*(.+?)(?:\n|$)',
+            r'final\s+answer:?\s*(.+?)(?:\n|$)',
+            r'(?:^|\n)answer:?\s*(.+?)(?:\n|$)',
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, response, re.IGNORECASE)
+            if match:
+                return match.group(1).strip()
+
+        # Fallback: use first sentence
+        sentences = re.split(r'[.!?]\s+', response)
+        if sentences:
+            return sentences[0].strip()
+
+        return response.strip()
+
+    def _check_match(
+        self, extracted: Any, expected_list: list, answer_type: str, normalize: bool
+    ) -> tuple:
+        """Check if extracted answer matches any expected answer.
+
+        Returns:
+            (is_correct, matched_answer, confidence)
+        """
+        if answer_type == "numerical":
+            return self._check_numerical_match(extracted, expected_list)
+        else:
+            return self._check_text_match(extracted, expected_list, normalize)
+
+    def _check_numerical_match(self, extracted: float, expected_list: list) -> tuple:
+        """Check numerical match with tolerance."""
+        if extracted is None:
+            return False, None, 0.0
+
+        for expected in expected_list:
+            try:
+                expected_num = float(expected)
+                # Check if close enough (tolerance for floating point)
+                if abs(extracted - expected_num) < 1e-6:
+                    return True, expected, 1.0
+            except (ValueError, TypeError):
+                continue
+
+        return False, None, 0.0
+
+    def _check_text_match(self, extracted: str, expected_list: list, normalize: bool) -> tuple:
+        """Check text match with optional normalization."""
+        if extracted is None:
+            return False, None, 0.0
+
+        if normalize:
+            extracted_norm = self.normalize_text(extracted)
+        else:
+            extracted_norm = extracted
+
+        for expected in expected_list:
+            expected_str = str(expected)
+            if normalize:
+                expected_norm = self.normalize_text(expected_str)
+            else:
+                expected_norm = expected_str
+
+            # Exact match
+            if extracted_norm == expected_norm:
+                return True, expected, 1.0
+
+            # Substring match
+            if extracted_norm in expected_norm or expected_norm in extracted_norm:
+                return True, expected, 0.8
+
+        return False, None, 0.0
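A sketch of the extraction path for a GSM8K-style completion, again assuming a no-argument constructor; the response text is invented, and the "#### <number>" pattern is the first regex tried by _extract_numerical_answer.

from wisent.core.evaluators.benchmark_specific.generation_evaluator import GenerationEvaluator

evaluator = GenerationEvaluator()
response = "16 - 3 - 4 = 9 eggs, and 9 * 2 = 18 dollars.\n#### 18"
result = evaluator.evaluate(
    response,
    expected="18",
    task_name="gsm8k",        # routes extraction to the numerical path
    answer_type="numerical",  # expected values are compared as floats with 1e-6 tolerance
)
print(result.ground_truth, result.confidence)  # should report TRUTHFUL 1.0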
@@ -9,13 +9,21 @@ directly on each choice to evaluate performance against known ground truth.
 
 import logging
 from typing import Any, Dict, Optional
+from dataclasses import dataclass
 
-from wisent.core.activations import ActivationAggregationStrategy
-from wisent.core.
+from wisent.core.activations.core.atoms import ActivationAggregationStrategy
+from wisent.core.activations.activations import Activations
 
 logger = logging.getLogger(__name__)
 
 
+@dataclass
+class Layer:
+    """Simple layer metadata class."""
+    index: int
+    type: str = "transformer"
+
+
 class LogLikelihoodsEvaluator:
     """
     Evaluator for log-likelihoods based ground truth assessment.
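This hunk mainly moves the activation imports to their new 0.5.13 locations and adds a small Layer dataclass. A minimal sketch of the new symbols, with import paths taken from the hunk and the file list above (the layer index is an arbitrary example value):

from wisent.core.activations.core.atoms import ActivationAggregationStrategy
from wisent.core.activations.activations import Activations
from wisent.core.evaluators.benchmark_specific.log_likelihoods_evaluator import Layer

layer = Layer(index=12)         # type defaults to "transformer"
print(layer.index, layer.type)  # 12 transformer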