wisent 0.5.12__py3-none-any.whl → 0.5.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wisent might be problematic. Click here for more details.

Files changed (227)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/__init__.py +26 -0
  3. wisent/core/activations/activations.py +96 -0
  4. wisent/core/activations/activations_collector.py +71 -20
  5. wisent/core/activations/prompt_construction_strategy.py +47 -0
  6. wisent/core/agent/__init__.py +1 -18
  7. wisent/core/agent/budget.py +2 -2
  8. wisent/core/agent/device_benchmarks.py +1 -1
  9. wisent/core/agent/diagnose/__init__.py +1 -55
  10. wisent/core/agent/diagnose/classifier_marketplace.py +8 -8
  11. wisent/core/agent/diagnose/response_diagnostics.py +4 -4
  12. wisent/core/agent/diagnose/synthetic_classifier_option.py +1 -1
  13. wisent/core/agent/diagnose/tasks/task_manager.py +3 -3
  14. wisent/core/agent/diagnose.py +2 -1
  15. wisent/core/autonomous_agent.py +10 -2
  16. wisent/core/benchmark_extractors.py +293 -0
  17. wisent/core/bigcode_integration.py +20 -7
  18. wisent/core/branding.py +108 -0
  19. wisent/core/cli/__init__.py +15 -0
  20. wisent/core/cli/create_steering_vector.py +138 -0
  21. wisent/core/cli/evaluate_responses.py +715 -0
  22. wisent/core/cli/generate_pairs.py +128 -0
  23. wisent/core/cli/generate_pairs_from_task.py +119 -0
  24. wisent/core/cli/generate_responses.py +129 -0
  25. wisent/core/cli/generate_vector_from_synthetic.py +149 -0
  26. wisent/core/cli/generate_vector_from_task.py +147 -0
  27. wisent/core/cli/get_activations.py +191 -0
  28. wisent/core/cli/optimize_classification.py +339 -0
  29. wisent/core/cli/optimize_steering.py +364 -0
  30. wisent/core/cli/tasks.py +182 -0
  31. wisent/core/cli_logger.py +22 -0
  32. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +27 -1
  33. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +49 -1
  34. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +115 -0
  35. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +115 -0
  36. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +111 -0
  37. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +119 -0
  38. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +115 -0
  39. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +114 -0
  40. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +118 -0
  41. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +146 -0
  42. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +129 -0
  43. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +119 -0
  44. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +112 -0
  45. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +113 -0
  46. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livecodebench.py +367 -0
  47. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +115 -0
  48. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +114 -0
  49. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +113 -0
  50. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +112 -0
  51. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +111 -0
  52. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +114 -0
  53. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +113 -0
  54. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +115 -0
  55. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +111 -0
  56. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +113 -0
  57. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +113 -0
  58. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +112 -0
  59. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +116 -0
  60. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +115 -0
  61. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +111 -0
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +111 -0
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +121 -0
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +121 -0
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +111 -0
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +110 -0
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +114 -0
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad2.py +124 -0
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py +111 -0
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +112 -0
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +127 -0
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_gen.py +112 -0
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +117 -0
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +117 -0
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +127 -0
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +119 -0
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +1 -1
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +111 -0
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +114 -0
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +112 -0
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +114 -0
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +114 -0
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +1 -1
  84. wisent/core/data_loaders/__init__.py +235 -0
  85. wisent/core/data_loaders/loaders/lm_loader.py +2 -2
  86. wisent/core/data_loaders/loaders/task_interface_loader.py +300 -0
  87. wisent/{cli/data_loaders/data_loader_rotator.py → core/data_loaders/rotator.py} +1 -1
  88. wisent/core/download_full_benchmarks.py +79 -2
  89. wisent/core/evaluators/benchmark_specific/__init__.py +26 -0
  90. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/evaluator.py +17 -17
  91. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/cpp_sanitizer.py +2 -2
  92. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/java_sanitizer.py +2 -2
  93. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/python_sanitizer.py +2 -2
  94. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/__init__.py +3 -0
  95. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py +305 -0
  96. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/runtime.py +36 -4
  97. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/entrypoint.py +2 -4
  98. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/recipes.py +1 -1
  99. wisent/core/evaluators/benchmark_specific/coding/solution_generator.py +258 -0
  100. wisent/core/evaluators/benchmark_specific/exact_match_evaluator.py +79 -0
  101. wisent/core/evaluators/benchmark_specific/f1_evaluator.py +101 -0
  102. wisent/core/evaluators/benchmark_specific/generation_evaluator.py +197 -0
  103. wisent/core/{log_likelihoods_evaluator.py → evaluators/benchmark_specific/log_likelihoods_evaluator.py} +10 -2
  104. wisent/core/evaluators/benchmark_specific/perplexity_evaluator.py +140 -0
  105. wisent/core/evaluators/benchmark_specific/personalization_evaluator.py +250 -0
  106. wisent/{cli/evaluators/evaluator_rotator.py → core/evaluators/rotator.py} +4 -4
  107. wisent/core/lm_eval_harness_ground_truth.py +3 -2
  108. wisent/core/main.py +57 -0
  109. wisent/core/model_persistence.py +2 -2
  110. wisent/core/models/wisent_model.py +6 -6
  111. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  112. wisent/core/optuna/steering/steering_optimization.py +1 -1
  113. wisent/core/parser_arguments/__init__.py +10 -0
  114. wisent/core/parser_arguments/agent_parser.py +110 -0
  115. wisent/core/parser_arguments/configure_model_parser.py +7 -0
  116. wisent/core/parser_arguments/create_steering_vector_parser.py +59 -0
  117. wisent/core/parser_arguments/evaluate_parser.py +40 -0
  118. wisent/core/parser_arguments/evaluate_responses_parser.py +10 -0
  119. wisent/core/parser_arguments/full_optimize_parser.py +115 -0
  120. wisent/core/parser_arguments/generate_pairs_from_task_parser.py +33 -0
  121. wisent/core/parser_arguments/generate_pairs_parser.py +29 -0
  122. wisent/core/parser_arguments/generate_responses_parser.py +15 -0
  123. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +127 -0
  124. wisent/core/parser_arguments/generate_vector_from_task_parser.py +127 -0
  125. wisent/core/parser_arguments/generate_vector_parser.py +90 -0
  126. wisent/core/parser_arguments/get_activations_parser.py +90 -0
  127. wisent/core/parser_arguments/main_parser.py +152 -0
  128. wisent/core/parser_arguments/model_config_parser.py +59 -0
  129. wisent/core/parser_arguments/monitor_parser.py +17 -0
  130. wisent/core/parser_arguments/multi_steer_parser.py +47 -0
  131. wisent/core/parser_arguments/optimize_classification_parser.py +67 -0
  132. wisent/core/parser_arguments/optimize_sample_size_parser.py +58 -0
  133. wisent/core/parser_arguments/optimize_steering_parser.py +147 -0
  134. wisent/core/parser_arguments/synthetic_parser.py +93 -0
  135. wisent/core/parser_arguments/tasks_parser.py +584 -0
  136. wisent/core/parser_arguments/test_nonsense_parser.py +26 -0
  137. wisent/core/parser_arguments/utils.py +111 -0
  138. wisent/core/prompts/core/prompt_formater.py +3 -3
  139. wisent/core/prompts/prompt_stratiegies/direct_completion.py +2 -0
  140. wisent/core/prompts/prompt_stratiegies/instruction_following.py +2 -0
  141. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +2 -0
  142. wisent/core/prompts/prompt_stratiegies/role_playing.py +2 -0
  143. wisent/{cli/steering_methods/steering_rotator.py → core/steering_methods/rotator.py} +4 -4
  144. wisent/core/steering_optimizer.py +45 -21
  145. wisent/{synthetic → core/synthetic}/cleaners/deduper_cleaner.py +3 -3
  146. wisent/{synthetic → core/synthetic}/cleaners/methods/base_dedupers.py +2 -2
  147. wisent/{synthetic → core/synthetic}/cleaners/methods/base_refusalers.py +1 -1
  148. wisent/{synthetic → core/synthetic}/cleaners/pairs_cleaner.py +5 -5
  149. wisent/{synthetic → core/synthetic}/cleaners/refusaler_cleaner.py +4 -4
  150. wisent/{synthetic → core/synthetic}/db_instructions/mini_dp.py +1 -1
  151. wisent/{synthetic → core/synthetic}/generators/diversities/methods/fast_diversity.py +1 -1
  152. wisent/{synthetic → core/synthetic}/generators/pairs_generator.py +38 -12
  153. wisent/core/tasks/livecodebench_task.py +4 -103
  154. wisent/core/timing_calibration.py +1 -1
  155. {wisent-0.5.12.dist-info → wisent-0.5.14.dist-info}/METADATA +3 -3
  156. wisent-0.5.14.dist-info/RECORD +294 -0
  157. wisent-0.5.14.dist-info/entry_points.txt +2 -0
  158. wisent/benchmarks/coding/providers/livecodebench/provider.py +0 -53
  159. wisent/classifiers/core/atoms.py +0 -747
  160. wisent/classifiers/models/logistic.py +0 -29
  161. wisent/classifiers/models/mlp.py +0 -47
  162. wisent/cli/classifiers/classifier_rotator.py +0 -137
  163. wisent/cli/cli_logger.py +0 -142
  164. wisent/cli/wisent_cli/commands/help_cmd.py +0 -52
  165. wisent/cli/wisent_cli/commands/listing.py +0 -154
  166. wisent/cli/wisent_cli/commands/train_cmd.py +0 -322
  167. wisent/cli/wisent_cli/main.py +0 -93
  168. wisent/cli/wisent_cli/shell.py +0 -80
  169. wisent/cli/wisent_cli/ui.py +0 -69
  170. wisent/cli/wisent_cli/util/aggregations.py +0 -43
  171. wisent/cli/wisent_cli/util/parsing.py +0 -126
  172. wisent/cli/wisent_cli/version.py +0 -4
  173. wisent/opti/methods/__init__.py +0 -0
  174. wisent/synthetic/__init__.py +0 -0
  175. wisent/synthetic/cleaners/__init__.py +0 -0
  176. wisent/synthetic/cleaners/core/__init__.py +0 -0
  177. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  178. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  179. wisent/synthetic/db_instructions/__init__.py +0 -0
  180. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  181. wisent/synthetic/generators/__init__.py +0 -0
  182. wisent/synthetic/generators/core/__init__.py +0 -0
  183. wisent/synthetic/generators/diversities/__init__.py +0 -0
  184. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  185. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  186. wisent-0.5.12.dist-info/RECORD +0 -220
  187. /wisent/{benchmarks → core/evaluators/benchmark_specific/coding}/__init__.py +0 -0
  188. /wisent/{benchmarks/coding → core/evaluators/benchmark_specific/coding/metrics}/__init__.py +0 -0
  189. /wisent/{benchmarks/coding/metrics → core/evaluators/benchmark_specific/coding/metrics/core}/__init__.py +0 -0
  190. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/core/atoms.py +0 -0
  191. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/passk.py +0 -0
  192. /wisent/{benchmarks/coding/metrics/core → core/evaluators/benchmark_specific/coding/output_sanitizer}/__init__.py +0 -0
  193. /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/output_sanitizer/core}/__init__.py +0 -0
  194. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/core/atoms.py +0 -0
  195. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/utils.py +0 -0
  196. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/__init__.py +0 -0
  197. /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/providers}/core/__init__.py +0 -0
  198. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/core/atoms.py +0 -0
  199. /wisent/{benchmarks/coding/providers/core → core/evaluators/benchmark_specific/coding/safe_docker}/__init__.py +0 -0
  200. /wisent/{benchmarks/coding/providers/livecodebench → core/evaluators/benchmark_specific/coding/safe_docker/core}/__init__.py +0 -0
  201. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/atoms.py +0 -0
  202. /wisent/{benchmarks/coding/safe_docker → core/opti}/__init__.py +0 -0
  203. /wisent/{benchmarks/coding/safe_docker → core/opti}/core/__init__.py +0 -0
  204. /wisent/{opti → core/opti}/core/atoms.py +0 -0
  205. /wisent/{classifiers → core/opti/methods}/__init__.py +0 -0
  206. /wisent/{opti → core/opti}/methods/opti_classificator.py +0 -0
  207. /wisent/{opti → core/opti}/methods/opti_steering.py +0 -0
  208. /wisent/{classifiers/core → core/synthetic}/__init__.py +0 -0
  209. /wisent/{classifiers/models → core/synthetic/cleaners}/__init__.py +0 -0
  210. /wisent/{cli → core/synthetic/cleaners/core}/__init__.py +0 -0
  211. /wisent/{synthetic → core/synthetic}/cleaners/core/atoms.py +0 -0
  212. /wisent/{cli/classifiers → core/synthetic/cleaners/methods}/__init__.py +0 -0
  213. /wisent/{cli/data_loaders → core/synthetic/cleaners/methods/core}/__init__.py +0 -0
  214. /wisent/{synthetic → core/synthetic}/cleaners/methods/core/atoms.py +0 -0
  215. /wisent/{cli/evaluators → core/synthetic/db_instructions}/__init__.py +0 -0
  216. /wisent/{cli/steering_methods → core/synthetic/db_instructions/core}/__init__.py +0 -0
  217. /wisent/{synthetic → core/synthetic}/db_instructions/core/atoms.py +0 -0
  218. /wisent/{cli/wisent_cli → core/synthetic/generators}/__init__.py +0 -0
  219. /wisent/{cli/wisent_cli/commands → core/synthetic/generators/core}/__init__.py +0 -0
  220. /wisent/{synthetic → core/synthetic}/generators/core/atoms.py +0 -0
  221. /wisent/{cli/wisent_cli/util → core/synthetic/generators/diversities}/__init__.py +0 -0
  222. /wisent/{opti → core/synthetic/generators/diversities/core}/__init__.py +0 -0
  223. /wisent/{synthetic → core/synthetic}/generators/diversities/core/core.py +0 -0
  224. /wisent/{opti/core → core/synthetic/generators/diversities/methods}/__init__.py +0 -0
  225. {wisent-0.5.12.dist-info → wisent-0.5.14.dist-info}/WHEEL +0 -0
  226. {wisent-0.5.12.dist-info → wisent-0.5.14.dist-info}/licenses/LICENSE +0 -0
  227. {wisent-0.5.12.dist-info → wisent-0.5.14.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,26 @@
1
+ """Benchmark-specific evaluators for lm-eval tasks.
2
+
3
+ This module provides evaluation methods that match lm-eval's native approaches:
4
+ - Log likelihood evaluation for multiple-choice tasks
5
+ - Generation evaluation for text generation tasks
6
+ - Exact match evaluation for precise answer matching
7
+ - F1 evaluation for token-level comparison
8
+ - Perplexity evaluation for language modeling
9
+ - Personalization evaluation for personality trait manifestation
10
+ """
11
+
12
+ from .log_likelihoods_evaluator import LogLikelihoodsEvaluator
13
+ from .generation_evaluator import GenerationEvaluator
14
+ from .exact_match_evaluator import ExactMatchEvaluator
15
+ from .f1_evaluator import F1Evaluator
16
+ from .perplexity_evaluator import PerplexityEvaluator
17
+ from .personalization_evaluator import PersonalizationEvaluator
18
+
19
+ __all__ = [
20
+ 'LogLikelihoodsEvaluator',
21
+ 'GenerationEvaluator',
22
+ 'ExactMatchEvaluator',
23
+ 'F1Evaluator',
24
+ 'PerplexityEvaluator',
25
+ 'PersonalizationEvaluator',
26
+ ]
@@ -2,19 +2,19 @@ from __future__ import annotations
2
2
  from dataclasses import dataclass
3
3
  from typing import Callable, Iterable, Optional, TYPE_CHECKING
4
4
 
5
- from wisent.benchmarks.coding.safe_docker.core.runtime import DockerSandboxExecutor
6
- from wisent.benchmarks.coding.safe_docker.recipes import RECIPE_REGISTRY
7
- from wisent.benchmarks.coding.metrics.core.atoms import SampleOutcome, Evaluator
5
+ from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.runtime import DockerSandboxExecutor
6
+ from wisent.core.evaluators.benchmark_specific.coding.safe_docker.recipes import RECIPE_REGISTRY
7
+ from wisent.core.evaluators.benchmark_specific.coding.metrics.core.atoms import SampleOutcome, Evaluator
8
8
 
9
- from wisent.benchmarks.coding.output_sanitizer.core.atoms import TaskSchema
10
- from wisent.benchmarks.coding.output_sanitizer.python_sanitizer import PythonStandardizer
11
- from wisent.benchmarks.coding.output_sanitizer.cpp_sanitizer import CppStandardizer
12
- from wisent.benchmarks.coding.output_sanitizer.java_sanitizer import JavaStandardizer
9
+ from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.core.atoms import TaskSchema
10
+ from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.python_sanitizer import PythonStandardizer
11
+ from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.cpp_sanitizer import CppStandardizer
12
+ from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.java_sanitizer import JavaStandardizer
13
13
 
14
14
  if TYPE_CHECKING:
15
- from wisent.benchmarks.coding.safe_docker.core.atoms import Result
16
- from wisent.benchmarks.coding.providers.core.atoms import Provider, CodingTask
17
- from wisent.benchmarks.coding.output_sanitizer.core.atoms import CodeStandardizer
15
+ from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Result
16
+ from wisent.core.evaluators.benchmark_specific.coding.providers.core.atoms import Provider, CodingTask
17
+ from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.core.atoms import CodeStandardizer
18
18
 
19
19
  RepairFn = Callable[[str, dict[str,str], str], dict[str,str]]
20
20
 
@@ -82,7 +82,7 @@ def _make_schema(task: CodingTask) -> TaskSchema:
82
82
  and allow_wrapper set appropriately.
83
83
 
84
84
  example:
85
- >>> from wisent.benchmarks.coding.providers.core.atoms import CodingTask
85
+ >>> from wisent.core.evaluators.benchmark_specific.coding.providers.core.atoms import CodingTask
86
86
  >>> task = CodingTask(language="python", files={}, options={"entry_point":"add","file_name":"my_solution.py"})
87
87
  >>> schema = _make_schema(task)
88
88
  >>> schema.language
@@ -128,7 +128,7 @@ class CodingEvaluator(Evaluator):
128
128
  Feedback string summarizing the result, truncated to cfg.feedback_max_chars.
129
129
 
130
130
  examples:
131
- >>> from wisent.benchmarks.coding.safe_docker.core.atoms import Result
131
+ >>> from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Result
132
132
  >>> res = Result(status="timeout", stdout="", stderr="", elapsed=10.0)
133
133
  >>> evaluator = CodingEvaluator(provider=None, model_fn=lambda x: {}, cfg=EvaluatorConfig())
134
134
  >>> evaluator._feedback(res)
@@ -163,8 +163,8 @@ class CodingEvaluator(Evaluator):
163
163
  Result object containing the status, stdout, stderr, and elapsed time.
164
164
 
165
165
  examples:
166
- >>> from wisent.benchmarks.coding.providers.core.atoms import CodingTask
167
- >>> from wisent.benchmarks.coding.safe_docker.core.atoms import Result
166
+ >>> from wisent.core.evaluators.benchmark_specific.coding.providers.core.atoms import CodingTask
167
+ >>> from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Result
168
168
  >>> task = CodingTask(language="python", files={}, options={})
169
169
  >>> files = {"solution.py": "def add(a,b): return a + b", "tests.py": "from solution import add\ndef test_ok(): assert add(1,2)==3"}
170
170
  >>> evaluator = CodingEvaluator(provider=None, model_fn=lambda x: {})
@@ -181,7 +181,7 @@ class CodingEvaluator(Evaluator):
181
181
  0.23
182
182
  """
183
183
  recipe = RECIPE_REGISTRY[task.language]
184
- job = recipe.make_job(files, **task.options,
184
+ job = recipe.make_job(**task.options,
185
185
  time_limit_s=self.cfg.time_limit_s,
186
186
  cpu_limit_s=self.cfg.cpu_limit_s,
187
187
  mem_limit_mb=self.cfg.mem_limit_mb)
@@ -201,7 +201,7 @@ class CodingEvaluator(Evaluator):
201
201
  The sanitized files if pre_sanitize is True and a sanitizer exists for the language; otherwise, the original files.
202
202
 
203
203
  examples:
204
- >>> from wisent.benchmarks.coding.providers.core.atoms import CodingTask
204
+ >>> from wisent.core.evaluators.benchmark_specific.coding.providers.core.atoms import CodingTask
205
205
  >>> task = CodingTask(language="python", files={}, options={"entry_point":"add","file_name":"my_solution.py"})
206
206
  >>> files = {"my_solution.py": "def add(a,b): return a - b # BUG"}
207
207
  >>> evaluator = CodingEvaluator(provider=None, model_fn=lambda x: {}, cfg=EvaluatorConfig(pre_sanitize=True))
@@ -234,7 +234,7 @@ class CodingEvaluator(Evaluator):
234
234
  SampleOutcome for each task, indicating pass/fail status and elapsed time.
235
235
 
236
236
  examples:
237
- >>> from wisent.benchmarks.coding.providers.core.atoms import CodingTask, Provider
237
+ >>> from wisent.core.evaluators.benchmark_specific.coding.providers.core.atoms import CodingTask, Provider
238
238
  >>> class DummyProvider:
239
239
  ... name = "dummy"
240
240
  ... def iter_tasks(self):
@@ -1,8 +1,8 @@
1
1
  from __future__ import annotations
2
2
  import re
3
3
  from typing import List
4
- from wisent.benchmarks.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
5
- from wisent.benchmarks.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace
4
+ from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
5
+ from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace
6
6
 
7
7
  FUNC_RE = re.compile(r"^\s*(?:template<[^>]+>\s*)?(?:[\w:\s*&<>,]+)\s+(\w+)\s*\(", re.MULTILINE)
8
8
  CLASS_RE = re.compile(r"^\s*class\s+(\w+)\s*[{:]", re.MULTILINE)
@@ -2,8 +2,8 @@
2
2
  from __future__ import annotations
3
3
  import re
4
4
  from typing import List
5
- from wisent.benchmarks.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
6
- from wisent.benchmarks.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace
5
+ from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
6
+ from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace
7
7
 
8
8
  CLASS_RE = re.compile(r"\bclass\s+([A-Za-z_]\w*)")
9
9
  METHOD_RE = re.compile(r"(public\s+static\s+[\w\<\>\[\]]+\s+)(\w+)\s*\(")
@@ -2,8 +2,8 @@
2
2
  from __future__ import annotations
3
3
  import ast, re
4
4
  from typing import List
5
- from wisent.benchmarks.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
6
- from wisent.benchmarks.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace, maybe_black
5
+ from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
6
+ from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace, maybe_black
7
7
 
8
8
  class PythonStandardizer(CodeStandardizer):
9
9
  language = "python"
@@ -0,0 +1,3 @@
1
+ from .provider import LiveCodeBenchProvider
2
+
3
+ __all__ = ["LiveCodeBenchProvider"]
@@ -0,0 +1,305 @@
1
+ # coding/providers/livecodebench/provider.py
2
+ from __future__ import annotations
3
+ import json
4
+ from typing import Iterable, Optional
5
+ from ..core.atoms import CodingTask, Language
6
+
7
+
8
class LiveCodeBenchProvider:
    """
    LiveCodeBench provider: loads real coding problems from HuggingFace.

    Dataset: livecodebench/code_generation_lite
    Supports Python problems from LeetCode, AtCoder, and CodeForces.
    """
    name = "livecodebench"

    def __init__(
        self,
        language: Language = "python",
        release_version: str = "all",
        limit: Optional[int] = None,
        platform: Optional[str] = None,
    ):
        """
        Initialize LiveCodeBench provider.

        Arguments:
            language: Programming language (currently only "python" supported)
            release_version: Version to load ("release_v1", "release_v2", "all")
            limit: Maximum number of problems to load
            platform: Filter by platform ("leetcode", "codeforces", "atcoder")

        Raises:
            NotImplementedError: If a language other than "python" is requested.
        """
        self.language = language
        self.release_version = release_version
        self.limit = limit
        self.platform = platform

        if language != "python":
            raise NotImplementedError(
                f"LiveCodeBench currently only supports Python. Got: {language}"
            )

    def iter_tasks(self, split: str = "test") -> Iterable[CodingTask]:
        """
        Iterate over LiveCodeBench coding tasks.

        Arguments:
            split: Dataset split (only "test" is available for LiveCodeBench)

        Yields:
            CodingTask objects with solution file, test file, and options
        """
        from datasets import load_dataset

        # Load dataset from HuggingFace
        dataset = load_dataset("livecodebench/code_generation_lite", split=split)

        # Filter by version (date range); "all" keeps everything.
        if self.release_version == "release_v1":
            dataset = dataset.filter(
                lambda x: "2023-05-01" <= x["contest_date"] <= "2023-10-31"
            )
        elif self.release_version == "release_v2":
            dataset = dataset.filter(
                lambda x: "2023-11-01" <= x["contest_date"] <= "2024-04-30"
            )

        # Filter by platform if specified
        if self.platform:
            platform_lower = self.platform.lower()
            dataset = dataset.filter(
                lambda x: x["platform"].lower() == platform_lower
            )

        # Apply limit
        if self.limit:
            dataset = dataset.select(range(min(self.limit, len(dataset))))

        # Convert each problem to a CodingTask; unconvertible problems yield None
        # and are skipped.
        for idx, problem in enumerate(dataset):
            task = self._problem_to_task(problem, idx)
            if task:
                yield task

    def _problem_to_task(self, problem: dict, idx: int) -> Optional[CodingTask]:
        """
        Convert a LiveCodeBench problem to a CodingTask.

        Arguments:
            problem: Problem dictionary from HuggingFace dataset
            idx: Problem index

        Returns:
            CodingTask or None if conversion fails
        """
        try:
            platform = problem["platform"].lower()
            question_id = problem["question_id"]

            # Parse test cases (stored as a JSON-encoded string in the dataset)
            public_tests = json.loads(problem["public_test_cases"])
            if not public_tests:
                return None

            # Determine test type and generate appropriate test file
            test_type = public_tests[0].get("testtype", "stdin")
            if test_type == "functional":
                # LeetCode-style: function calls with arguments
                test_file = self._generate_functional_test(problem, public_tests)
            else:
                # stdin: CodeForces/AtCoder style
                test_file = self._generate_stdin_test(problem, public_tests)

            if not test_file:
                return None

            # Generate solution file template
            solution_file = self._generate_solution_template(problem)

            files = {
                "solution.py": solution_file,
                "tests.py": test_file,
            }
            options = {
                "problem_id": question_id,
                "platform": platform,
                "difficulty": problem.get("difficulty", "unknown"),
            }
            return CodingTask(
                language=self.language,
                files=files,
                options=options,
            )
        except Exception as e:
            # Deliberate best-effort: skip problematic problems rather than
            # aborting the whole iteration.
            import logging
            logging.warning(f"Failed to convert problem {idx}: {e}")
            return None

    def _generate_solution_template(self, problem: dict) -> str:
        """
        Generate a solution template from starter code or problem description.

        Arguments:
            problem: Problem dictionary

        Returns:
            Python solution template as string
        """
        starter_code = problem.get("starter_code", "").strip()

        if starter_code:
            # Use provided starter code
            return starter_code
        # Generate minimal template for stdin problems
        return """# Read input and solve the problem
import sys

def solve():
    # Read input from stdin
    lines = sys.stdin.read().strip().split('\\n')

    # TODO: Implement solution
    pass

if __name__ == "__main__":
    solve()
"""

    def _generate_functional_test(self, problem: dict, test_cases: list) -> str:
        """
        Generate test file for LeetCode-style functional tests.

        Arguments:
            problem: Problem dictionary
            test_cases: List of test case dictionaries

        Returns:
            Python test file content ("" if starter code is unusable)
        """
        starter_code = problem.get("starter_code", "").strip()
        if not starter_code:
            return ""

        # Extract class and method name from starter code
        import re
        class_match = re.search(r"class\s+(\w+)", starter_code)
        method_match = re.search(r"def\s+(\w+)\s*\(", starter_code)
        if not class_match or not method_match:
            return ""

        class_name = class_match.group(1)
        method_name = method_match.group(1)

        # Generate test file
        test_code = f"""from solution import {class_name}

def test_functional():
    solution = {class_name}()

"""

        # Hoisted out of the loop; narrow exceptions so Ctrl-C / SystemExit
        # are not swallowed by the literal-parsing fallbacks below.
        import ast
        for i, test in enumerate(test_cases):
            input_str = test.get("input", "")
            expected_output = test.get("output", "")

            # Parse input (typically a Python literal; first element may be the
            # actual argument list).
            try:
                parsed = ast.literal_eval(input_str)
                if isinstance(parsed, list) and len(parsed) == 1 and isinstance(parsed[0], list):
                    args = [parsed[0]]
                else:
                    args = [parsed]
            except (ValueError, SyntaxError, TypeError):
                # Fallback: use raw string
                args = [input_str]

            # Parse expected output
            try:
                expected = ast.literal_eval(expected_output)
            except (ValueError, SyntaxError, TypeError):
                expected = expected_output

            # Generate assertion
            args_str = ", ".join(repr(arg) for arg in args)
            test_code += f"    # Test case {i + 1}\n"
            test_code += f"    result = solution.{method_name}({args_str})\n"
            test_code += f"    assert result == {repr(expected)}, f\"Test {i + 1} failed: {{result}} != {repr(expected)}\"\n\n"

        test_code += "if __name__ == '__main__':\n"
        test_code += "    test_functional()\n"
        test_code += "    print('All tests passed!')\n"

        return test_code

    def _generate_stdin_test(self, problem: dict, test_cases: list) -> str:
        """
        Generate test file for stdin-based tests (CodeForces/AtCoder style).

        Arguments:
            problem: Problem dictionary
            test_cases: List of test case dictionaries

        Returns:
            Python test file content
        """
        # For stdin tests, we run the solution and compare output
        test_code = """import subprocess
import sys

def test_stdin():
    test_cases = [
"""

        for i, test in enumerate(test_cases):
            input_data = test.get("input", "")
            expected_output = test.get("output", "")
            test_code += f"        # Test case {i + 1}\n"
            test_code += f"        ({repr(input_data)}, {repr(expected_output)}),\n"

        test_code += """    ]

    for i, (input_data, expected_output) in enumerate(test_cases):
        # Run solution with input
        proc = subprocess.run(
            [sys.executable, "solution.py"],
            input=input_data,
            capture_output=True,
            text=True,
            timeout=5
        )

        actual_output = proc.stdout.strip()
        expected_output = expected_output.strip()

        assert actual_output == expected_output, (
            f"Test case {i + 1} failed:\\n"
            f"  Input: {input_data[:100]}\\n"
            f"  Expected: {expected_output[:200]}\\n"
            f"  Got: {actual_output[:200]}"
        )

    print(f'All {len(test_cases)} test(s) passed!')

if __name__ == '__main__':
    test_stdin()
"""

        return test_code
@@ -1,10 +1,10 @@
1
1
  from __future__ import annotations
2
2
  import json, os, subprocess, tempfile
3
3
  from typing import TYPE_CHECKING
4
- from wisent.benchmarks.coding.safe_docker.core.atoms import Result, SandboxExecutor
4
+ from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Result, SandboxExecutor
5
5
 
6
6
  if TYPE_CHECKING:
7
- from wisent.benchmarks.coding.safe_docker.core.atoms import Job
7
+ from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Job
8
8
 
9
9
  __all__ = ["DockerSandboxExecutor"]
10
10
 
@@ -31,6 +31,38 @@ class DockerSandboxExecutor(SandboxExecutor):
31
31
def __init__(self, image: str = DEFAULT_IMAGE, runtime: str | None = None):
    """Create an executor bound to *image*, failing fast if Docker is down.

    Arguments:
        image: Docker image used for sandboxed runs.
        runtime: Optional alternative container runtime name.
    """
    # Record configuration first, then verify the daemon is reachable so
    # misconfiguration surfaces at construction time rather than on run().
    self.image, self.runtime = image, runtime
    self._check_docker_available()
35
+
36
def _check_docker_available(self) -> None:
    """
    Check if Docker daemon is running and accessible.

    Runs ``docker info`` with a 5-second timeout and inspects the exit
    status; every failure mode (daemon down, CLI missing, daemon hung)
    is surfaced as a RuntimeError with an actionable message.

    Raises:
        RuntimeError: If Docker is not available or not running.
    """
    try:
        result = subprocess.run(
            ["docker", "info"],
            capture_output=True,
            text=True,
            timeout=5,
        )
        # Non-zero exit: the CLI exists but the daemon refused/failed
        # (not running, or the user lacks permission on the socket).
        if result.returncode != 0:
            raise RuntimeError(
                "Docker daemon is not running. Please start Docker and try again.\n"
                f"Error: {result.stderr}"
            )
    except FileNotFoundError as exc:
        # The docker CLI itself is absent from PATH.
        # Chain the cause (PEP 3134 / bugbear B904) so the original
        # traceback is preserved for debugging.
        raise RuntimeError(
            "Docker command not found. Please install Docker:\n"
            " - macOS: https://docs.docker.com/desktop/install/mac-install/\n"
            " - Linux: https://docs.docker.com/engine/install/\n"
            " - Windows: https://docs.docker.com/desktop/install/windows-install/"
        ) from exc
    except subprocess.TimeoutExpired as exc:
        raise RuntimeError(
            "Docker command timed out. Docker daemon may be unresponsive."
        ) from exc
34
66
 
35
67
  def run(self, files: dict[str, str], job: Job) -> Result:
36
68
  """
@@ -49,8 +81,8 @@ class DockerSandboxExecutor(SandboxExecutor):
49
81
  A Result object with the outcome of the execution.
50
82
 
51
83
  example (python add function)
52
- >>> from wisent.benchmarks.coding.safe_docker.core.atoms import Job, Result
53
- >>> from wisent.benchmarks.coding.safe_docker.core.runtime import DockerSandboxExecutor
84
+ >>> from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Job, Result
85
+ >>> from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.runtime import DockerSandboxExecutor
54
86
  >>> job = Job(
55
87
  ... language="python",
56
88
  ... compile_argv=None,
@@ -1,12 +1,10 @@
1
1
  from __future__ import annotations
2
2
  import json, os, shutil, subprocess, sys, time, signal, resource
3
3
 
4
- from wisent.benchmarks.coding.safe_docker.core.atoms import Job
5
-
6
4
  JOB_FILE = "/job/job.json"
7
5
  WORKDIR = "/work"
8
6
 
9
- def set_limits(job: Job):
7
+ def set_limits(job):
10
8
  """
11
9
  Set resource limits for the sandboxed process.
12
10
 
@@ -25,7 +23,7 @@ def set_limits(job: Job):
25
23
  resource.setrlimit(resource.RLIMIT_CORE,(0,0))
26
24
  os.setsid()
27
25
 
28
- def run(argv: list[str], job: Job) -> tuple[int,str,str,float,str]:
26
+ def run(argv: list[str], job) -> tuple[int,str,str,float,str]:
29
27
  """
30
28
  Run a command in a subprocess with resource limits.
31
29
 
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
  from typing import Dict
3
- from wisent.benchmarks.coding.safe_docker.core.atoms import Job, LanguageRecipe
3
+ from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Job, LanguageRecipe
4
4
 
5
5
  class PythonRecipe(LanguageRecipe):
6
6
  """