PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl - Mend

wisent 0.7.379__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1720) hide show

wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py ADDED Viewed

@@ -0,0 +1,305 @@
+# coding/providers/livecodebench/provider.py
+from __future__ import annotations
+import json
+from typing import Iterable, Optional
+from ..core.atoms import CodingTask, Language
+class LiveCodeBenchProvider:
+    """
+    LiveCodeBench provider: loads real coding problems from HuggingFace.
+    Dataset: livecodebench/code_generation_lite
+    Supports Python problems from LeetCode, AtCoder, and CodeForces.
+    """
+    name = "livecodebench"
+    def __init__(
+        self,
+        language: Language = "python",
+        release_version: str = "all",
+        limit: Optional[int] = None,
+        platform: Optional[str] = None,
+    ):
+        """
+        Initialize LiveCodeBench provider.
+        Arguments:
+            language: Programming language (currently only "python" supported)
+            release_version: Version to load ("release_v1", "release_v2", "all")
+            limit: Maximum number of problems to load
+            platform: Filter by platform ("leetcode", "codeforces", "atcoder")
+        """
+        self.language = language
+        self.release_version = release_version
+        self.limit = limit
+        self.platform = platform
+        if language != "python":
+            raise NotImplementedError(
+                f"LiveCodeBench currently only supports Python. Got: {language}"
+            )
+    def iter_tasks(self, split: str = "test") -> Iterable[CodingTask]:
+        """
+        Iterate over LiveCodeBench coding tasks.
+        Arguments:
+            split: Dataset split (only "test" is available for LiveCodeBench)
+        Yields:
+            CodingTask objects with solution file, test file, and options
+        """
+        from datasets import load_dataset
+        # Load dataset from HuggingFace
+        dataset = load_dataset("livecodebench/code_generation_lite", split=split)
+        # Filter by version (date range)
+        if self.release_version == "release_v1":
+            dataset = dataset.filter(
+                lambda x: x["contest_date"] >= "2023-05-01" and x["contest_date"] <= "2023-10-31"
+            )
+        elif self.release_version == "release_v2":
+            dataset = dataset.filter(
+                lambda x: x["contest_date"] >= "2023-11-01" and x["contest_date"] <= "2024-04-30"
+            )
+        # Filter by platform if specified
+        if self.platform:
+            platform_lower = self.platform.lower()
+            dataset = dataset.filter(
+                lambda x: x["platform"].lower() == platform_lower
+            )
+        # Apply limit
+        if self.limit:
+            dataset = dataset.select(range(min(self.limit, len(dataset))))
+        # Convert each problem to a CodingTask
+        for idx, problem in enumerate(dataset):
+            task = self._problem_to_task(problem, idx)
+            if task:
+                yield task
+    def _problem_to_task(self, problem: dict, idx: int) -> Optional[CodingTask]:
+        """
+        Convert a LiveCodeBench problem to a CodingTask.
+        Arguments:
+            problem: Problem dictionary from HuggingFace dataset
+            idx: Problem index
+        Returns:
+            CodingTask or None if conversion fails
+        """
+        try:
+            platform = problem["platform"].lower()
+            question_id = problem["question_id"]
+            # Parse test cases
+            public_tests = json.loads(problem["public_test_cases"])
+            if not public_tests:
+                return None
+            # Determine test type and generate appropriate test file
+            test_type = public_tests[0].get("testtype", "stdin")
+            if test_type == "functional":
+                # LeetCode-style: function calls with arguments
+                test_file = self._generate_functional_test(problem, public_tests)
+            else:
+                # stdin: CodeForces/AtCoder style
+                test_file = self._generate_stdin_test(problem, public_tests)
+            if not test_file:
+                return None
+            # Generate solution file template
+            solution_file = self._generate_solution_template(problem)
+            files = {
+                "solution.py": solution_file,
+                "tests.py": test_file,
+            }
+            options = {
+                "problem_id": question_id,
+                "platform": platform,
+                "difficulty": problem.get("difficulty", "unknown"),
+            }
+            return CodingTask(
+                language=self.language,
+                files=files,
+                options=options,
+            )
+        except Exception as e:
+            # Skip problematic problems
+            import logging
+            logging.warning(f"Failed to convert problem {idx}: {e}")
+            return None
+    def _generate_solution_template(self, problem: dict) -> str:
+        """
+        Generate a solution template from starter code or problem description.
+        Arguments:
+            problem: Problem dictionary
+        Returns:
+            Python solution template as string
+        """
+        starter_code = problem.get("starter_code", "").strip()
+        if starter_code:
+            # Use provided starter code
+            return starter_code
+        else:
+            # Generate minimal template for stdin problems
+            return """# Read input and solve the problem
+import sys
+def solve():
+    # Read input from stdin
+    lines = sys.stdin.read().strip().split('\\n')
+    # TODO: Implement solution
+    pass
+if __name__ == "__main__":
+    solve()
+"""
+    def _generate_functional_test(self, problem: dict, test_cases: list) -> str:
+        """
+        Generate test file for LeetCode-style functional tests.
+        Arguments:
+            problem: Problem dictionary
+            test_cases: List of test case dictionaries
+        Returns:
+            Python test file content
+        """
+        starter_code = problem.get("starter_code", "").strip()
+        if not starter_code:
+            return ""
+        # Extract class and method name from starter code
+        import re
+        class_match = re.search(r"class\s+(\w+)", starter_code)
+        method_match = re.search(r"def\s+(\w+)\s*\(", starter_code)
+        if not class_match or not method_match:
+            return ""
+        class_name = class_match.group(1)
+        method_name = method_match.group(1)
+        # Generate test file
+        test_code = f"""from solution import {class_name}
+def test_functional():
+    solution = {class_name}()
+"""
+        for i, test in enumerate(test_cases):
+            input_str = test.get("input", "")
+            expected_output = test.get("output", "")
+            # Parse input (typically JSON array where first element is the actual argument)
+            try:
+                # Try to evaluate as Python literal
+                import ast
+                parsed = ast.literal_eval(input_str)
+                # If it's a list with one element that's also a list, use that inner list
+                if isinstance(parsed, list) and len(parsed) == 1 and isinstance(parsed[0], list):
+                    args = [parsed[0]]
+                elif isinstance(parsed, list):
+                    args = [parsed]
+                else:
+                    args = [parsed]
+            except:
+                # Fallback: use raw string
+                args = [input_str]
+            # Parse expected output
+            try:
+                import ast
+                expected = ast.literal_eval(expected_output)
+            except:
+                expected = expected_output
+            # Generate assertion
+            args_str = ", ".join(repr(arg) for arg in args)
+            test_code += f"    # Test case {i + 1}\n"
+            test_code += f"    result = solution.{method_name}({args_str})\n"
+            test_code += f"    assert result == {repr(expected)}, f\"Test {i + 1} failed: {{result}} != {repr(expected)}\"\n\n"
+        test_code += "if __name__ == '__main__':\n"
+        test_code += "    test_functional()\n"
+        test_code += "    print('All tests passed!')\n"
+        return test_code
+    def _generate_stdin_test(self, problem: dict, test_cases: list) -> str:
+        """
+        Generate test file for stdin-based tests (CodeForces/AtCoder style).
+        Arguments:
+            problem: Problem dictionary
+            test_cases: List of test case dictionaries
+        Returns:
+            Python test file content
+        """
+        # For stdin tests, we run the solution and compare output
+        test_code = """import subprocess
+import sys
+def test_stdin():
+    test_cases = [
+"""
+        for i, test in enumerate(test_cases):
+            input_data = test.get("input", "")
+            expected_output = test.get("output", "")
+            test_code += f"        # Test case {i + 1}\n"
+            test_code += f"        ({repr(input_data)}, {repr(expected_output)}),\n"
+        test_code += """    ]
+    for i, (input_data, expected_output) in enumerate(test_cases):
+        # Run solution with input
+        proc = subprocess.run(
+            [sys.executable, "solution.py"],
+            input=input_data,
+            capture_output=True,
+            text=True,
+            timeout=5
+        )
+        actual_output = proc.stdout.strip()
+        expected_output = expected_output.strip()
+        assert actual_output == expected_output, (
+            f"Test case {i + 1} failed:\\n"
+            f"  Input: {input_data[:100]}\\n"
+            f"  Expected: {expected_output[:200]}\\n"
+            f"  Got: {actual_output[:200]}"
+        )
+    print(f'All {len(test_cases)} test(s) passed!')
+if __name__ == '__main__':
+    test_stdin()
+"""
+        return test_code

wisent/core/evaluators/benchmark_specific/coding/safe_docker/Dockerfile ADDED Viewed

@@ -0,0 +1,31 @@
+# coding/safe_docker/Dockerfile
+# Sandbox image: Python 3.12 plus C++ and Java toolchains, with every
+# container process running as an unprivileged user under tini.
+FROM python:3.12-slim
+# ---- base toolchain for Python/C++/Java + tini ----
+# The apt package lists are removed in the same layer to keep the image small.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates build-essential g++ openjdk-21-jdk-headless tini bash time \
+ && rm -rf /var/lib/apt/lists/*
+# pytest for Python harnesses (pinned so harness behavior is reproducible)
+RUN pip install --no-cache-dir pytest==8.3.3
+# Python env niceties: write no .pyc files, and keep stdout/stderr unbuffered
+ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1
+# ---- create unprivileged user (will run everything after setup) ----
+RUN useradd -m -u 10001 sandbox
+# Prepare writable dirs WHILE STILL ROOT, and give them to 'sandbox'
+# /runner : code runner home
+# /work   : ephemeral workspace (you'll mount tmpfs here at runtime)
+# Mode 1777 on /work = world-writable with the sticky bit, like /tmp.
+RUN install -d -m 0755 -o sandbox -g sandbox /runner \
+ && install -d -m 1777 -o sandbox -g sandbox /work
+# Drop privileges for all subsequent instructions
+USER sandbox
+WORKDIR /runner
+# Copy runner entrypoint with correct ownership
+COPY --chown=sandbox:sandbox entrypoint.py /runner/entrypoint.py
+# tini is the container's init process and launches the Python runner.
+ENTRYPOINT ["/usr/bin/tini","--","python","/runner/entrypoint.py"]

wisent/core/evaluators/benchmark_specific/coding/safe_docker/__init__.py ADDED Viewed

File without changes

wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/__init__.py ADDED Viewed

File without changes

wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/atoms.py ADDED Viewed

@@ -0,0 +1,105 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Protocol, runtime_checkable
+__all__ = ["Job", "Result", "LanguageRecipe", "SandboxExecutor"]
+@dataclass(frozen=True)
+class Job:
+    """How to build + run a submission inside a sandbox.
+    Frozen dataclass: fields cannot be reassigned after construction.
+    attributes:
+        language:
+            Programming language, e.g. "python", "cpp", "java".
+        compile_argv:
+            If not None, argv to compile the code (e.g. ["g++", "-o", "program", "solution.cpp"]).
+            If None, no compilation step is done.
+        run_argv:
+            argv to run the code (e.g. ["./program"] or ["python3", "solution.py"]).
+        cpu_limit_s:
+            CPU time limit in seconds (default 3).
+        wall_timeout_s:
+            Wall clock timeout in seconds (default 8).
+        mem_limit_mb:
+            Memory limit in megabytes (default 4096).
+        fsize_mb:
+            Max file size in megabytes (default 16).
+        nproc:
+            Max number of processes/threads (default 128).
+        nofile:
+            Max number of open files (default 512).
+    example:
+        >>> job = Job(
+        ...     language="python",
+        ...     compile_argv=None,
+        ...     run_argv=["python3", "solution.py"],
+        ...     cpu_limit_s=3,
+        ...     wall_timeout_s=8,
+        ...     mem_limit_mb=4096,
+        ...     fsize_mb=16,
+        ...     nproc=128,
+        ...     nofile=512,
+        ... )
+    """
+    # Required fields come first; every resource limit has a default.
+    language: str
+    compile_argv: list[str] | None
+    run_argv: list[str]
+    cpu_limit_s: int = 3
+    wall_timeout_s: int = 8
+    mem_limit_mb: int = 4096
+    fsize_mb: int = 16
+    nproc: int = 128
+    nofile: int = 512
+@dataclass(frozen=True)
+class Result:
+    """
+    Result of running a Job inside a sandbox.
+    Frozen dataclass: fields cannot be reassigned after construction.
+    attributes:
+        status:
+            One of "ok", "compile_error", "runtime_error", "timeout".
+        exit_code:
+            Exit code of the program (or compiler), or -1 if killed by timeout or OOM.
+        stdout:
+            Captured standard output (max 32k chars).
+        stderr:
+            Captured standard error (max 32k chars).
+        elapsed:
+            Wall clock time elapsed in seconds (float).
+    example:
+        >>> res = Result(
+        ...     status="ok",
+        ...     exit_code=0,
+        ...     stdout="Hello, world!",
+        ...     stderr="",
+        ...     elapsed=1.23,
+        ... )
+    """
+    # All fields are required; there are no defaults.
+    status: str
+    exit_code: int
+    stdout: str
+    stderr: str
+    elapsed: float
+@runtime_checkable
+class LanguageRecipe(Protocol):
+    """
+    Knows how to create a Job for a given language and set of files.
+    Structural protocol: any object with a `language` attribute and a
+    `make_job` method conforms; no inheritance is required.
+    attributes:
+        language:
+            The programming language this recipe supports, e.g. "python", "cpp", "java".
+    """
+    language: str
+    # Keyword options are recipe-specific; the protocol does not constrain them.
+    def make_job(self, **options) -> Job: ...
+@runtime_checkable
+class SandboxExecutor(Protocol):
+    """
+    Executes a Job inside a sandbox and reports the outcome as a Result.
+    Structural protocol: any object with a compatible `run` method conforms.
+    """
+    # `files` maps a file name to its text content.
+    def run(self, files: dict[str, str], job: Job) -> Result: ...

wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/runtime.py ADDED Viewed

@@ -0,0 +1,143 @@
+from __future__ import annotations
+import json, os, subprocess, tempfile
+from typing import TYPE_CHECKING
+from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Result, SandboxExecutor
+from wisent.core.errors import DockerRuntimeError
+if TYPE_CHECKING:
+    from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Job
+__all__ = ["DockerSandboxExecutor"]
+# Image tag used when the caller does not pass one to DockerSandboxExecutor.
+DEFAULT_IMAGE = "coding/sandbox:polyglot-1.0"
+# Hardening flags applied to every `docker run`: remove the container on exit,
+# no network, at most 256 processes, read-only root filesystem, no Linux
+# capabilities, and no privilege escalation.
+SAFE_FLAGS = [
+    "--rm", "--network=none",
+    "--pids-limit=256",
+    "--read-only",
+    "--cap-drop=ALL",
+    "--security-opt=no-new-privileges",
+]
+# The root filesystem is read-only, so writable scratch space comes from
+# tmpfs mounts: /tmp is 128 MiB (134217728 bytes) and /work is 256 MiB
+# (268435456 bytes). Both are mounted with exec and mode 1777.
+TMPFS_FLAGS = [
+    "--tmpfs", "/tmp:exec,mode=1777,size=134217728",
+    "--tmpfs", "/work:exec,mode=1777,size=268435456",
+]
+class DockerSandboxExecutor(SandboxExecutor):
+    """
+    Executes a Job inside a Docker container, given a read-only job dir of files.
+    """
+    def __init__(self, image: str = DEFAULT_IMAGE, runtime: str | None = None):
+        self.image = image
+        self.runtime = runtime
+        # Skip Docker health check if environment variable is set (useful for slow Docker Desktop on macOS)
+        if not os.environ.get('SKIP_DOCKER_HEALTH_CHECK', '').lower() == 'true':
+            self._check_docker_available()
+    def _check_docker_available(self) -> None:
+        """
+        Check if Docker daemon is running and accessible.
+        Raises:
+            RuntimeError: If Docker is not available or not running.
+        """
+        try:
+            result = subprocess.run(
+                ["docker", "info"],
+                capture_output=True,
+                text=True,
+                timeout=300
+            )
+            if result.returncode != 0:
+                raise DockerRuntimeError(reason=f"Docker daemon is not running: {result.stderr}")
+        except FileNotFoundError:
+            raise DockerRuntimeError(reason="Docker command not found. Please install Docker.")
+        except subprocess.TimeoutExpired:
+            raise DockerRuntimeError(reason="Docker command timed out. Docker daemon may be unresponsive.")
+    def run(self, files: dict[str, str], job: Job) -> Result:
+        """
+        Runs a Job inside a Docker container, given a read-only job dir of files.
+        arguments:
+            files:
+                A mapping of filename to file content, representing the job directory.
+            job:
+                The Job to execute.
+        exceptions:
+            Raises subprocess.CalledProcessError if the `docker` command itself fails.
+        returns:
+            A Result object with the outcome of the execution.
+        example (pythonm add function)
+        >>> from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Job, Result
+        >>> from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.runtime import DockerSandboxExecutor
+        >>> job = Job(
+        ...     language="python",
+        ...     compile_argv=None,
+        ...     run_argv=["python3", "/job/tests.py"],
+        ...     cpu_limit_s=2,
+        ...     wall_timeout_s=5,
+        ...     mem_limit_mb=256,
+        ... )
+        >>> files = {
+        ...     "solution.py": "def add(a,b): return a + b",
+        ...     "tests.py": "from solution import add\ndef test_ok(): assert add(1,2)==3",
+        ... }
+        >>> res: Result = DockerSandboxExecutor().run(files, job)
+        >>> res.status
+        'ok'
+        >>> res.exit_code
+        0
+        >>> res.stdout
+        'test_ok passed'
+        >>> res.stderr
+        ''
+        >>> round(res.elapsed, 2)
+        0.23
+        """
+        with tempfile.TemporaryDirectory() as tmp:
+            job_dir = os.path.join(tmp, "job")
+            os.makedirs(job_dir, exist_ok=True)
+            for name, content in files.items():
+                with open(os.path.join(job_dir, name), "w", encoding="utf-8") as f:
+                    f.write(content)
+            with open(os.path.join(job_dir, "job.json"), "w", encoding="utf-8") as f:
+                json.dump({
+                    "language": job.language,
+                    "compile": {"argv": job.compile_argv} if job.compile_argv else None,
+                    "run": {"argv": job.run_argv},
+                    "cpu_limit_s": job.cpu_limit_s,
+                    "wall_timeout_s": job.wall_timeout_s,
+                    "mem_limit_mb": job.mem_limit_mb,
+                    "fsize_mb": job.fsize_mb,
+                    "nproc": job.nproc,
+                    "nofile": job.nofile,
+                }, f)
+            base = ["docker"]
+            if self.runtime:
+                base += ["--runtime", self.runtime]
+            cmd = base + ["run", "-i", *SAFE_FLAGS, *TMPFS_FLAGS, "-v", f"{job_dir}:/job:ro", self.image]
+            p = subprocess.run(cmd, check=False, capture_output=True, text=True)
+            out = (p.stdout or "").strip()
+            try:
+                payload = json.loads(out)
+            except json.JSONDecodeError:
+                return Result(
+                    status="runtime_error",
+                    exit_code=p.returncode,
+                    stdout=p.stdout or "",
+                    stderr=p.stderr or "Failed to parse executor output as JSON.",
+                    elapsed=0.0,
+                )
+            return Result(
+                status=payload.get("status","runtime_error"),
+                exit_code=int(payload.get("exit_code", p.returncode)),
+                stdout=payload.get("stdout",""),
+                stderr=payload.get("stderr",""),
+                elapsed=float(payload.get("elapsed",0.0)),
+            )