PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl - Mend

wisent 0.7.379__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1720) hide show

wisent/core/evaluators/benchmark_specific/coding/metrics/passk.py ADDED Viewed

@@ -0,0 +1,67 @@
+# coding/metrics/passk.py
+from __future__ import annotations
+from typing import Iterable
+import math
+from collections import defaultdict
+from .core.atoms import SampleOutcome, Metric
+from wisent.core.errors import InvalidRangeError
+class PassAtK(Metric):
+    """
+    Exact Pass@k metric for code generation, averaged over tasks.
+    """
+    def __init__(self, k: int = 1):
+        # k is the number of samples notionally drawn per task; it must be >= 1.
+        if k < 1:
+            raise InvalidRangeError(param_name="k", actual=k, min_val=1)
+        self.k = k
+    def _task_score(self, n: int, c: int) -> float:
+        """
+        Pass@k for one task with n samples of which c passed.
+        k is clamped to n, so a task with fewer than k samples is scored
+        as if all of its samples were drawn.
+        """
+        k = min(self.k, n)
+        if c <= 0 or k == 0:
+            return 0.0
+        if k == 1:
+            # Kept as a separate branch: c / n and 1 - (n - c) / n can differ
+            # in the last floating-point digit.
+            return c / n
+        # math.comb(n - c, k) is 0 when k > n - c, i.e. every draw of k
+        # samples must contain at least one passing sample.
+        return 1.0 - math.comb(n - c, k) / math.comb(n, k)
+    def compute(self, outcomes: Iterable[SampleOutcome]) -> float:
+        """
+        Group outcomes by task_id and average the per-task Pass@k.
+        arguments:
+            outcomes: Iterable of SampleOutcome objects; each one contributes
+                one sample to the task named by its task_id, and counts as a
+                pass when its passed attribute is truthy.
+        returns:
+            Mean Pass@k over all tasks seen, or 0.0 when there are no outcomes.
+        intuition:
+            With n samples for a task, c of them passing, the chance that k
+            samples picked without replacement contain at least one pass is
+            1 - C(n - c, k) / C(n, k).
+        """
+        totals: dict = {}
+        passes: dict = {}
+        for outcome in outcomes:
+            tid = outcome.task_id
+            totals[tid] = totals.get(tid, 0) + 1
+            passes[tid] = passes.get(tid, 0) + (1 if outcome.passed else 0)
+        if not totals:
+            return 0.0
+        # Accumulate by hand, in first-seen task order, to keep the exact
+        # floating-point summation order.
+        running = 0.0
+        for tid, n in totals.items():
+            running += self._task_score(n, passes[tid])
+        return running / len(totals)

wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/__init__.py ADDED Viewed

File without changes

wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/core/__init__.py ADDED Viewed

File without changes

wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/core/atoms.py ADDED Viewed

@@ -0,0 +1,27 @@
+# coding/output_sanitizer/core/atoms.py
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Dict, Protocol, Literal, Optional
+# Closed set of language tags used by TaskSchema.language and CodeStandardizer.language.
+Language = Literal["python", "cpp", "java"]
+@dataclass(frozen=True)
+class TaskSchema:
+    """
+    What the sandbox expects for this task.
+    Frozen dataclass: fields cannot be reassigned after construction.
+    """
+    language: Language
+    file_name: str               # e.g., "solution.py" | "solution.cpp" | "Solution.java"
+    entry_point: str             # function/method name tests will call (e.g., "add", "solve")
+    java_class: str = "Solution" # only for Java; expected public class name
+    # Optional hints:
+    allow_wrapper: bool = True   # may synthesize thin wrapper instead of renaming
+    prefer_rename: bool = False  # if True and safe, rename single top-level function to entry_point
+@dataclass(frozen=True)
+class NormalizeResult:
+    """Outcome of a CodeStandardizer.normalize call (frozen dataclass)."""
+    files: Dict[str, str]        # filename -> normalized source
+    notes: str                   # human-readable log of what was done
+    ok: bool                     # True if we think it’s valid / parseable
+class CodeStandardizer(Protocol):
+    """
+    Structural interface for per-language standardizers: a ``language`` tag
+    plus a ``normalize`` method turning raw text into a NormalizeResult for
+    the given TaskSchema.
+    """
+    language: Language
+    def normalize(self, raw: str, schema: TaskSchema) -> NormalizeResult: ...

wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/cpp_sanitizer.py ADDED Viewed

@@ -0,0 +1,62 @@
+from __future__ import annotations
+import re
+from typing import List
+from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
+from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace
+# Heuristic for a function header at the start of a line: an optional
+# ``template<...>`` prefix, a run of return-type-like characters, then the
+# function name (group 1) followed by "(".
+FUNC_RE = re.compile(r"^\s*(?:template<[^>]+>\s*)?(?:[\w:\s*&<>,]+)\s+(\w+)\s*\(", re.MULTILINE)
+# ``class Name`` at the start of a line, followed by "{" or ":"; group 1 is the class name.
+CLASS_RE = re.compile(r"^\s*class\s+(\w+)\s*[{:]", re.MULTILINE)
+class CppStandardizer(CodeStandardizer):
+    """
+    Normalize raw model output into a C++ source exposing ``schema.entry_point``.
+    Strategies, tried in order:
+      1. the entry point already appears as an unqualified top-level function;
+      2. it is a member of a class -> append a forwarding free-function shim;
+      3. it appears anywhere else in the source -> accept the source as-is;
+      4. rename the single detected function (only if ``schema.prefer_rename``);
+      5. append a forwarding wrapper to the first detected function;
+      6. give up and return the cleaned source with ``ok=False``.
+    All detection is regex/brace-count based and therefore heuristic.
+    """
+    language = "cpp"
+    @staticmethod
+    def _forwarding_shim(entry_point: str, callee: str) -> str:
+        """Return a variadic template ``entry_point`` that perfectly forwards its arguments to ``callee``."""
+        return (
+            f"\n\ntemplate <typename... Args>\n"
+            f"auto {entry_point}(Args&&... args)\n"
+            f"    -> decltype({callee}(std::forward<Args>(args)...)) {{\n"
+            f"    return {callee}(std::forward<Args>(args)...);\n"
+            f"}}\n"
+        )
+    @staticmethod
+    def _with_utility_include(code: str) -> str:
+        """Prepend ``#include <utility>`` (provides std::forward) unless the source already has it."""
+        if "#include <utility>" in code:
+            return code
+        return "#include <utility>\n" + code
+    @staticmethod
+    def _has_top_level_function(code: str, name: str) -> bool:
+        """
+        Report whether ``name(`` occurs outside any braces and without a
+        ``::``, ``.`` or ``->`` qualifier directly before it.
+        Braces are counted textually (string literals and comments are not
+        skipped), so this is only a heuristic.
+        """
+        for m in re.finditer(rf"\b{re.escape(name)}\s*\(", code):
+            before = code[:m.start()]
+            if before.count("{") != before.count("}"):
+                continue  # inside a class / function / namespace body
+            if before.rstrip().endswith(("::", ".", "->")):
+                continue  # qualified member access or out-of-line member definition
+            return True
+        return False
+    def normalize(self, raw: str, schema: TaskSchema) -> NormalizeResult:
+        """
+        Extract the C++ code from ``raw`` and make ``schema.entry_point`` callable as a free function.
+        arguments:
+            raw: model output, possibly containing markdown code fences.
+            schema: task description; ``file_name``, ``entry_point`` and ``prefer_rename`` are used.
+        returns:
+            NormalizeResult with one file named ``schema.file_name``; ``ok`` is
+            False only when no function-like definition could be found.
+        """
+        notes: List[str] = []
+        code = normalize_whitespace(extract_code_block(raw, prefer_langs=("cpp","c++","cc","c")))
+        code = re.sub(r"^```.*?\n|\n```$", "", code, flags=re.DOTALL)
+        entry = schema.entry_point
+        def result(source: str, ok: bool) -> NormalizeResult:
+            return NormalizeResult(files={schema.file_name: source}, notes="\n".join(notes), ok=ok)
+        if self._has_top_level_function(code, entry):
+            notes.append(f"found function '{entry}'")
+            return result(code, True)
+        # The class check must run before the generic "appears anywhere" check
+        # below: every member match also contains ``entry(``, so the generic
+        # check would otherwise always win and the shim would never be added.
+        for cls in CLASS_RE.findall(code):
+            qualified = re.search(rf"\b{re.escape(cls)}\s*::\s*{re.escape(entry)}\s*\(", code)
+            after_class = re.search(rf"class\s+{re.escape(cls)}.*?\b{re.escape(entry)}\s*\(", code, flags=re.S)
+            if qualified or after_class:
+                notes.append(f"found {cls}::{schema.entry_point}; adding free-function shim")
+                shim = self._forwarding_shim(entry, f"{cls}().{entry}")
+                return result(self._with_utility_include(code) + shim, True)
+        if re.search(rf"\b{re.escape(entry)}\s*\(", code):
+            # e.g. defined inside a struct or namespace: keep the source untouched.
+            notes.append(f"found function '{entry}'")
+            return result(code, True)
+        candidates = [m.group(1) for m in FUNC_RE.finditer(code)]
+        if schema.prefer_rename and len(candidates) == 1:
+            old = candidates[0]
+            if old != entry:
+                notes.append(f"renaming free function '{old}' -> '{schema.entry_point}'")
+                renamed = re.sub(rf"(\b){re.escape(old)}(\s*\()", rf"\1{schema.entry_point}\2", code)
+                return result(renamed, True)
+        if candidates:
+            target = candidates[0]
+            if target != entry:
+                notes.append(f"adding forwarding wrapper {schema.entry_point} -> {target}")
+                shim = self._forwarding_shim(entry, target)
+                return result(self._with_utility_include(code) + shim, True)
+        notes.append("no obvious function; returned normalized source only")
+        return result(code, False)

wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/java_sanitizer.py ADDED Viewed

@@ -0,0 +1,78 @@
+# coding/output_sanitizer/java_sanitizer.py
+from __future__ import annotations
+import re
+from typing import List
+from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
+from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace
+# First ``class Name`` occurrence anywhere in the text; group 1 is the class name.
+CLASS_RE = re.compile(r"\bclass\s+([A-Za-z_]\w*)")
+# ``public static <type> name(``: group 1 is the modifiers + return type, group 2 the method name.
+METHOD_RE = re.compile(r"(public\s+static\s+[\w\<\>\[\]]+\s+)(\w+)\s*\(")
+class JavaStandardizer(CodeStandardizer):
+    """
+    Normalize raw model output into a Java source whose class is
+    ``schema.java_class`` and which exposes a public static ``schema.entry_point``.
+    Strategies, tried in order after the class has been renamed/synthesised:
+      1. a ``public static`` method with the entry-point name already exists;
+      2. rename the single ``public static`` method (only if ``schema.prefer_rename``);
+      3. the entry-point name is called/defined somewhere -> add a reflective static wrapper;
+      4. otherwise add a reflective fallback that tries every public method.
+    """
+    language = "java"
+    @staticmethod
+    def _insert_member(code: str, java_class: str, member: str) -> tuple[str, bool]:
+        """
+        Insert ``member`` right after the opening brace of ``class java_class``.
+        The header may carry ``extends`` / ``implements`` / generics before the brace.
+        returns:
+            (new source, True) when the class header was found, else (code, False).
+        """
+        header = re.compile(rf"(\bclass\s+{re.escape(java_class)}\b[^{{]*{{)")
+        # A callable replacement keeps ``member`` literal (no group/backslash expansion).
+        patched, hits = header.subn(lambda m: m.group(1) + member, code, count=1)
+        return patched, hits > 0
+    def normalize(self, raw: str, schema: TaskSchema) -> NormalizeResult:
+        """
+        Extract the Java code from ``raw`` and adapt it to ``schema``.
+        arguments:
+            raw: model output, possibly containing markdown code fences.
+            schema: task description; ``file_name``, ``entry_point``,
+                ``java_class`` and ``prefer_rename`` are used.
+        returns:
+            NormalizeResult with one file named ``schema.file_name``. ``ok`` is
+            False when a wrapper was needed but the class header could not be
+            located to insert it.
+        """
+        notes: List[str] = []
+        # NOTE: must be a 1-tuple; a bare "java" string would turn the
+        # language lookup into a substring test that also accepts "".
+        code = normalize_whitespace(extract_code_block(raw, prefer_langs=("java",)))
+        code = re.sub(r"^```.*?\n|\n```$", "", code, flags=re.DOTALL)
+        def result(ok: bool) -> NormalizeResult:
+            return NormalizeResult(files={schema.file_name: code}, notes="\n".join(notes), ok=ok)
+        m = CLASS_RE.search(code)
+        if m:
+            found = m.group(1)
+            if found != schema.java_class:
+                notes.append(f"renaming class '{found}' -> '{schema.java_class}'")
+                # Only the declaration is renamed; other references to the old name are left alone.
+                code = re.sub(rf"\bclass\s+{re.escape(found)}\b", f"class {schema.java_class}", code, count=1)
+        if not CLASS_RE.search(code):
+            notes.append(f"wrapping code in class {schema.java_class}")
+            code = f"public class {schema.java_class} {{\n{indent(code)}\n}}\n"
+        static_methods = list(METHOD_RE.finditer(code))
+        if any(sm.group(2) == schema.entry_point for sm in static_methods):
+            notes.append(f"found public static '{schema.entry_point}'")
+            return result(True)
+        if len(static_methods) == 1 and schema.prefer_rename:
+            old = static_methods[0].group(2)
+            if old != schema.entry_point:
+                notes.append(f"renaming static method '{old}' -> '{schema.entry_point}'")
+                code = re.sub(rf"(\bpublic\s+static\s+[\w\<\>\[\]]+\s+){re.escape(old)}(\s*\()",
+                              rf"\1{schema.entry_point}\2", code, count=1)
+                return result(True)
+        if re.search(rf"\b{re.escape(schema.entry_point)}\s*\(", code):
+            notes.append(f"adding static wrapper for instance method '{schema.entry_point}'")
+            wrapper = (
+                f"\n    public static <T> Object {schema.entry_point}(Object... args) {{\n"
+                f"        {schema.java_class} _x = new {schema.java_class}();\n"
+                f"        try {{\n"
+                f"            // attempt reflective dispatch to instance method\n"
+                f"            Class<?>[] types = new Class<?>[args.length];\n"
+                f"            for (int i=0;i<args.length;i++) types[i] = args[i].getClass();\n"
+                f"            return {schema.java_class}.class.getMethod(\"{schema.entry_point}\", types).invoke(_x, args);\n"
+                f"        }} catch (Exception ex) {{ throw new RuntimeException(ex); }}\n"
+                f"    }}\n"
+            )
+            code, inserted = self._insert_member(code, schema.java_class, wrapper)
+            if not inserted:
+                notes.append(f"could not locate body of class {schema.java_class}; wrapper not inserted")
+            return result(inserted)
+        notes.append("no suitable method; adding delegating static method to first public static or instance method via reflection")
+        fallback = (
+            f"\n    public static Object {schema.entry_point}(Object... args) {{\n"
+            f"        try {{\n"
+            f"            // try any public method first\n"
+            f"            for (var m : {schema.java_class}.class.getMethods()) {{\n"
+            f"                if (m.getName().equals(\"{schema.entry_point}\")) continue;\n"
+            f"                try {{ return m.invoke(m.getParameterCount()==0? new {schema.java_class}(): new {schema.java_class}(), args); }}\n"
+            f"                catch (Exception ignored) {{}}\n"
+            f"            }}\n"
+            f"        }} catch (Exception e) {{ throw new RuntimeException(e); }}\n"
+            f"        throw new RuntimeException(\"No suitable method for entry point\");\n"
+            f"    }}\n"
+        )
+        code, inserted = self._insert_member(code, schema.java_class, fallback)
+        if not inserted:
+            notes.append(f"could not locate body of class {schema.java_class}; fallback not inserted")
+        return result(inserted)
+def indent(s: str, n: int = 4) -> str:
+    """
+    Prefix every non-blank line of ``s`` with ``n`` spaces.
+    Blank or whitespace-only lines are kept untouched, and lines are
+    re-joined with "\\n" (so a trailing newline is not preserved).
+    """
+    prefix = " " * n
+    out = []
+    for line in s.splitlines():
+        if line.strip():
+            out.append(prefix + line)
+        else:
+            out.append(line)
+    return "\n".join(out)

wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/python_sanitizer.py ADDED Viewed

@@ -0,0 +1,94 @@
+# coding/output_sanitizer/python_sanitizer.py
+from __future__ import annotations
+import ast, re
+from typing import List
+from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
+from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace, maybe_black
+class PythonStandardizer(CodeStandardizer):
+    """
+    Normalize raw model output into a Python module exposing ``schema.entry_point``.
+    Strategies, tried in order:
+      1. a top-level function with the entry-point name already exists;
+      2. rename the single top-level function (only if ``schema.prefer_rename``);
+      3. a top-level class has a method with that name -> append a thin adapter;
+      4. rename a conventionally named function (solve/solution/func/function/answer);
+      5. append a dynamic-dispatch adapter (only if ``schema.allow_wrapper``);
+      6. give up and return the cleaned source with ``ok=False``.
+    """
+    language = "python"
+    @staticmethod
+    def _rename_function(tree: ast.Module, code: str, old: str, new: str) -> str:
+        """
+        Return source in which the top-level function ``old`` is named ``new``.
+        Only top-level ``def`` statements are renamed; nested functions and
+        methods that happen to share the name are left alone. When ``old`` is
+        referenced by name anywhere (typically a recursive call), an alias
+        ``old = new`` is emitted right after the definition so those references
+        keep resolving instead of raising NameError.
+        Falls back to a textual ``def`` replacement if the AST cannot be unparsed.
+        """
+        referenced = any(isinstance(n, ast.Name) and n.id == old for n in ast.walk(tree))
+        alias_src = f"{old} = {new}"
+        try:
+            positions = [
+                i for i, node in enumerate(tree.body)
+                if isinstance(node, ast.FunctionDef) and node.name == old
+            ]
+            for i in positions:
+                tree.body[i].name = new
+            if referenced and positions:
+                tree.body.insert(positions[-1] + 1, ast.parse(alias_src).body[0])
+            ast.fix_missing_locations(tree)
+            return ast.unparse(tree)
+        except Exception:
+            # Best-effort textual fallback, kept from the original implementation.
+            patched = code.replace(f"def {old}(", f"def {new}(")
+            if referenced:
+                patched += f"\n\n{alias_src}\n"
+            return patched
+    def normalize(self, raw: str, schema: TaskSchema) -> NormalizeResult:
+        """
+        Extract the Python code from ``raw`` and adapt it to ``schema``.
+        arguments:
+            raw: model output, possibly containing markdown code fences.
+            schema: task description; ``file_name``, ``entry_point``,
+                ``prefer_rename`` and ``allow_wrapper`` are used.
+        returns:
+            NormalizeResult with one file named ``schema.file_name``. ``ok`` is
+            False when the code does not parse, or when no strategy applied.
+        """
+        notes: List[str] = []
+        code = extract_code_block(raw, prefer_langs=("python","py"))
+        code = normalize_whitespace(code)
+        code = re.sub(r"^```.*?\n|\n```$", "", code, flags=re.DOTALL)
+        def result(source: str, ok: bool) -> NormalizeResult:
+            return NormalizeResult(files={schema.file_name: source}, notes="\n".join(notes), ok=ok)
+        try:
+            tree = ast.parse(code)
+        except SyntaxError as e:
+            notes.append(f"parse failed: {e}; returning raw after whitespace normalize")
+            return result(code, False)
+        fn_names = [n.name for n in tree.body if isinstance(n, ast.FunctionDef)]
+        cls_nodes = [n for n in tree.body if isinstance(n, ast.ClassDef)]
+        if schema.entry_point in fn_names:
+            notes.append(f"top-level function '{schema.entry_point}' found")
+            return result(maybe_black(code), True)
+        if schema.prefer_rename and len(fn_names) == 1:
+            old = fn_names[0]
+            notes.append(f"renaming single function '{old}' -> '{schema.entry_point}'")
+            new_code = self._rename_function(tree, code, old, schema.entry_point)
+            return result(maybe_black(new_code), True)
+        for cls in cls_nodes:
+            method_names = [n.name for n in cls.body if isinstance(n, ast.FunctionDef)]
+            if schema.entry_point in method_names:
+                notes.append(f"found method {cls.name}.{schema.entry_point}; adding thin adapter")
+                adapter = (
+                    f"\n\ndef {schema.entry_point}(*args, **kwargs):\n"
+                    f"    return {cls.name}().{schema.entry_point}(*args, **kwargs)\n"
+                )
+                return result(maybe_black(code + adapter), True)
+        candidates = [n for n in fn_names if n in {"solve","solution","func","function","answer"}]
+        if candidates:
+            old = candidates[0]
+            notes.append(f"renaming fallback '{old}' -> '{schema.entry_point}'")
+            new_code = self._rename_function(tree, code, old, schema.entry_point)
+            return result(maybe_black(new_code), True)
+        if schema.allow_wrapper:
+            notes.append("no entry found; appending dynamic-dispatch adapter to call first callable")
+            adapter = (
+                f"\n\ndef {schema.entry_point}(*args, **kwargs):\n"
+                f"    # fallback: try first callable in module\n"
+                f"    import inspect\n"
+                f"    for _name, _obj in globals().items():\n"
+                f"        if callable(_obj) and _name not in ('{schema.entry_point}',):\n"
+                f"            try:\n"
+                f"                return _obj(*args, **kwargs)\n"
+                f"            except TypeError:\n"
+                f"                continue\n"
+                f"    raise NameError('No suitable function for entry point')\n"
+            )
+            return result(maybe_black(code + adapter), True)
+        return result(code, False)

wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/utils.py ADDED Viewed

@@ -0,0 +1,126 @@
+# coding/output_sanitizer/utils.py
+from __future__ import annotations
+import re
+from textwrap import dedent
+# A markdown fence: optional language tag, then the body up to the closing
+# fence or, for an unterminated fence, the end of the text.
+_FENCE_RE = re.compile(
+    r"```(?P<lang>[a-zA-Z0-9_+-]*)\s*\n(?P<code>.*?)(?:```|$)", re.DOTALL
+)
+def extract_code_block(raw: str, prefer_langs=("python","py","cpp","c++","java"), strict: bool = False) -> str:
+    """
+    Return the best-looking fenced code block; else the raw text.
+    Args:
+        raw:
+            The raw text possibly containing fenced code blocks.
+        prefer_langs:
+            Languages to prefer when selecting a code block. Matching is
+            case-insensitive. A single string is treated as one language
+            (so ``("java")`` behaves like ``("java",)``).
+        strict:
+            If True, only return code from preferred languages, falling back
+            to unlabeled blocks. If neither exists, the raw text is returned
+            with surrounding triple quotes stripped (fences are NOT removed).
+    Returns:
+        The stripped body of the chosen block: preferred-language blocks win
+        over others, longer blocks win over shorter ones. If there are no
+        fenced blocks at all, the raw text with surrounding triple quotes
+        stripped.
+    Examples:
+        >>> extract_code_block("Here is some code:\\n```python\\ndef foo(): pass\\n```")
+        'def foo(): pass'
+        >>> extract_code_block("No code blocks here.")
+        'No code blocks here.'
+        >>> extract_code_block("Multiple:\\n```java\\nclass A {}\\n```\\n```python\\ndef f(): pass\\n```")
+        'def f(): pass'
+    """
+    # A bare string would make ``lang in prefer_langs`` a substring test, which
+    # accepts "" (unlabeled fences) and fragments such as "ja".
+    if isinstance(prefer_langs, str):
+        prefer_langs = (prefer_langs,)
+    preferred = {lang.lower() for lang in prefer_langs}
+    matches = list(_FENCE_RE.finditer(raw))
+    if not matches:
+        return strip_triple_quotes(raw)
+    def lang_of(m) -> str:
+        return (m.group("lang") or "").lower()
+    def code_len(m) -> int:
+        return len(m.group("code"))
+    if strict:
+        # First choice: preferred-language blocks; second: unlabeled blocks.
+        pools = (
+            [m for m in matches if lang_of(m) in preferred],
+            [m for m in matches if not m.group("lang")],
+        )
+        for pool in pools:
+            if pool:
+                return max(pool, key=code_len).group("code").strip()
+        # Only blocks in non-preferred languages exist: hand back the raw text.
+        return strip_triple_quotes(raw)
+    best = max(matches, key=lambda m: (lang_of(m) in preferred, code_len(m)))
+    return best.group("code").strip()
+def strip_triple_quotes(s: str) -> str:
+    """
+    Trim whitespace and peel off one pair of enclosing triple quotes, if any.
+    Args:
+        s:
+            The input string.
+    Returns:
+        The trimmed string; when it both starts and ends with the same
+        triple-quote marker (double or single), the markers are removed and
+        the remainder is trimmed again.
+    Examples:
+        >>> strip_triple_quotes('\"\"\"def foo(): pass\"\"\"')
+        'def foo(): pass'
+        >>> strip_triple_quotes("'''def foo(): pass'''")
+        'def foo(): pass'
+        >>> strip_triple_quotes('def foo(): pass')
+        'def foo(): pass'
+    """
+    trimmed = s.strip()
+    for marker in ('"""', "'''"):
+        if trimmed.startswith(marker) and trimmed.endswith(marker):
+            return trimmed[3:-3].strip()
+    return trimmed
+def normalize_whitespace(code: str) -> str:
+    """
+    Convert CRLF / CR line endings to LF, remove the indentation common to
+    all lines, and strip leading/trailing whitespace.
+    arguments:
+        code:
+            The input code string.
+    returns:
+        The normalized code string.
+    examples:
+        >>> normalize_whitespace("  def foo():\\n    pass  ")
+        'def foo():\\n  pass'
+        >>> normalize_whitespace("def foo():\\r\\n    pass\\r")
+        'def foo():\\n    pass'
+    """
+    # "\r\n" and a lone "\r" both become a single "\n".
+    unix_text = re.sub(r"\r\n?", "\n", code)
+    return dedent(unix_text).strip()
+def maybe_black(code: str) -> str:
+    """
+    Format ``code`` with Black on a best-effort basis.
+    arguments:
+        code:
+            The input Python code string.
+    returns:
+        Black's output when Black can be imported and formats the code
+        without raising; otherwise the input, unchanged.
+    examples:
+        >>> maybe_black("def foo():pass")  # with Black installed
+        'def foo():\\n    pass\\n'
+    """
+    try:
+        import black
+    except Exception:
+        # Black is optional; without it the code passes through untouched.
+        return code
+    try:
+        formatted = black.format_str(code, mode=black.FileMode())
+    except Exception:
+        # Deliberate best-effort: code Black rejects is returned as-is.
+        return code
+    return formatted

wisent/core/evaluators/benchmark_specific/coding/providers/__init__.py ADDED Viewed

@@ -0,0 +1,18 @@
+# coding/providers/__init__.py
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Dict, Iterable, Protocol, Literal
+# Closed set of language tags accepted by CodingTask.language.
+Language = Literal["python", "cpp", "java"]
+@dataclass(frozen=True)
+class CodingTask:
+    """
+    A normalized task with language + harness files to be executed.
+    Frozen dataclass: fields cannot be reassigned after construction.
+    """
+    language: Language
+    files: Dict[str, str]          # e.g., {"solution.py": "...", "tests.py": "..."} or C++/Java equivalents
+    options: Dict[str, object]     # e.g., {"cxx_std": "c++20", "java_main": "MainTest"}
+class Provider(Protocol):
+    """
+    Dataset provider yields tasks (codegen or self-repair compatible).
+    Structural interface: a ``name`` attribute plus ``iter_tasks``, which
+    takes a split name (default "test") and returns an iterable of CodingTask.
+    """
+    name: str
+    def iter_tasks(self, split: str = "test") -> Iterable[CodingTask]: ...

wisent/core/evaluators/benchmark_specific/coding/providers/core/__init__.py ADDED Viewed

File without changes

wisent/core/evaluators/benchmark_specific/coding/providers/core/atoms.py ADDED Viewed

@@ -0,0 +1,31 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Iterable, Protocol, Literal
+# Closed set of language tags accepted by CodingTask.language.
+Language = Literal["python", "cpp", "java"]
+@dataclass(frozen=True)
+class CodingTask:
+    """
+    A normalized task with language + harness files to be executed.
+    Frozen dataclass: fields cannot be reassigned after construction.
+    attributes:
+        language:
+            The programming language of the task.
+        files:
+            A dictionary mapping filenames to their content. For example,
+            {"solution.py": "...", "tests.py": "..."} for Python tasks,
+            or equivalent files for C++/Java tasks.
+        options:
+            A dictionary of additional options that may be required for
+            execution. For example, {"cxx_std": "c++20"} for C++ tasks,
+            or {"java_main": "MainTest"} for Java tasks.
+    """
+    language: Language
+    files: dict[str, str]
+    options: dict[str, object]
+class Provider(Protocol):
+    """
+    Dataset provider yields tasks (codegen or self-repair compatible).
+    Structural interface: a ``name`` attribute plus ``iter_tasks``, which
+    takes a split name (default "test") and returns an iterable of CodingTask.
+    """
+    name: str
+    def iter_tasks(self, split: str = "test") -> Iterable[CodingTask]: ...

wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .provider import LiveCodeBenchProvider
+__all__ = ["LiveCodeBenchProvider"]