inferencebench-code 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inferencebench_code/__init__.py +12 -0
- inferencebench_code/benchmarks/humaneval-mini.yaml +15 -0
- inferencebench_code/benchmarks/mbpp-mini.yaml +15 -0
- inferencebench_code/datasets/humaneval-mini.jsonl +5 -0
- inferencebench_code/datasets/mbpp-mini.jsonl +5 -0
- inferencebench_code/plugin.py +532 -0
- inferencebench_code/py.typed +0 -0
- inferencebench_code/runner.py +144 -0
- inferencebench_code/schemas.py +93 -0
- inferencebench_code/scoring.py +79 -0
- inferencebench_code-0.0.2.dist-info/METADATA +68 -0
- inferencebench_code-0.0.2.dist-info/RECORD +14 -0
- inferencebench_code-0.0.2.dist-info/WHEEL +4 -0
- inferencebench_code-0.0.2.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""InferenceBench code-generation plugin."""
|
|
2
|
+
|
|
3
|
+
from inferencebench_code.plugin import EXPECTED_METRICS, CodeGenerationPlugin
|
|
4
|
+
from inferencebench_code.schemas import BenchmarkSpec, EngineKind, RunContext
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"EXPECTED_METRICS",
|
|
8
|
+
"BenchmarkSpec",
|
|
9
|
+
"CodeGenerationPlugin",
|
|
10
|
+
"EngineKind",
|
|
11
|
+
"RunContext",
|
|
12
|
+
]
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
benchmark_id: code.generation.humaneval-mini
|
|
2
|
+
suite_version: 1.0.0
|
|
3
|
+
description: Five stdlib-only Python tasks, pass@1 with a 5-second wall-clock timeout.
|
|
4
|
+
modality: code
|
|
5
|
+
kind: generation
|
|
6
|
+
dataset:
|
|
7
|
+
id: builtin-humaneval-mini
|
|
8
|
+
path: humaneval-mini.jsonl
|
|
9
|
+
slo_template: code.generation.standard
|
|
10
|
+
warmup:
|
|
11
|
+
discard_runs: 0
|
|
12
|
+
language: python
|
|
13
|
+
scoring: pass_at_1
|
|
14
|
+
k: 1
|
|
15
|
+
timeout_s: 5.0
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
benchmark_id: code.generation.mbpp-mini
|
|
2
|
+
suite_version: 1.0.0
|
|
3
|
+
description: Mostly Basic Python Problems (MBPP)-style stdlib-only tasks, pass@1 with a 5-second wall-clock timeout.
|
|
4
|
+
modality: code
|
|
5
|
+
kind: generation
|
|
6
|
+
dataset:
|
|
7
|
+
id: builtin-mbpp-mini
|
|
8
|
+
path: mbpp-mini.jsonl
|
|
9
|
+
slo_template: code.generation.standard
|
|
10
|
+
warmup:
|
|
11
|
+
discard_runs: 0
|
|
12
|
+
language: python
|
|
13
|
+
scoring: pass_at_1
|
|
14
|
+
k: 1
|
|
15
|
+
timeout_s: 5.0
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
{"task_id": "add_two_numbers", "entry_point": "add", "prompt": "def add(a, b):\n \"\"\"Return the sum of a and b.\"\"\"\n", "canonical_solution": "def add(a, b):\n return a + b\n", "tests": "assert add(1, 2) == 3\nassert add(0, 0) == 0\nassert add(-1, 1) == 0\nassert add(-5, -7) == -12\nassert add(100, 250) == 350\nassert add(2.5, 0.5) == 3.0\nassert add(-100, 100) == 0\n"}
|
|
2
|
+
{"task_id": "reverse_string", "entry_point": "reverse_string", "prompt": "def reverse_string(s):\n \"\"\"Return the reverse of the string s.\"\"\"\n", "canonical_solution": "def reverse_string(s):\n return s[::-1]\n", "tests": "assert reverse_string('') == ''\nassert reverse_string('a') == 'a'\nassert reverse_string('hello') == 'olleh'\nassert reverse_string('abcde') == 'edcba'\nassert reverse_string(' spaces ') == ' secaps '\nassert reverse_string('racecar') == 'racecar'\nassert reverse_string('AbC') == 'CbA'\n"}
|
|
3
|
+
{"task_id": "fibonacci_iter", "entry_point": "fib", "prompt": "def fib(n):\n \"\"\"Return the n-th Fibonacci number (0-indexed: fib(0)=0, fib(1)=1).\"\"\"\n", "canonical_solution": "def fib(n):\n a, b = 0, 1\n for _ in range(n):\n a, b = b, a + b\n return a\n", "tests": "assert fib(0) == 0\nassert fib(1) == 1\nassert fib(2) == 1\nassert fib(3) == 2\nassert fib(5) == 5\nassert fib(10) == 55\nassert fib(15) == 610\n"}
|
|
4
|
+
{"task_id": "count_vowels", "entry_point": "count_vowels", "prompt": "def count_vowels(s):\n \"\"\"Count the number of vowels (a, e, i, o, u) in the lowercase string s.\"\"\"\n", "canonical_solution": "def count_vowels(s):\n return sum(1 for c in s if c in 'aeiou')\n", "tests": "assert count_vowels('') == 0\nassert count_vowels('bcdfg') == 0\nassert count_vowels('aeiou') == 5\nassert count_vowels('hello') == 2\nassert count_vowels('python') == 1\nassert count_vowels('queue') == 4\nassert count_vowels('rhythm') == 0\n"}
|
|
5
|
+
{"task_id": "is_palindrome", "entry_point": "is_palindrome", "prompt": "def is_palindrome(s):\n \"\"\"Return True if s is a palindrome (case-insensitive). Empty string is a palindrome.\"\"\"\n", "canonical_solution": "def is_palindrome(s):\n t = s.lower()\n return t == t[::-1]\n", "tests": "assert is_palindrome('') is True\nassert is_palindrome('a') is True\nassert is_palindrome('racecar') is True\nassert is_palindrome('Racecar') is True\nassert is_palindrome('hello') is False\nassert is_palindrome('Level') is True\nassert is_palindrome('abba') is True\n"}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
{"task_id": "mbpp-001", "prompt": "Write a function sum_list(items) that returns the sum of a list of integers.", "tests": "assert sum_list([1,2,3]) == 6\nassert sum_list([]) == 0\nassert sum_list([-1, -2, 3]) == 0\nassert sum_list([10, 20, 30, 40]) == 100\n", "canonical_solution": "def sum_list(items):\n return sum(items)\n", "entry_point": "sum_list"}
|
|
2
|
+
{"task_id": "mbpp-002", "prompt": "Write a function max_of_three(a, b, c) that returns the largest of three numbers.", "tests": "assert max_of_three(1, 2, 3) == 3\nassert max_of_three(5, 2, 4) == 5\nassert max_of_three(-1, -2, -3) == -1\nassert max_of_three(7, 7, 7) == 7\n", "canonical_solution": "def max_of_three(a, b, c):\n return max(a, b, c)\n", "entry_point": "max_of_three"}
|
|
3
|
+
{"task_id": "mbpp-003", "prompt": "Write a function count_evens(nums) that returns the count of even integers in the list nums.", "tests": "assert count_evens([1, 2, 3, 4]) == 2\nassert count_evens([]) == 0\nassert count_evens([2, 4, 6, 8]) == 4\nassert count_evens([1, 3, 5]) == 0\n", "canonical_solution": "def count_evens(nums):\n return sum(1 for n in nums if n % 2 == 0)\n", "entry_point": "count_evens"}
|
|
4
|
+
{"task_id": "mbpp-004", "prompt": "Write a function gcd(a, b) that returns the greatest common divisor of two non-negative integers.", "tests": "assert gcd(12, 18) == 6\nassert gcd(7, 5) == 1\nassert gcd(100, 75) == 25\nassert gcd(0, 9) == 9\n", "canonical_solution": "def gcd(a, b):\n while b:\n a, b = b, a % b\n return a\n", "entry_point": "gcd"}
|
|
5
|
+
{"task_id": "mbpp-005", "prompt": "Write a function unique_sorted(items) that returns a sorted list of the unique values in items.", "tests": "assert unique_sorted([3, 1, 2, 3, 1]) == [1, 2, 3]\nassert unique_sorted([]) == []\nassert unique_sorted([5]) == [5]\nassert unique_sorted([2, 2, 2, 2]) == [2]\n", "canonical_solution": "def unique_sorted(items):\n return sorted(set(items))\n", "entry_point": "unique_sorted"}
|
|
@@ -0,0 +1,532 @@
|
|
|
1
|
+
"""CodeGenerationPlugin — entry point for ``code.generation`` benchmarks.
|
|
2
|
+
|
|
3
|
+
HumanEval-style execution-based scoring: for each fixture row we send the
|
|
4
|
+
function-signature prompt to the model, extract Python code from the
|
|
5
|
+
response, execute it against the bundled unit tests in an isolated
|
|
6
|
+
subprocess, and aggregate per-task pass/fail into a ``pass_at_1`` headline.
|
|
7
|
+
|
|
8
|
+
Structural twin of :class:`inferencebench_quality.plugin.LLMQualityPlugin`:
|
|
9
|
+
plugin contract, ModelClient wiring, signing flow, and envelope shape all
|
|
10
|
+
mirror that module so cross-plugin tooling (summary / compare / diff /
|
|
11
|
+
audit) treats code envelopes the same as quality envelopes.
|
|
12
|
+
|
|
13
|
+
**Safety:** :func:`run` prints a yellow banner on every invocation as a
|
|
14
|
+
reminder that model output is executed locally. See the package README for
|
|
15
|
+
the full safety boundary; ``runner.py`` is best-effort defence-in-depth, not
|
|
16
|
+
a sandbox.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import hashlib
|
|
22
|
+
import json
|
|
23
|
+
import math
|
|
24
|
+
import os
|
|
25
|
+
import sys
|
|
26
|
+
import time
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
|
|
29
|
+
import yaml
|
|
30
|
+
|
|
31
|
+
from inferencebench.envelope import (
|
|
32
|
+
DatasetSpec as EnvDatasetSpec,
|
|
33
|
+
)
|
|
34
|
+
from inferencebench.envelope import (
|
|
35
|
+
EngineConfig,
|
|
36
|
+
Envelope,
|
|
37
|
+
EnvelopeBuilder,
|
|
38
|
+
ModelConfig,
|
|
39
|
+
Quantization,
|
|
40
|
+
SigningMode,
|
|
41
|
+
sign_envelope,
|
|
42
|
+
)
|
|
43
|
+
from inferencebench.harness import (
|
|
44
|
+
CompletionResult,
|
|
45
|
+
ModelClient,
|
|
46
|
+
Sample,
|
|
47
|
+
collect_hardware_fingerprint,
|
|
48
|
+
collect_software_provenance,
|
|
49
|
+
)
|
|
50
|
+
from inferencebench.harness.metrics import EnergyReport, Percentiles, TelemetryWindow
|
|
51
|
+
from inferencebench_code.runner import RunResult, run_unit_tests
|
|
52
|
+
from inferencebench_code.schemas import BenchmarkSpec, EngineKind, RunContext
|
|
53
|
+
from inferencebench_code.scoring import extract_python_code
|
|
54
|
+
|
|
55
|
+
# Warning text that ``CodeGenerationPlugin.run`` prints to stderr on every
# call. The leading/trailing escape sequences colour the message in the
# terminal and then restore the default colour.
_SAFETY_BANNER = (
    "\033[33m"  # yellow
    "WARNING: code.generation executes model-generated Python locally. "
    "The runner is best-effort (python -I subprocess with timeout + forbidden-import "
    "pre-scan) — NOT a sandbox. Use only with trusted models and bundled fixtures."
    "\033[0m"  # reset terminal colour
)

# Engines that require ``base_url`` (self-hosted OpenAI-compatible servers).
# Membership also decides whether ``_build_client`` applies the ``openai/``
# model-id prefix and the ``"EMPTY"`` API-key fallback.
_SELF_HOSTED_ENGINES = frozenset({EngineKind.VLLM, EngineKind.SGLANG})
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _fixtures_cache_root() -> Path:
    """Return the directory that backs ``fixtures://`` dataset URIs.

    A non-empty ``BENCH_FIXTURES_ROOT`` environment variable takes priority;
    otherwise the per-user default ``~/.cache/inferencebench/fixtures`` is
    returned. The home directory is only looked up when the default is needed.
    """
    configured = os.environ.get("BENCH_FIXTURES_ROOT")
    if not configured:
        return Path.home().joinpath(".cache", "inferencebench", "fixtures")
    return Path(configured)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _json_num(v: float) -> str:
    """Render ``v`` as a JSON number literal.

    JSON has no spelling for NaN or infinities, so non-finite floats are
    emitted as ``null``; every other value is rendered with ``repr``.
    """
    non_finite_float = isinstance(v, float) and not math.isfinite(v)
    return "null" if non_finite_float else repr(v)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _compute_fixture_hash(items: list[dict[str, str]]) -> str:
    """Return the hex SHA-256 digest of the fixture rows.

    Rows are serialised as canonical JSON first (keys sorted, no whitespace
    between tokens) and the UTF-8 bytes of that text are hashed, so the digest
    does not depend on the key order of the individual row dicts.
    """
    payload = json.dumps(items, separators=(",", ":"), sort_keys=True).encode("utf-8")
    hasher = hashlib.sha256()
    hasher.update(payload)
    return hasher.hexdigest()
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _build_client(context: RunContext, *, timeout_s: float = 60.0) -> ModelClient:
    """Create the :class:`ModelClient` described by ``context``.

    Routing follows the quality plugin's convention:

    * Self-hosted OpenAI-compatible engines (vLLM, SGLang): the model id is
      normalised to carry the ``openai/`` LiteLLM prefix exactly once, and a
      missing API key falls back to the placeholder ``"EMPTY"``.
    * Provider-hosted engines: the model id is passed through untouched and a
      missing API key becomes ``None``.

    Args:
        context: Run context supplying ``model_id``, ``engine_kind``,
            ``api_key`` and ``base_url``.
        timeout_s: Timeout forwarded to the client.

    Returns:
        The configured client; an empty ``base_url`` is forwarded as ``None``.
    """
    if context.engine_kind in _SELF_HOSTED_ENGINES:
        # Strip an existing prefix before re-adding it so it never doubles up.
        routed_model = "openai/" + context.model_id.removeprefix("openai/")
        key: str | None = context.api_key or "EMPTY"
    else:
        routed_model = context.model_id
        key = context.api_key or None

    return ModelClient(
        model=routed_model,
        api_key=key,
        base_url=context.base_url or None,
        timeout_s=timeout_s,
    )
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _ensure_signature_present(prompt: str, generated: str) -> str:
    """Return ``generated``, prefixed with ``prompt`` when its signature is absent.

    Models sometimes answer with only a function body. To keep the named
    entry point callable, the first ``def ...`` line of ``prompt`` (compared
    after stripping surrounding whitespace) is searched for in ``generated``;
    when it is missing, the right-stripped prompt and a newline are prepended.
    Prompts that contain no ``def`` line leave ``generated`` unchanged.
    """
    stripped_lines = map(str.strip, prompt.splitlines())
    signature = next((text for text in stripped_lines if text.startswith("def ")), "")
    if not signature or signature in generated:
        return generated
    return f"{prompt.rstrip()}\n{generated}"
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
# Metrics this plugin is expected to emit. Consumed by ``bench coverage``.
# Every name here is a key that ``CodeGenerationPlugin._build_envelope`` can
# write into the envelope's ``metrics`` dict (some only when data is present,
# e.g. the latency percentiles need at least one finite sample).
EXPECTED_METRICS: tuple[str, ...] = (
    "pass_at_1",
    "pass_at_1_p05",
    "pass_at_1_p50",
    "pass_at_1_p95",
    "timeout_rate",
    "ok_rate",
    "n_samples",
    "ttft_p50_ms",
    "total_p50_ms",
)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
class CodeGenerationPlugin:
|
|
147
|
+
"""Plugin entry point. Registered via ``inferencebench.plugins`` entrypoint group."""
|
|
148
|
+
|
|
149
|
+
suite_id = "code.generation"
|
|
150
|
+
version = "0.0.2"
|
|
151
|
+
description = (
|
|
152
|
+
"Code-generation benchmarks (HumanEval-style execution-based scoring; "
|
|
153
|
+
"executes model output locally — see README for the safety boundary)."
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
# ----------------------------------------------------------- benchmarks #
|
|
157
|
+
def list_benchmarks(self) -> list[BenchmarkSpec]:
|
|
158
|
+
bench_dir = self._benchmarks_dir()
|
|
159
|
+
specs: list[BenchmarkSpec] = []
|
|
160
|
+
if not bench_dir.exists():
|
|
161
|
+
return specs
|
|
162
|
+
for yml in sorted(bench_dir.glob("*.yaml")):
|
|
163
|
+
specs.append(self._load_yaml(yml))
|
|
164
|
+
return specs
|
|
165
|
+
|
|
166
|
+
def get_benchmark(self, benchmark_id: str) -> BenchmarkSpec:
|
|
167
|
+
for spec in self.list_benchmarks():
|
|
168
|
+
if spec.benchmark_id == benchmark_id:
|
|
169
|
+
return spec
|
|
170
|
+
msg = f"benchmark_id not found: {benchmark_id}"
|
|
171
|
+
raise KeyError(msg)
|
|
172
|
+
|
|
173
|
+
# ------------------------------------------------------------- validate #
|
|
174
|
+
def validate(self, spec: BenchmarkSpec, context: RunContext) -> list[str]:
    """Collect human-readable problems with a planned run.

    Three checks are made, in this order: the model id must be non-empty,
    self-hosted engines must have a ``base_url``, and the dataset file the
    spec resolves to must exist.

    Returns:
        The list of problem messages; empty when nothing was found.
    """
    problems: list[str] = []

    if not context.model_id:
        problems.append("model_id is empty")

    self_hosted = context.engine_kind in _SELF_HOSTED_ENGINES
    if self_hosted and not context.base_url:
        problems.append(
            f"{context.engine_kind.value} needs base_url (e.g. http://localhost:8000/v1)"
        )

    dataset_file = self._dataset_path(spec)
    if not dataset_file.exists():
        problems.append(f"fixture not found: {spec.dataset.path}")

    return problems
|
|
185
|
+
|
|
186
|
+
# ------------------------------------------------------------------ run #
|
|
187
|
+
def run(self, spec: BenchmarkSpec, context: RunContext) -> Envelope:
    """Run ``spec`` against the configured model and return a signed envelope.

    The safety banner is printed to stderr on every call. The fixture is
    loaded and hashed, each row is scored, a diagnostic samples file is
    written (best effort), and the resulting envelope is signed.

    Signing is selected by ``context.extra["signing_mode"]`` (default
    ``"dev"``). Dev mode needs ``context.extra["dev_key_path"]``; any other
    mode value results in keyless signing.

    Raises:
        ValueError: Dev signing was requested without ``dev_key_path``
            (also raised by fixture loading when the fixture has no rows).
        FileNotFoundError: The fixture file does not exist.
    """
    print(_SAFETY_BANNER, file=sys.stderr, flush=True)

    model_client = _build_client(context)
    fixture_rows = self._load_fixture(spec)
    fixture_digest = _compute_fixture_hash(fixture_rows)

    samples, passed_flags, timeout_flags, telemetry = self._score_items(
        model_client, fixture_rows, timeout_s=spec.timeout_s
    )

    # Best-effort diagnostic dump — never blocks the run on I/O errors.
    self._dump_samples(context, samples)

    unsigned = self._build_envelope(
        spec,
        context,
        samples=samples,
        passed_flags=passed_flags,
        timeout_flags=timeout_flags,
        dataset_hash=fixture_digest,
        energy=telemetry.summarise(samples),
    )

    mode = context.extra.get("signing_mode", "dev")
    key_location = context.extra.get("dev_key_path")
    if mode != "dev":
        return sign_envelope(unsigned, mode=SigningMode.KEYLESS)
    if not key_location:
        raise ValueError("dev signing requires context.extra['dev_key_path']")
    return sign_envelope(
        unsigned,
        mode=SigningMode.DEV,
        dev_key_path=Path(str(key_location)),
    )
|
|
227
|
+
|
|
228
|
+
# -------------------------------------------------------- core scoring #
|
|
229
|
+
def _score_items(
    self,
    client: ModelClient,
    items: list[dict[str, str]],
    *,
    timeout_s: float,
) -> tuple[list[Sample], list[bool], list[bool], TelemetryWindow]:
    """Score every fixture row, one after another.

    Per row:

    1. Request a streamed completion for the prompt (``max_tokens=512``).
    2. Extract the Python code from the reply.
    3. Re-attach the prompt's function signature if the reply lacks it.
    4. Run the code against the row's unit tests under a ``timeout_s``
       wall-clock limit.

    A failed completion call is recorded as a non-ok :class:`Sample` with NaN
    latencies and counts as neither passed nor timed out. A completed call
    is recorded as an ok sample whose ``extra`` carries ``task_id``,
    ``passed``, ``duration_s``, ``timeout_flag`` and, for failures, an
    ``error_summary``.

    Returns:
        ``(samples, passed_flags, timeout_flags, telemetry)`` — the three
        lists are parallel (one entry per row) and ``telemetry`` is the
        :class:`TelemetryWindow` that was open for the whole loop.
    """
    samples: list[Sample] = []
    passed_flags: list[bool] = []
    timeout_flags: list[bool] = []
    nan = float("nan")

    telemetry = TelemetryWindow()
    with telemetry:
        for idx, item in enumerate(items):
            prompt, tests = item["prompt"], item["tests"]
            t_arrival = time.perf_counter() * 1000.0

            try:
                result: CompletionResult = client.complete(prompt, stream=True, max_tokens=512)
            except Exception as exc:
                # The request itself failed: nothing to execute for this row.
                failed_sample = Sample(
                    request_idx=idx,
                    arrival_ms=t_arrival,
                    start_ms=t_arrival,
                    ttft_ms=nan,
                    total_ms=nan,
                    tpot_ms=nan,
                    tokens_in=0,
                    tokens_out=0,
                    cost_usd=0.0,
                    finish_reason="error",
                    ok=False,
                    error=str(exc),
                )
                samples.append(failed_sample)
                passed_flags.append(False)
                timeout_flags.append(False)
                continue

            solution = _ensure_signature_present(prompt, extract_python_code(result.text))
            outcome: RunResult = run_unit_tests(solution, tests, timeout_s=timeout_s)
            passed_flags.append(outcome.passed)
            timeout_flags.append(outcome.timeout)

            details: dict[str, str | int | float | bool] = {
                "task_id": item.get("task_id", ""),
                "passed": outcome.passed,
                "duration_s": outcome.duration_s,
                "timeout_flag": outcome.timeout,
            }
            summary = self._summarize_error(outcome)
            if summary:
                details["error_summary"] = summary

            scored_sample = Sample(
                request_idx=idx,
                arrival_ms=t_arrival,
                start_ms=t_arrival,
                ttft_ms=result.ttft_ms,
                total_ms=result.total_ms,
                tpot_ms=result.tpot_ms,
                tokens_in=result.tokens_in,
                tokens_out=result.tokens_out,
                cost_usd=result.cost_usd,
                finish_reason=result.finish_reason,
                ok=True,
                extra=details,
            )
            samples.append(scored_sample)

    return samples, passed_flags, timeout_flags, telemetry
|
|
320
|
+
|
|
321
|
+
@staticmethod
|
|
322
|
+
def _summarize_error(result: RunResult) -> str:
|
|
323
|
+
"""Distil ``RunResult`` into a short ``error_summary`` string.
|
|
324
|
+
|
|
325
|
+
Empty when the run passed. Otherwise: ``"timeout"`` for wall-clock
|
|
326
|
+
kills, ``"forbidden_import: <name>"`` for pre-scan refusals (the
|
|
327
|
+
stderr is already shaped that way by the runner), and the last
|
|
328
|
+
line of stderr (typically the AssertionError or Exception class)
|
|
329
|
+
for normal failures.
|
|
330
|
+
"""
|
|
331
|
+
if result.passed:
|
|
332
|
+
return ""
|
|
333
|
+
if result.timeout:
|
|
334
|
+
return "timeout"
|
|
335
|
+
if result.stderr.startswith("forbidden_import"):
|
|
336
|
+
return result.stderr.split("\n", 1)[0]
|
|
337
|
+
last_lines = [line.strip() for line in result.stderr.strip().splitlines() if line.strip()]
|
|
338
|
+
return last_lines[-1] if last_lines else "exit_nonzero"
|
|
339
|
+
|
|
340
|
+
# ------------------------------------------------------------ samples #
|
|
341
|
+
def _dump_samples(self, context: RunContext, samples: list[Sample]) -> None:
|
|
342
|
+
"""Write per-task samples (incl. pass flag + duration) to ``samples-<ts>.jsonl``.
|
|
343
|
+
|
|
344
|
+
Mirrors the llm-quality plugin's diagnostic dump — failures here
|
|
345
|
+
never block the run.
|
|
346
|
+
"""
|
|
347
|
+
try:
|
|
348
|
+
out_dir = Path(context.output_dir)
|
|
349
|
+
out_dir.mkdir(parents=True, exist_ok=True)
|
|
350
|
+
ts = int(time.time())
|
|
351
|
+
path = out_dir / f"samples-{ts}.jsonl"
|
|
352
|
+
with path.open("w", encoding="utf-8") as fp:
|
|
353
|
+
for s in samples:
|
|
354
|
+
extra = s.extra or {}
|
|
355
|
+
passed = bool(extra.get("passed", False))
|
|
356
|
+
duration = extra.get("duration_s")
|
|
357
|
+
timeout_flag = bool(extra.get("timeout_flag", False))
|
|
358
|
+
duration_part = (
|
|
359
|
+
',"duration_s":' + _json_num(float(duration))
|
|
360
|
+
if isinstance(duration, (int, float))
|
|
361
|
+
else ""
|
|
362
|
+
)
|
|
363
|
+
fp.write(
|
|
364
|
+
'{"request_idx":'
|
|
365
|
+
+ str(s.request_idx)
|
|
366
|
+
+ ',"ok":'
|
|
367
|
+
+ ("true" if s.ok else "false")
|
|
368
|
+
+ ',"passed":'
|
|
369
|
+
+ ("true" if passed else "false")
|
|
370
|
+
+ ',"timeout":'
|
|
371
|
+
+ ("true" if timeout_flag else "false")
|
|
372
|
+
+ ',"ttft_ms":'
|
|
373
|
+
+ _json_num(s.ttft_ms)
|
|
374
|
+
+ ',"total_ms":'
|
|
375
|
+
+ _json_num(s.total_ms)
|
|
376
|
+
+ ',"tokens_out":'
|
|
377
|
+
+ str(s.tokens_out)
|
|
378
|
+
+ duration_part
|
|
379
|
+
+ ',"finish_reason":"'
|
|
380
|
+
+ (s.finish_reason or "")
|
|
381
|
+
+ '"'
|
|
382
|
+
+ "}\n"
|
|
383
|
+
)
|
|
384
|
+
except OSError:
|
|
385
|
+
pass # diagnostics-only — never block the run
|
|
386
|
+
|
|
387
|
+
# ---------------------------------------------------------- file paths #
|
|
388
|
+
def _benchmarks_dir(self) -> Path:
|
|
389
|
+
return Path(__file__).parent / "benchmarks"
|
|
390
|
+
|
|
391
|
+
def _datasets_dir(self) -> Path:
|
|
392
|
+
return Path(__file__).parent / "datasets"
|
|
393
|
+
|
|
394
|
+
def _dataset_path(self, spec: BenchmarkSpec) -> Path:
|
|
395
|
+
raw = spec.dataset.path
|
|
396
|
+
if raw.startswith("fixtures://"):
|
|
397
|
+
return _fixtures_cache_root() / f"{raw[len('fixtures://') :]}.jsonl"
|
|
398
|
+
return self._datasets_dir() / raw
|
|
399
|
+
|
|
400
|
+
def _load_yaml(self, path: Path) -> BenchmarkSpec:
    """Parse the YAML file at ``path`` and validate it as a ``BenchmarkSpec``.

    The file is read as UTF-8 and parsed with ``yaml.safe_load``; a document
    that parses to a falsy value (for example an empty file) is validated as
    an empty mapping.
    """
    document = yaml.safe_load(path.read_text(encoding="utf-8"))
    if not document:
        document = {}
    return BenchmarkSpec.model_validate(document)
|
|
403
|
+
|
|
404
|
+
def _load_fixture(self, spec: BenchmarkSpec) -> list[dict[str, str]]:
|
|
405
|
+
path = self._dataset_path(spec)
|
|
406
|
+
if not path.exists():
|
|
407
|
+
if spec.dataset.path.startswith("fixtures://"):
|
|
408
|
+
key = spec.dataset.path[len("fixtures://") :]
|
|
409
|
+
msg = f"fixture not cached: {path}. Run `bench fixtures fetch {key}` first."
|
|
410
|
+
raise FileNotFoundError(msg)
|
|
411
|
+
msg = f"fixture not found: {path}"
|
|
412
|
+
raise FileNotFoundError(msg)
|
|
413
|
+
items: list[dict[str, str]] = []
|
|
414
|
+
with path.open("r", encoding="utf-8") as fp:
|
|
415
|
+
for line in fp:
|
|
416
|
+
line = line.strip()
|
|
417
|
+
if not line:
|
|
418
|
+
continue
|
|
419
|
+
obj = json.loads(line)
|
|
420
|
+
if not isinstance(obj, dict):
|
|
421
|
+
continue
|
|
422
|
+
if "task_id" not in obj or "prompt" not in obj or "tests" not in obj:
|
|
423
|
+
continue
|
|
424
|
+
items.append(
|
|
425
|
+
{
|
|
426
|
+
"task_id": str(obj["task_id"]),
|
|
427
|
+
"prompt": str(obj["prompt"]),
|
|
428
|
+
"tests": str(obj["tests"]),
|
|
429
|
+
"canonical_solution": str(obj.get("canonical_solution", "")),
|
|
430
|
+
"entry_point": str(obj.get("entry_point", "")),
|
|
431
|
+
}
|
|
432
|
+
)
|
|
433
|
+
if not items:
|
|
434
|
+
msg = f"fixture is empty: {path}"
|
|
435
|
+
raise ValueError(msg)
|
|
436
|
+
return items
|
|
437
|
+
|
|
438
|
+
# ---------------------------------------------------------- envelope #
|
|
439
|
+
def _build_envelope(
    self,
    spec: BenchmarkSpec,
    context: RunContext,
    *,
    samples: list[Sample],
    passed_flags: list[bool],
    timeout_flags: list[bool],
    dataset_hash: str,
    energy: EnergyReport | None = None,
) -> Envelope:
    """Aggregate per-row results into metrics and build the unsigned envelope.

    Metrics written (in this order, each group only when its data exists):

    * ``n_samples``, ``n_ok``, ``ok_rate`` — always;
    * ``pass_at_1`` plus its p05/p50/p95 — when ``passed_flags`` is non-empty
      (with a single row the percentiles equal the mean);
    * ``timeout_rate`` — when ``timeout_flags`` is non-empty;
    * ``ttft_p50_ms`` / ``total_p50_ms`` — over finite latencies of ok samples;
    * power / energy figures — when ``energy`` is given and reports them;
    * ``tokens_out_total`` and provider cost per million output tokens — when
      ok samples produced tokens (and a positive cost).

    Args:
        spec: Benchmark being reported.
        context: Run context supplying model / engine identity.
        samples: All recorded samples, ok or not.
        passed_flags: Per-row pass results.
        timeout_flags: Per-row timeout results.
        dataset_hash: Digest of the fixture rows.
        energy: Optional telemetry summary.

    Returns:
        The envelope produced by ``EnvelopeBuilder.build()``.
    """
    hw = collect_hardware_fingerprint()
    sw = collect_software_provenance()

    ok_samples = [s for s in samples if s.ok]
    n_total = len(samples)
    n_ok = len(ok_samples)

    metrics: dict[str, float | int | str | None] = {
        "n_samples": float(n_total),
        "n_ok": float(n_ok),
        "ok_rate": float(n_ok) / float(n_total) if n_total else 0.0,
    }

    if passed_flags:
        scores = [1.0 if flag else 0.0 for flag in passed_flags]
        mean_pass = sum(scores) / len(scores)
        if len(scores) >= 2:
            pcts = Percentiles(scores, percentiles=(5.0, 50.0, 95.0))
            p05, p50, p95 = pcts.p5, pcts.p50, pcts.p95
        else:
            # A single observation: every percentile collapses to the mean.
            p05 = p50 = p95 = mean_pass
        metrics.update(
            pass_at_1=mean_pass,
            pass_at_1_p05=p05,
            pass_at_1_p50=p50,
            pass_at_1_p95=p95,
        )

    if timeout_flags:
        metrics["timeout_rate"] = float(sum(timeout_flags)) / float(len(timeout_flags))

    # Latency aggregates — useful for "quality at what cost" comparisons.
    for metric_name, attr in (("ttft_p50_ms", "ttft_ms"), ("total_p50_ms", "total_ms")):
        finite_vals = [
            value
            for value in (getattr(s, attr) for s in ok_samples)
            if math.isfinite(value)
        ]
        if finite_vals:
            metrics[metric_name] = Percentiles(finite_vals).p50

    # Energy / power summary from telemetry (None on plugins that haven't
    # threaded a TelemetryWindow through yet). Mirrors llm-inference.
    if energy is not None:
        if energy.gpu_power_avg_w > 0:
            metrics["power_avg_w"] = energy.gpu_power_avg_w
            metrics["power_peak_w"] = energy.gpu_power_peak_w
        if energy.total_energy_joules > 0:
            metrics["energy_joules_total"] = energy.total_energy_joules
        per_token = energy.joules_per_token
        if per_token == per_token:  # self-equality is False only for NaN
            metrics["joules_per_token"] = per_token

    tokens_out_total = sum(s.tokens_out for s in ok_samples)
    if tokens_out_total:
        metrics["tokens_out_total"] = float(tokens_out_total)

    cost_total = sum(s.cost_usd for s in ok_samples)
    if tokens_out_total and cost_total > 0:
        metrics["cost_usd_per_million_tokens"] = (cost_total / tokens_out_total) * 1e6
        metrics["cost_source"] = "provider"

    engine_name = context.engine_kind.value
    return EnvelopeBuilder(
        suite_id=spec.benchmark_id,
        suite_version=spec.suite_version,
        model=ModelConfig(
            id=context.model_id,
            revision=context.model_revision,
            provider=engine_name,
            endpoint_hash="0" * 64,
        ),
        engine=EngineConfig(
            name=engine_name,
            version=context.engine_version or "unknown",
            config_hash="0" * 64,
        ),
        hardware_fingerprint=hw,
        software_provenance=sw,
        dataset=EnvDatasetSpec(id=spec.dataset.id, hash=dataset_hash),
        seed=0,
        quantization=(
            Quantization(format=context.quantization_format)
            if context.quantization_format
            else None
        ),
        metrics=metrics,
        slo_template=spec.slo_template,
    ).build()
|
|
File without changes
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""Subprocess-based unit-test runner for the code-generation plugin.
|
|
2
|
+
|
|
3
|
+
Executes a model-generated Python solution alongside fixture unit tests in
|
|
4
|
+
an isolated ``python -I`` subprocess with a wall-clock timeout. **This is
|
|
5
|
+
not a real sandbox.** See ``README.md`` for the safety boundary; the
|
|
6
|
+
shortlist of forbidden-import heuristics here is defence-in-depth, not
|
|
7
|
+
defence-in-full.
|
|
8
|
+
|
|
9
|
+
Each :func:`run_unit_tests` invocation writes solution + tests to a
|
|
10
|
+
temporary file, runs it under ``subprocess.run`` with the supplied
|
|
11
|
+
``timeout_s``, captures stdout/stderr, and returns a :class:`RunResult`.
|
|
12
|
+
The temp file is always deleted via ``try/finally`` even on timeout or
|
|
13
|
+
unhandled errors.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import os
|
|
19
|
+
import subprocess
|
|
20
|
+
import sys
|
|
21
|
+
import tempfile
|
|
22
|
+
import time
|
|
23
|
+
from dataclasses import dataclass
|
|
24
|
+
|
|
25
|
+
# Substrings that immediately disqualify a solution. Cheap pre-scan, not
# a parser — there are dozens of ways around it (eval, __import__, etc).
# That's fine; this is one of several layers, and the bundled fixtures
# never exercise stdlib edges.
# ``_scan_forbidden`` checks these in tuple order with a plain ``in`` test
# against the whole solution text, so the first listed match is the one
# reported.
_FORBIDDEN_IMPORTS: tuple[str, ...] = (
    "subprocess",
    "os.system",
    "socket",
    "urllib",
    "multiprocessing",
    "ctypes",
)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass(frozen=True, slots=True)
class RunResult:
    """Outcome of one subprocess execution.

    ``passed`` is True only when the subprocess exited 0 within the wall
    clock. ``timeout`` is True when ``subprocess.TimeoutExpired`` fired
    (in which case ``duration_s`` reflects the timeout, not the real
    wall time). ``stdout`` and ``stderr`` are decoded UTF-8 strings.

    Instances are immutable (``frozen=True``). When the forbidden-import
    pre-scan rejects a solution, no subprocess is started: ``stderr`` holds a
    ``forbidden_import: ...`` message and ``duration_s`` is ``0.0``.
    """

    # True only for a zero exit status inside the time budget.
    passed: bool
    # Captured standard output of the subprocess.
    stdout: str
    # Captured standard error, or the pre-scan refusal message.
    stderr: str
    # True when the wall-clock limit fired.
    timeout: bool
    # Observed duration in seconds.
    duration_s: float
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _scan_forbidden(solution: str) -> str | None:
    """Find the first forbidden token that occurs in ``solution``.

    Tokens are tried in the order they are listed in ``_FORBIDDEN_IMPORTS``
    using a case-sensitive substring test. The check is deliberately simple:
    it trades missed detections (false negatives) for a very low chance of
    rejecting legitimate code.

    Returns:
        The matching token, or ``None`` when no token occurs.
    """
    return next((token for token in _FORBIDDEN_IMPORTS if token in solution), None)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def run_unit_tests(
|
|
69
|
+
solution: str,
|
|
70
|
+
tests: str,
|
|
71
|
+
*,
|
|
72
|
+
timeout_s: float = 5.0,
|
|
73
|
+
) -> RunResult:
|
|
74
|
+
"""Execute ``solution + tests`` in an isolated subprocess and report pass/fail.
|
|
75
|
+
|
|
76
|
+
Arguments:
|
|
77
|
+
solution: Python code defining the function under test.
|
|
78
|
+
tests: Python code that imports / references the function (already
|
|
79
|
+
in scope because solution + tests are concatenated into one
|
|
80
|
+
file) and exercises it with ``assert`` statements.
|
|
81
|
+
timeout_s: Wall-clock budget. The subprocess is killed when it
|
|
82
|
+
elapses and :class:`RunResult` is returned with ``timeout=True``.
|
|
83
|
+
|
|
84
|
+
Returns:
|
|
85
|
+
:class:`RunResult` with ``passed``, captured streams, timeout flag,
|
|
86
|
+
and observed duration in seconds.
|
|
87
|
+
"""
|
|
88
|
+
forbidden = _scan_forbidden(solution)
|
|
89
|
+
if forbidden is not None:
|
|
90
|
+
return RunResult(
|
|
91
|
+
passed=False,
|
|
92
|
+
stdout="",
|
|
93
|
+
stderr=f"forbidden_import: solution references {forbidden!r}",
|
|
94
|
+
timeout=False,
|
|
95
|
+
duration_s=0.0,
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
body = solution + "\n\n# --- tests ---\n" + tests + "\n"
|
|
99
|
+
# delete=False so we control the lifetime; the finally block removes it.
|
|
100
|
+
tmp = tempfile.NamedTemporaryFile(
|
|
101
|
+
mode="w",
|
|
102
|
+
suffix=".py",
|
|
103
|
+
delete=False,
|
|
104
|
+
encoding="utf-8",
|
|
105
|
+
)
|
|
106
|
+
tmp_path = tmp.name
|
|
107
|
+
try:
|
|
108
|
+
tmp.write(body)
|
|
109
|
+
tmp.close()
|
|
110
|
+
start = time.perf_counter()
|
|
111
|
+
try:
|
|
112
|
+
completed = subprocess.run(
|
|
113
|
+
[sys.executable, "-I", tmp_path],
|
|
114
|
+
capture_output=True,
|
|
115
|
+
timeout=timeout_s,
|
|
116
|
+
check=False,
|
|
117
|
+
)
|
|
118
|
+
except subprocess.TimeoutExpired as exc:
|
|
119
|
+
duration = time.perf_counter() - start
|
|
120
|
+
stdout_bytes = exc.stdout or b""
|
|
121
|
+
stderr_bytes = exc.stderr or b""
|
|
122
|
+
return RunResult(
|
|
123
|
+
passed=False,
|
|
124
|
+
stdout=stdout_bytes.decode("utf-8", errors="replace"),
|
|
125
|
+
stderr=stderr_bytes.decode("utf-8", errors="replace"),
|
|
126
|
+
timeout=True,
|
|
127
|
+
duration_s=duration,
|
|
128
|
+
)
|
|
129
|
+
duration = time.perf_counter() - start
|
|
130
|
+
stdout = completed.stdout.decode("utf-8", errors="replace")
|
|
131
|
+
stderr = completed.stderr.decode("utf-8", errors="replace")
|
|
132
|
+
return RunResult(
|
|
133
|
+
passed=completed.returncode == 0,
|
|
134
|
+
stdout=stdout,
|
|
135
|
+
stderr=stderr,
|
|
136
|
+
timeout=False,
|
|
137
|
+
duration_s=duration,
|
|
138
|
+
)
|
|
139
|
+
finally:
|
|
140
|
+
# Always remove the temp file — even on TimeoutExpired / KeyboardInterrupt.
|
|
141
|
+
try:
|
|
142
|
+
os.unlink(tmp_path)
|
|
143
|
+
except OSError:
|
|
144
|
+
pass
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""Pydantic schemas for code-generation benchmark specs + run context."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from enum import StrEnum
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Annotated, Literal
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class EngineKind(StrEnum):
|
|
13
|
+
"""Engines this plugin can drive.
|
|
14
|
+
|
|
15
|
+
Code-generation scoring is dominated by per-prompt API calls (one model
|
|
16
|
+
invocation per fixture row, then local execution of the response). We
|
|
17
|
+
surface the same four engine kinds the rest of the suite uses so the
|
|
18
|
+
plugin slots into existing cross-vendor comparisons unchanged.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
VLLM = "vllm"
|
|
22
|
+
SGLANG = "sglang"
|
|
23
|
+
OPENAI = "openai"
|
|
24
|
+
ANTHROPIC = "anthropic"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class DatasetConfig(BaseModel):
    """Dataset under evaluation.

    For the code-generation plugin the dataset is a small bundled JSONL
    fixture; each line is one HumanEval-style task with ``task_id``,
    ``prompt``, ``tests``, ``canonical_solution`` and ``entry_point`` keys.
    """

    # Unknown keys are a validation error rather than being silently dropped.
    model_config = ConfigDict(extra="forbid")
    # Non-empty dataset identifier.
    id: Annotated[str, Field(min_length=1)]
    # Non-empty fixture path; only the length is validated here.
    # NOTE(review): nothing in this model rejects absolute paths or ``..``
    # segments — confirm the loader confines the path to datasets/.
    path: Annotated[
        str,
        Field(
            min_length=1,
            description=("Path to the fixture JSONL relative to the plugin's datasets/ directory."),
        ),
    ]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class WarmupConfig(BaseModel):
    """Warmup parameters.

    Code-generation runs are per-task and order-independent so the default
    is zero discarded runs. Knob retained for future revisions (warm-up of
    a JIT-compiled model or sandbox cold-start, etc.).
    """

    # Unknown keys are a validation error rather than being silently dropped.
    model_config = ConfigDict(extra="forbid")
    # Number of initial runs to discard; must be >= 0 and defaults to 0.
    discard_runs: Annotated[int, Field(ge=0)] = 0
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class BenchmarkSpec(BaseModel):
    """One code-generation benchmark — fixture + scoring strategy + metadata."""

    # Unknown keys are a validation error rather than being silently dropped.
    model_config = ConfigDict(extra="forbid")
    # Non-empty benchmark identifier.
    benchmark_id: Annotated[str, Field(min_length=1)]
    # Three dot-separated digit groups, optionally followed by "-" and a
    # suffix made of word characters and dots (e.g. "1.0.0" or "1.0.0-rc.1").
    suite_version: Annotated[str, Field(pattern=r"^\d+\.\d+\.\d+(-[\w.]+)?$")]
    # Free-form human-readable summary; optional.
    description: str = ""
    # Fixed discriminators: this model only accepts code / generation specs.
    modality: Literal["code"] = "code"
    kind: Literal["generation"] = "generation"
    # Fixture to evaluate against (required).
    dataset: DatasetConfig
    # Name of the SLO template to apply.
    slo_template: str = "code.generation.standard"
    # Warmup settings; a fresh default WarmupConfig is built per instance.
    warmup: WarmupConfig = Field(default_factory=WarmupConfig)
    # Only Python tasks are accepted.
    language: Literal["python"] = "python"
    # Scoring strategy name.
    scoring: Literal["pass_at_1", "pass_at_k"] = "pass_at_1"
    # The k for pass@k; must be >= 1.
    # NOTE(review): no validator ties ``k`` to ``scoring`` — confirm whether
    # ``scoring="pass_at_1"`` with ``k > 1`` is meant to be accepted.
    k: Annotated[int, Field(ge=1)] = 1
    # Per-task wall-clock budget in seconds; must be strictly positive.
    timeout_s: Annotated[float, Field(gt=0.0)] = 5.0
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class RunContext(BaseModel):
    """Per-invocation context (where to send requests, where to write results).

    Mirrors the llm-quality plugin so cross-plugin tooling can reuse the
    same context object shape.
    """

    # Unknown keys are a validation error.
    # NOTE(review): ``arbitrary_types_allowed`` looks unnecessary for the
    # fields declared here — presumably kept to mirror the llm-quality
    # plugin; confirm before removing.
    model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)
    # Non-empty model identifier.
    model_id: Annotated[str, Field(min_length=1)]
    # 7 to 40 characters — presumably a short-to-full commit hash; confirm.
    # The default placeholder is chosen to satisfy the length bounds.
    model_revision: Annotated[str, Field(min_length=7, max_length=40)] = "unknown00"
    # Which engine to drive (required).
    engine_kind: EngineKind
    # Optional free-form descriptors; empty string means "not provided".
    engine_version: str = ""
    base_url: str = ""
    # NOTE(review): stored as a plain ``str``, so it will appear in reprs and
    # dumps of this model — take care when logging a RunContext.
    api_key: str = ""
    quantization_format: str = ""
    hardware_class: str = ""
    # Directory results are written to (required).
    output_dir: Path
    # Extra scalar key/value pairs; a fresh dict is built per instance.
    extra: dict[str, str | int | float | bool] = Field(default_factory=dict)
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Pure scoring helpers for the code-generation plugin.
|
|
2
|
+
|
|
3
|
+
Two helpers:
|
|
4
|
+
|
|
5
|
+
* :func:`extract_python_code` extracts the first fenced ``python`` block
|
|
6
|
+
from a model response. The HumanEval prompting convention is to ask the
|
|
7
|
+
model to return code in a markdown fence; we strip the fence and return
|
|
8
|
+
the inner code. When no fence is present we treat the whole response
|
|
9
|
+
as code (some smaller models skip the fences when given a function
|
|
10
|
+
signature stub).
|
|
11
|
+
|
|
12
|
+
* :func:`compute_pass_at_k` returns the unbiased pass@k estimator from
|
|
13
|
+
the HumanEval paper (Chen et al. 2021): for ``n`` samples with ``c``
|
|
14
|
+
passing, ``pass@k = 1 - C(n-c, k) / C(n, k)``. For Phase 1 we only
|
|
15
|
+
run ``k=1`` (so this collapses to the mean) but the helper ships so
|
|
16
|
+
future revisions can compute pass@10/100 from richer sampling runs.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import math
|
|
22
|
+
import re
|
|
23
|
+
|
|
24
|
+
# Match a fenced Python block. Accepted language tags are ``python``,
# ``python3``, ``py`` and ``py3`` (case-insensitive). We extract only the
# inner body and intentionally do not require a closing fence so partial
# streamed responses still parse.
_PY_FENCE = re.compile(
    r"```(?:python3?|py3?)\s*\n(.*?)(?:```|\Z)",
    re.IGNORECASE | re.DOTALL,
)
# Bare ``` fence (no language tag) — accepted as a fallback when the
# model omits the language hint.
_BARE_FENCE = re.compile(r"```\s*\n(.*?)(?:```|\Z)", re.DOTALL)


def extract_python_code(text: str) -> str:
    """Extract the first fenced Python block from ``text``.

    Returns the inner code with leading/trailing whitespace stripped.
    Fences tagged ``python``, ``python3``, ``py`` or ``py3`` are tried
    first, then a bare triple-fence, and finally the whole response (also
    stripped) when no fence is present.

    Multi-fence responses return the **first** block — HumanEval-style
    prompts ask for one solution; later fences usually contain test
    repetition or example I/O.
    """
    # Order matters: a tagged fence wins over a bare one even when the bare
    # fence appears earlier in the response.
    for pattern in (_PY_FENCE, _BARE_FENCE):
        match = pattern.search(text)
        if match is not None:
            return match.group(1).strip()
    return text.strip()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def compute_pass_at_k(results: list[bool], k: int) -> float:
    """Compute the unbiased pass@k estimator from the HumanEval paper.

    Arguments:
        results: list of per-sample pass/fail booleans (one model attempt each).
        k: how many top samples per task we would have picked.

    With ``n = len(results)`` samples of which ``c`` passed, the estimate is
    ``1 - C(n - c, k) / C(n, k)``. An empty ``results`` or ``k < 1`` yields
    0.0. A ``k`` larger than ``n`` is clipped to ``n`` because the estimator
    is only defined for ``k <= n``. If fewer than ``k`` samples failed, every
    k-subset contains a passing sample and the result is 1.0.
    """
    total = len(results)
    if total == 0 or k < 1:
        return 0.0

    draws = min(k, total)
    failures = total - sum(bool(outcome) for outcome in results)
    if failures < draws:
        return 1.0

    # math.comb works on exact ints; true division produces the float ratio.
    all_fail_subsets = math.comb(failures, draws)
    all_subsets = math.comb(total, draws)
    return 1.0 - all_fail_subsets / all_subsets
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: inferencebench-code
|
|
3
|
+
Version: 0.0.2
|
|
4
|
+
Summary: Code-generation plugin for InferenceBench Suite (HumanEval-style execution-based scoring).
|
|
5
|
+
Project-URL: Homepage, https://github.com/yobitelcomm/bench
|
|
6
|
+
Author-email: Yobitel Communications <bench@yobitel.com>
|
|
7
|
+
License: Apache-2.0
|
|
8
|
+
Keywords: ai,benchmark,code-generation,humaneval,llm,ml
|
|
9
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Requires-Python: >=3.12
|
|
17
|
+
Requires-Dist: inferencebench-envelope
|
|
18
|
+
Requires-Dist: inferencebench-harness
|
|
19
|
+
Requires-Dist: pydantic~=2.9
|
|
20
|
+
Requires-Dist: pyyaml~=6.0
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
# inferencebench-code
|
|
24
|
+
|
|
25
|
+
Code-generation plugin for the InferenceBench Suite.
|
|
26
|
+
|
|
27
|
+
HumanEval-style execution-based benchmarks: the plugin sends a function-signature
|
|
28
|
+
prompt to the model, extracts the Python code from its response, executes it
|
|
29
|
+
against bundled unit tests in a subprocess, and reports `pass_at_1`.
|
|
30
|
+
|
|
31
|
+
Suite ID: `code.generation`
|
|
32
|
+
|
|
33
|
+
Bundled benchmarks:
|
|
34
|
+
|
|
35
|
+
- `code.generation.humaneval-mini` — 5 stdlib-only Python tasks, `pass_at_1`
|
|
36
|
+
scoring with a 5-second per-task wall-clock timeout.
|
|
37
|
+
|
|
38
|
+
## SAFETY WARNING — read before running
|
|
39
|
+
|
|
40
|
+
**This plugin executes model-generated code.** Every run prints a yellow banner
|
|
41
|
+
reminding you of that. The execution layer is *best-effort* defence-in-depth,
|
|
42
|
+
not a real sandbox:
|
|
43
|
+
|
|
44
|
+
- Each task's solution + tests are written to a temp file and invoked with
|
|
45
|
+
`python -I` (isolated mode) under a `subprocess.run(timeout=...)` wall clock.
|
|
46
|
+
- A cheap substring pre-scan refuses any solution whose text contains `subprocess`,
|
|
47
|
+
`os.system`, `socket`, `urllib`, `multiprocessing`, or `ctypes`.
|
|
48
|
+
- The bundled fixtures are stdlib-only, no I/O, no network.
|
|
49
|
+
|
|
50
|
+
This is **deliberately not airtight**. Phase 2 adds real isolation (firejail /
|
|
51
|
+
nsjail / container-per-task). Until then: only run code-generation benchmarks
|
|
52
|
+
against models you trust, on machines you can afford to throw away, and never
|
|
53
|
+
with the bundled fixtures swapped out for untrusted input.
|
|
54
|
+
|
|
55
|
+
## Metrics
|
|
56
|
+
|
|
57
|
+
The envelope's `metrics` block includes:
|
|
58
|
+
|
|
59
|
+
| Metric | Direction | Meaning |
|
|
60
|
+
| ------------------ | --------------- | ----------------------------------------- |
|
|
61
|
+
| `pass_at_1` | higher is better | mean of per-task passed booleans |
|
|
62
|
+
| `pass_at_1_p05/50/95` | higher is better | bootstrap quantiles of per-sample scores |
|
|
63
|
+
| `timeout_rate` | lower is better | fraction of tasks that hit the wall clock |
|
|
64
|
+
| `ttft_p50_ms` | - | model time-to-first-token, median |
|
|
65
|
+
| `total_p50_ms` | - | model total request time, median |
|
|
66
|
+
| `tokens_out_total` | - | total generated tokens across the run |
|
|
67
|
+
| `ok_rate` | - | fraction of model calls that succeeded |
|
|
68
|
+
| `n_samples` | - | fixture row count |
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
inferencebench_code/__init__.py,sha256=T29BdO0kbnz2t1ejan52jWySJme5kWFUIDxxTbi7-Wo,326
|
|
2
|
+
inferencebench_code/plugin.py,sha256=UGJm52c4EfUawk_lHdlbwtj8aOB_LNQQm6i5FBdB0OY,21096
|
|
3
|
+
inferencebench_code/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
inferencebench_code/runner.py,sha256=i2VxybIFNiIJaME1QCUz-EEVLk3GSAmKndd03nIL4ww,4759
|
|
5
|
+
inferencebench_code/schemas.py,sha256=PQIAlenv2bnw1zB-1rI2HX7U0U0ljJGVIAF1q5EMvbk,3099
|
|
6
|
+
inferencebench_code/scoring.py,sha256=uhfJ_zX3-sE6oFv-dAzrOLj8OhG6FwJK5SLRuP5Eyec,2968
|
|
7
|
+
inferencebench_code/benchmarks/humaneval-mini.yaml,sha256=PW1_C6cK0d0FMr52gVkR_4__2YwkxHefoocygADQo1o,373
|
|
8
|
+
inferencebench_code/benchmarks/mbpp-mini.yaml,sha256=DuJx0YMNpj6C0kBP9aRh4DPoUcRXsbNXDT_Fl3pcQG4,388
|
|
9
|
+
inferencebench_code/datasets/humaneval-mini.jsonl,sha256=x48LbK_KWBhxzXgpy_v0UdpJlqMC3d0gxXKatOKbnoo,2455
|
|
10
|
+
inferencebench_code/datasets/mbpp-mini.jsonl,sha256=5iGqpXO3K_Lkj_55gqjmrZAe6-liofdstz5weYZjTic,1944
|
|
11
|
+
inferencebench_code-0.0.2.dist-info/METADATA,sha256=-8HBN4l892swGJR9JReVn_D7netgrWkpSCb7Eb-qgXE,3207
|
|
12
|
+
inferencebench_code-0.0.2.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
13
|
+
inferencebench_code-0.0.2.dist-info/entry_points.txt,sha256=I7YPRKPTPjYYbctMhLt9NarFlxhsmQfS5lIkCZLW5_A,91
|
|
14
|
+
inferencebench_code-0.0.2.dist-info/RECORD,,
|