inferencebench-code 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ """InferenceBench code-generation plugin."""
2
+
3
+ from inferencebench_code.plugin import EXPECTED_METRICS, CodeGenerationPlugin
4
+ from inferencebench_code.schemas import BenchmarkSpec, EngineKind, RunContext
5
+
6
+ __all__ = [
7
+ "EXPECTED_METRICS",
8
+ "BenchmarkSpec",
9
+ "CodeGenerationPlugin",
10
+ "EngineKind",
11
+ "RunContext",
12
+ ]
@@ -0,0 +1,15 @@
1
+ benchmark_id: code.generation.humaneval-mini
2
+ suite_version: 1.0.0
3
+ description: Five stdlib-only Python tasks, pass@1 with a 5-second wall-clock timeout.
4
+ modality: code
5
+ kind: generation
6
+ dataset:
7
+ id: builtin-humaneval-mini
8
+ path: humaneval-mini.jsonl
9
+ slo_template: code.generation.standard
10
+ warmup:
11
+ discard_runs: 0
12
+ language: python
13
+ scoring: pass_at_1
14
+ k: 1
15
+ timeout_s: 5.0
@@ -0,0 +1,15 @@
1
+ benchmark_id: code.generation.mbpp-mini
2
+ suite_version: 1.0.0
3
+ description: Mostly Basic Python Problems (MBPP)-style stdlib-only tasks, pass@1 with a 5-second wall-clock timeout.
4
+ modality: code
5
+ kind: generation
6
+ dataset:
7
+ id: builtin-mbpp-mini
8
+ path: mbpp-mini.jsonl
9
+ slo_template: code.generation.standard
10
+ warmup:
11
+ discard_runs: 0
12
+ language: python
13
+ scoring: pass_at_1
14
+ k: 1
15
+ timeout_s: 5.0
@@ -0,0 +1,5 @@
1
+ {"task_id": "add_two_numbers", "entry_point": "add", "prompt": "def add(a, b):\n \"\"\"Return the sum of a and b.\"\"\"\n", "canonical_solution": "def add(a, b):\n return a + b\n", "tests": "assert add(1, 2) == 3\nassert add(0, 0) == 0\nassert add(-1, 1) == 0\nassert add(-5, -7) == -12\nassert add(100, 250) == 350\nassert add(2.5, 0.5) == 3.0\nassert add(-100, 100) == 0\n"}
2
+ {"task_id": "reverse_string", "entry_point": "reverse_string", "prompt": "def reverse_string(s):\n \"\"\"Return the reverse of the string s.\"\"\"\n", "canonical_solution": "def reverse_string(s):\n return s[::-1]\n", "tests": "assert reverse_string('') == ''\nassert reverse_string('a') == 'a'\nassert reverse_string('hello') == 'olleh'\nassert reverse_string('abcde') == 'edcba'\nassert reverse_string(' spaces ') == ' secaps '\nassert reverse_string('racecar') == 'racecar'\nassert reverse_string('AbC') == 'CbA'\n"}
3
+ {"task_id": "fibonacci_iter", "entry_point": "fib", "prompt": "def fib(n):\n \"\"\"Return the n-th Fibonacci number (0-indexed: fib(0)=0, fib(1)=1).\"\"\"\n", "canonical_solution": "def fib(n):\n a, b = 0, 1\n for _ in range(n):\n a, b = b, a + b\n return a\n", "tests": "assert fib(0) == 0\nassert fib(1) == 1\nassert fib(2) == 1\nassert fib(3) == 2\nassert fib(5) == 5\nassert fib(10) == 55\nassert fib(15) == 610\n"}
4
+ {"task_id": "count_vowels", "entry_point": "count_vowels", "prompt": "def count_vowels(s):\n \"\"\"Count the number of vowels (a, e, i, o, u) in the lowercase string s.\"\"\"\n", "canonical_solution": "def count_vowels(s):\n return sum(1 for c in s if c in 'aeiou')\n", "tests": "assert count_vowels('') == 0\nassert count_vowels('bcdfg') == 0\nassert count_vowels('aeiou') == 5\nassert count_vowels('hello') == 2\nassert count_vowels('python') == 1\nassert count_vowels('queue') == 4\nassert count_vowels('rhythm') == 0\n"}
5
+ {"task_id": "is_palindrome", "entry_point": "is_palindrome", "prompt": "def is_palindrome(s):\n \"\"\"Return True if s is a palindrome (case-insensitive). Empty string is a palindrome.\"\"\"\n", "canonical_solution": "def is_palindrome(s):\n t = s.lower()\n return t == t[::-1]\n", "tests": "assert is_palindrome('') is True\nassert is_palindrome('a') is True\nassert is_palindrome('racecar') is True\nassert is_palindrome('Racecar') is True\nassert is_palindrome('hello') is False\nassert is_palindrome('Level') is True\nassert is_palindrome('abba') is True\n"}
@@ -0,0 +1,5 @@
1
+ {"task_id": "mbpp-001", "prompt": "Write a function sum_list(items) that returns the sum of a list of integers.", "tests": "assert sum_list([1,2,3]) == 6\nassert sum_list([]) == 0\nassert sum_list([-1, -2, 3]) == 0\nassert sum_list([10, 20, 30, 40]) == 100\n", "canonical_solution": "def sum_list(items):\n return sum(items)\n", "entry_point": "sum_list"}
2
+ {"task_id": "mbpp-002", "prompt": "Write a function max_of_three(a, b, c) that returns the largest of three numbers.", "tests": "assert max_of_three(1, 2, 3) == 3\nassert max_of_three(5, 2, 4) == 5\nassert max_of_three(-1, -2, -3) == -1\nassert max_of_three(7, 7, 7) == 7\n", "canonical_solution": "def max_of_three(a, b, c):\n return max(a, b, c)\n", "entry_point": "max_of_three"}
3
+ {"task_id": "mbpp-003", "prompt": "Write a function count_evens(nums) that returns the count of even integers in the list nums.", "tests": "assert count_evens([1, 2, 3, 4]) == 2\nassert count_evens([]) == 0\nassert count_evens([2, 4, 6, 8]) == 4\nassert count_evens([1, 3, 5]) == 0\n", "canonical_solution": "def count_evens(nums):\n return sum(1 for n in nums if n % 2 == 0)\n", "entry_point": "count_evens"}
4
+ {"task_id": "mbpp-004", "prompt": "Write a function gcd(a, b) that returns the greatest common divisor of two non-negative integers.", "tests": "assert gcd(12, 18) == 6\nassert gcd(7, 5) == 1\nassert gcd(100, 75) == 25\nassert gcd(0, 9) == 9\n", "canonical_solution": "def gcd(a, b):\n while b:\n a, b = b, a % b\n return a\n", "entry_point": "gcd"}
5
+ {"task_id": "mbpp-005", "prompt": "Write a function unique_sorted(items) that returns a sorted list of the unique values in items.", "tests": "assert unique_sorted([3, 1, 2, 3, 1]) == [1, 2, 3]\nassert unique_sorted([]) == []\nassert unique_sorted([5]) == [5]\nassert unique_sorted([2, 2, 2, 2]) == [2]\n", "canonical_solution": "def unique_sorted(items):\n return sorted(set(items))\n", "entry_point": "unique_sorted"}
@@ -0,0 +1,532 @@
1
+ """CodeGenerationPlugin — entry point for ``code.generation`` benchmarks.
2
+
3
+ HumanEval-style execution-based scoring: for each fixture row we send the
4
+ function-signature prompt to the model, extract Python code from the
5
+ response, execute it against the bundled unit tests in an isolated
6
+ subprocess, and aggregate per-task pass/fail into a ``pass_at_1`` headline.
7
+
8
+ Structural twin of :class:`inferencebench_quality.plugin.LLMQualityPlugin`:
9
+ plugin contract, ModelClient wiring, signing flow, and envelope shape all
10
+ mirror that module so cross-plugin tooling (summary / compare / diff /
11
+ audit) treats code envelopes the same as quality envelopes.
12
+
13
+ **Safety:** :func:`run` prints a yellow banner on every invocation as a
14
+ reminder that model output is executed locally. See the package README for
15
+ the full safety boundary; ``runner.py`` is best-effort defence-in-depth, not
16
+ a sandbox.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import hashlib
22
+ import json
23
+ import math
24
+ import os
25
+ import sys
26
+ import time
27
+ from pathlib import Path
28
+
29
+ import yaml
30
+
31
+ from inferencebench.envelope import (
32
+ DatasetSpec as EnvDatasetSpec,
33
+ )
34
+ from inferencebench.envelope import (
35
+ EngineConfig,
36
+ Envelope,
37
+ EnvelopeBuilder,
38
+ ModelConfig,
39
+ Quantization,
40
+ SigningMode,
41
+ sign_envelope,
42
+ )
43
+ from inferencebench.harness import (
44
+ CompletionResult,
45
+ ModelClient,
46
+ Sample,
47
+ collect_hardware_fingerprint,
48
+ collect_software_provenance,
49
+ )
50
+ from inferencebench.harness.metrics import EnergyReport, Percentiles, TelemetryWindow
51
+ from inferencebench_code.runner import RunResult, run_unit_tests
52
+ from inferencebench_code.schemas import BenchmarkSpec, EngineKind, RunContext
53
+ from inferencebench_code.scoring import extract_python_code
54
+
55
+ _SAFETY_BANNER = (
56
+ "\033[33m" # yellow
57
+ "WARNING: code.generation executes model-generated Python locally. "
58
+ "The runner is best-effort (python -I subprocess with timeout + forbidden-import "
59
+ "pre-scan) — NOT a sandbox. Use only with trusted models and bundled fixtures."
60
+ "\033[0m"
61
+ )
62
+
63
+ # Engines that require ``base_url`` (self-hosted OpenAI-compatible servers).
64
+ _SELF_HOSTED_ENGINES = frozenset({EngineKind.VLLM, EngineKind.SGLANG})
65
+
66
+
67
def _fixtures_cache_root() -> Path:
    """Return the directory that backs ``fixtures://`` dataset URIs.

    A non-empty ``BENCH_FIXTURES_ROOT`` environment variable takes
    precedence; otherwise the root is ``.cache/inferencebench/fixtures``
    under the current user's home directory.
    """
    configured = os.environ.get("BENCH_FIXTURES_ROOT")
    if not configured:
        # The home directory is only consulted when no override is set.
        return Path.home().joinpath(".cache", "inferencebench", "fixtures")
    return Path(configured)
73
+
74
+
75
def _json_num(v: float) -> str:
    """Render ``v`` as a JSON number token, or ``null`` when it is not finite.

    Only ``float`` instances are checked for NaN / infinity; any other value
    is handed straight to :func:`repr`.
    """
    non_finite = isinstance(v, float) and not math.isfinite(v)
    return "null" if non_finite else repr(v)
80
+
81
+
82
def _compute_fixture_hash(items: list[dict[str, str]]) -> str:
    """Return the hex SHA-256 digest of the fixture rows.

    Rows are serialised as compact JSON with sorted keys, so the digest does
    not depend on the key order inside each row; it does depend on row order.
    """
    encoded = json.dumps(items, separators=(",", ":"), sort_keys=True).encode("utf-8")
    digest = hashlib.sha256()
    digest.update(encoded)
    return digest.hexdigest()
86
+
87
+
88
def _build_client(context: RunContext, *, timeout_s: float = 60.0) -> ModelClient:
    """Create the :class:`ModelClient` described by ``context``.

    Self-hosted engines (members of ``_SELF_HOSTED_ENGINES``) always end up
    with exactly one ``openai/`` prefix on the model id and fall back to the
    literal API key ``"EMPTY"`` when none is configured. Every other engine
    keeps its model id untouched and passes ``None`` for a missing key. An
    empty ``base_url`` is forwarded as ``None``.
    """
    api_key: str | None
    if context.engine_kind in _SELF_HOSTED_ENGINES:
        # Drop an existing prefix first so it is never applied twice.
        bare_id = context.model_id.removeprefix("openai/")
        routed_model = "openai/" + bare_id
        api_key = context.api_key or "EMPTY"
    else:
        routed_model = context.model_id
        api_key = context.api_key or None

    endpoint = context.base_url or None
    return ModelClient(
        model=routed_model,
        api_key=api_key,
        base_url=endpoint,
        timeout_s=timeout_s,
    )
111
+
112
+
113
def _ensure_signature_present(prompt: str, generated: str) -> str:
    """Prepend ``prompt`` (signature + docstring) to ``generated`` if missing.

    Some models reply with the function body only; the signature is glued
    back on so the subprocess can call the named entry point. "Missing" means
    the first ``def ...`` line of ``prompt`` (whitespace-stripped) does not
    occur verbatim in ``generated``.

    Body-only replies usually reach this function with their first line's
    indentation stripped (the extractor strips the response), or fully
    dedented. Gluing such text under the signature produces a SyntaxError
    and a spurious test failure. So:

    1. If ``prompt`` has no ``def`` line, or ``generated`` already contains
       it, ``generated`` is returned unchanged.
    2. Otherwise the plain concatenation ``prompt.rstrip() + "\\n" +
       generated`` is returned whenever it compiles.
    3. Only when that concatenation does not compile is ``generated``
       re-indented as a function body (first-line indent restored, common
       indent removed, the prompt's body indent applied). The repaired text
       is returned if it compiles; if not, the plain concatenation is
       returned as a last resort.

    ``compile`` only parses the text here; nothing is executed in-process.
    """
    import textwrap
    import warnings

    def _parses(source: str) -> bool:
        # Syntax check only. Warnings are silenced so model output cannot
        # spam the benchmark's stderr with SyntaxWarnings.
        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                compile(source, "<candidate-solution>", "exec")
        except (SyntaxError, ValueError, OverflowError, RecursionError, MemoryError):
            return False
        return True

    head = ""
    body_indent = ""
    prompt_lines = prompt.splitlines()
    for pos, line in enumerate(prompt_lines):
        stripped = line.strip()
        if stripped.startswith("def "):
            head = stripped
            # The indent of the first non-blank line after the signature
            # (normally the docstring) is the body indent to match.
            for follower in prompt_lines[pos + 1 :]:
                if follower.strip():
                    body_indent = follower[: len(follower) - len(follower.lstrip())]
                    break
            break
    if not head or head in generated:
        return generated

    glued = prompt.rstrip() + "\n" + generated
    if _parses(glued):
        return glued

    body_indent = body_indent or "    "
    lines = generated.splitlines()
    tail = [ln for ln in lines[1:] if ln.strip()]
    if lines and tail and lines[0] == lines[0].lstrip():
        # Restore the indent the first line most likely lost to strip():
        # borrow it from the shallowest later line, capped at one body level
        # so a block opener such as ``for ...:`` stays above its own suite.
        shallowest = min(tail, key=lambda ln: len(ln) - len(ln.lstrip()))
        lead = shallowest[: len(shallowest) - len(shallowest.lstrip())]
        lines[0] = lead[: len(body_indent)] + lines[0]
    reindented = textwrap.indent(textwrap.dedent("\n".join(lines)), body_indent)
    repaired = prompt.rstrip() + "\n" + reindented
    return repaired if _parses(repaired) else glued
130
+
131
+
132
+ # Metrics this plugin is expected to emit. Consumed by ``bench coverage``.
133
+ EXPECTED_METRICS: tuple[str, ...] = (
134
+ "pass_at_1",
135
+ "pass_at_1_p05",
136
+ "pass_at_1_p50",
137
+ "pass_at_1_p95",
138
+ "timeout_rate",
139
+ "ok_rate",
140
+ "n_samples",
141
+ "ttft_p50_ms",
142
+ "total_p50_ms",
143
+ )
144
+
145
+
146
+ class CodeGenerationPlugin:
147
+ """Plugin entry point. Registered via ``inferencebench.plugins`` entrypoint group."""
148
+
149
+ suite_id = "code.generation"
150
+ version = "0.0.2"
151
+ description = (
152
+ "Code-generation benchmarks (HumanEval-style execution-based scoring; "
153
+ "executes model output locally — see README for the safety boundary)."
154
+ )
155
+
156
+ # ----------------------------------------------------------- benchmarks #
157
def list_benchmarks(self) -> list[BenchmarkSpec]:
    """Load every ``*.yaml`` spec in the benchmarks directory.

    Files are visited in sorted path order. A missing benchmarks directory
    yields an empty list.
    """
    root = self._benchmarks_dir()
    if root.exists():
        return [self._load_yaml(spec_file) for spec_file in sorted(root.glob("*.yaml"))]
    return []
165
+
166
def get_benchmark(self, benchmark_id: str) -> BenchmarkSpec:
    """Return the first listed spec whose ``benchmark_id`` matches.

    Raises:
        KeyError: when no spec returned by :meth:`list_benchmarks` matches.
    """
    candidates = (spec for spec in self.list_benchmarks() if spec.benchmark_id == benchmark_id)
    try:
        return next(candidates)
    except StopIteration:
        pass
    msg = f"benchmark_id not found: {benchmark_id}"
    raise KeyError(msg)
172
+
173
+ # ------------------------------------------------------------- validate #
174
def validate(self, spec: BenchmarkSpec, context: RunContext) -> list[str]:
    """Collect human-readable problems with ``spec`` / ``context``.

    Nothing is raised for the conditions checked here; an empty list means
    no problem was detected. Checks, in order: empty model id, a self-hosted
    engine without ``base_url``, and a fixture file that does not exist.
    """
    problems: list[str] = []

    if not context.model_id:
        problems.append("model_id is empty")

    self_hosted = context.engine_kind in _SELF_HOSTED_ENGINES
    if self_hosted and not context.base_url:
        problems.append(
            f"{context.engine_kind.value} needs base_url (e.g. http://localhost:8000/v1)"
        )

    fixture = self._dataset_path(spec)
    if not fixture.exists():
        problems.append(f"fixture not found: {spec.dataset.path}")

    return problems
185
+
186
+ # ------------------------------------------------------------------ run #
187
def run(self, spec: BenchmarkSpec, context: RunContext) -> Envelope:
    """Execute the benchmark and return a SIGNED envelope.

    Prints the safety banner to stderr on every call.

    Steps: validate the signing configuration, build the model client, load
    and hash the fixture, score every row, write the diagnostic sample dump,
    build the envelope, and sign it.

    Signing is selected by ``context.extra["signing_mode"]`` (default
    ``"dev"``). ``"dev"`` signs with the key at
    ``context.extra["dev_key_path"]``; any other value selects keyless
    signing.

    Raises:
        ValueError: dev signing was selected without ``dev_key_path``. This
            is raised *before* any model call or code execution, so a
            misconfigured run does not spend a full benchmark pass only to
            fail at the signing step.
    """
    print(_SAFETY_BANNER, file=sys.stderr, flush=True)

    # Fail fast: signing prerequisites are known up front, so check them
    # before doing any expensive (and code-executing) work.
    signing_mode = context.extra.get("signing_mode", "dev")
    dev_key_path = context.extra.get("dev_key_path")
    use_dev_key = signing_mode == "dev"
    if use_dev_key and not dev_key_path:
        msg = "dev signing requires context.extra['dev_key_path']"
        raise ValueError(msg)

    client = _build_client(context)
    items = self._load_fixture(spec)
    fixture_hash = _compute_fixture_hash(items)

    samples, passed_flags, timeout_flags, telemetry = self._score_items(
        client, items, timeout_s=spec.timeout_s
    )

    # Best-effort diagnostic dump — never blocks the run on I/O errors.
    self._dump_samples(context, samples)

    envelope = self._build_envelope(
        spec,
        context,
        samples=samples,
        passed_flags=passed_flags,
        timeout_flags=timeout_flags,
        dataset_hash=fixture_hash,
        energy=telemetry.summarise(samples),
    )

    if use_dev_key:
        return sign_envelope(
            envelope,
            mode=SigningMode.DEV,
            dev_key_path=Path(str(dev_key_path)),
        )
    return sign_envelope(envelope, mode=SigningMode.KEYLESS)
227
+
228
+ # -------------------------------------------------------- core scoring #
229
def _score_items(
    self,
    client: ModelClient,
    items: list[dict[str, str]],
    *,
    timeout_s: float,
) -> tuple[list[Sample], list[bool], list[bool], TelemetryWindow]:
    """Iterate fixture items sequentially, scoring each model response.

    For each fixture row:

    1. Send the prompt to the model (max_tokens=512, streamed).
    2. Extract a Python block from the response.
    3. Glue the function signature back on if the model omitted it.
    4. Execute the result against the row's unit tests with a
       ``timeout_s`` wall clock.

    Records a :class:`Sample` per row and returns parallel lists of per-row
    ``passed`` and ``timeout`` flags for the aggregator, plus the
    :class:`TelemetryWindow` for energy accounting.

    For rows whose model call succeeded, ``Sample.extra`` carries
    ``task_id``, ``passed``, ``duration_s``, ``timeout_flag`` and, when
    non-empty, ``error_summary``. Rows whose model call raised get a
    ``Sample`` with ``ok=False``, NaN latencies, ``finish_reason="error"``
    and no ``extra``; both of their flags are recorded as ``False``.

    Note that ``Sample.ok`` reflects only whether the model call returned;
    whether the unit tests passed is in ``extra["passed"]`` and
    ``passed_flags``.
    """
    samples: list[Sample] = []
    passed_flags: list[bool] = []
    timeout_flags: list[bool] = []

    telemetry = TelemetryWindow()
    with telemetry:
        for idx, item in enumerate(items):
            prompt = item["prompt"]
            tests = item["tests"]
            # perf_counter() is in seconds; samples store milliseconds.
            t_arrival = time.perf_counter() * 1000.0
            try:
                result: CompletionResult = client.complete(
                    prompt, stream=True, max_tokens=512
                )
            except Exception as exc:
                # Any client failure is recorded as a failed row instead of
                # aborting the benchmark; the message is kept on the sample.
                samples.append(
                    Sample(
                        request_idx=idx,
                        arrival_ms=t_arrival,
                        start_ms=t_arrival,
                        ttft_ms=float("nan"),
                        total_ms=float("nan"),
                        tpot_ms=float("nan"),
                        tokens_in=0,
                        tokens_out=0,
                        cost_usd=0.0,
                        finish_reason="error",
                        ok=False,
                        error=str(exc),
                    )
                )
                passed_flags.append(False)
                timeout_flags.append(False)
                continue

            extracted = extract_python_code(result.text)
            solution = _ensure_signature_present(prompt, extracted)
            # Runs the model-generated code in a subprocess (see runner.py).
            run_result: RunResult = run_unit_tests(solution, tests, timeout_s=timeout_s)
            passed_flags.append(run_result.passed)
            timeout_flags.append(run_result.timeout)

            error_summary = self._summarize_error(run_result)
            sample_extra: dict[str, str | int | float | bool] = {
                "task_id": item.get("task_id", ""),
                "passed": run_result.passed,
                "duration_s": run_result.duration_s,
                "timeout_flag": run_result.timeout,
            }
            # error_summary is omitted entirely for passing rows.
            if error_summary:
                sample_extra["error_summary"] = error_summary

            samples.append(
                Sample(
                    request_idx=idx,
                    arrival_ms=t_arrival,
                    start_ms=t_arrival,
                    ttft_ms=result.ttft_ms,
                    total_ms=result.total_ms,
                    tpot_ms=result.tpot_ms,
                    tokens_in=result.tokens_in,
                    tokens_out=result.tokens_out,
                    cost_usd=result.cost_usd,
                    finish_reason=result.finish_reason,
                    ok=True,
                    extra=sample_extra,
                )
            )
    return samples, passed_flags, timeout_flags, telemetry
320
+
321
+ @staticmethod
322
+ def _summarize_error(result: RunResult) -> str:
323
+ """Distil ``RunResult`` into a short ``error_summary`` string.
324
+
325
+ Empty when the run passed. Otherwise: ``"timeout"`` for wall-clock
326
+ kills, ``"forbidden_import: <name>"`` for pre-scan refusals (the
327
+ stderr is already shaped that way by the runner), and the last
328
+ line of stderr (typically the AssertionError or Exception class)
329
+ for normal failures.
330
+ """
331
+ if result.passed:
332
+ return ""
333
+ if result.timeout:
334
+ return "timeout"
335
+ if result.stderr.startswith("forbidden_import"):
336
+ return result.stderr.split("\n", 1)[0]
337
+ last_lines = [line.strip() for line in result.stderr.strip().splitlines() if line.strip()]
338
+ return last_lines[-1] if last_lines else "exit_nonzero"
339
+
340
+ # ------------------------------------------------------------ samples #
341
def _dump_samples(self, context: RunContext, samples: list[Sample]) -> None:
    """Write per-task samples (incl. pass flag + duration) to ``samples-<ts>.jsonl``.

    The file is created in ``context.output_dir`` (created if missing);
    ``<ts>`` is the current Unix time in whole seconds. Each line is one
    compact JSON object with the keys ``request_idx``, ``ok``, ``passed``,
    ``timeout``, ``ttft_ms``, ``total_ms``, ``tokens_out``, optionally
    ``duration_s`` (only when the sample recorded a numeric duration), and
    ``finish_reason``, in that order. Non-finite floats are written as
    ``null``.

    Rows are serialised with :func:`json.dumps`, not string concatenation,
    so a ``finish_reason`` containing quotes, backslashes or newlines still
    yields a valid JSONL line.

    Diagnostics only: I/O and serialisation failures are swallowed so they
    never block the run.
    """

    def _finite_or_none(value: object) -> object:
        # JSON has no NaN / Infinity literals; emit null for them.
        if isinstance(value, float) and not math.isfinite(value):
            return None
        return value

    try:
        out_dir = Path(context.output_dir)
        out_dir.mkdir(parents=True, exist_ok=True)
        ts = int(time.time())
        path = out_dir / f"samples-{ts}.jsonl"
        with path.open("w", encoding="utf-8") as fp:
            for s in samples:
                # Samples from failed model calls carry no ``extra``.
                extra = s.extra or {}
                record: dict[str, object] = {
                    "request_idx": s.request_idx,
                    "ok": bool(s.ok),
                    "passed": bool(extra.get("passed", False)),
                    "timeout": bool(extra.get("timeout_flag", False)),
                    "ttft_ms": _finite_or_none(s.ttft_ms),
                    "total_ms": _finite_or_none(s.total_ms),
                    "tokens_out": s.tokens_out,
                }
                duration = extra.get("duration_s")
                if isinstance(duration, (int, float)):
                    record["duration_s"] = _finite_or_none(float(duration))
                record["finish_reason"] = s.finish_reason or ""
                fp.write(
                    json.dumps(record, separators=(",", ":"), ensure_ascii=False) + "\n"
                )
    except (OSError, TypeError, ValueError):
        pass  # diagnostics-only — never block the run
386
+
387
+ # ---------------------------------------------------------- file paths #
388
def _benchmarks_dir(self) -> Path:
    """Return the ``benchmarks`` directory that sits next to this module file."""
    module_dir = Path(__file__).parent
    return module_dir.joinpath("benchmarks")
390
+
391
def _datasets_dir(self) -> Path:
    """Return the ``datasets`` directory that sits next to this module file."""
    module_dir = Path(__file__).parent
    return module_dir.joinpath("datasets")
393
+
394
def _dataset_path(self, spec: BenchmarkSpec) -> Path:
    """Resolve ``spec.dataset.path`` to a filesystem path.

    ``fixtures://<key>`` maps to ``<fixtures cache root>/<key>.jsonl``; any
    other value is taken relative to the plugin's ``datasets`` directory.
    """
    scheme = "fixtures://"
    location = spec.dataset.path
    if not location.startswith(scheme):
        return self._datasets_dir() / location
    cache_key = location[len(scheme) :]
    return _fixtures_cache_root() / f"{cache_key}.jsonl"
399
+
400
def _load_yaml(self, path: Path) -> BenchmarkSpec:
    """Parse the YAML file at ``path`` and validate it as a ``BenchmarkSpec``.

    The file is read as UTF-8 and parsed with ``yaml.safe_load``. A document
    that parses to a falsy value (e.g. an empty file) is replaced by ``{}``
    before validation, so the outcome is then decided by
    ``BenchmarkSpec.model_validate``.
    """
    raw = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    return BenchmarkSpec.model_validate(raw)
403
+
404
def _load_fixture(self, spec: BenchmarkSpec) -> list[dict[str, str]]:
    """Load the JSONL fixture for ``spec`` into normalised task rows.

    Blank lines are ignored. Rows that are not JSON objects, or that lack
    any of ``task_id`` / ``prompt`` / ``tests``, are skipped. Every kept
    row is reduced to the five string fields ``task_id``, ``prompt``,
    ``tests``, ``canonical_solution`` and ``entry_point`` (the last two
    default to ``""``).

    Raises:
        FileNotFoundError: the resolved fixture path does not exist. For
            ``fixtures://`` URIs the message names the fetch command.
        json.JSONDecodeError: a non-blank line is not valid JSON. The
            message is prefixed with ``<path>:<line number>`` so the
            offending row can be found.
        ValueError: no usable row was found.
    """
    path = self._dataset_path(spec)
    if not path.exists():
        if spec.dataset.path.startswith("fixtures://"):
            key = spec.dataset.path[len("fixtures://") :]
            msg = f"fixture not cached: {path}. Run `bench fixtures fetch {key}` first."
            raise FileNotFoundError(msg)
        msg = f"fixture not found: {path}"
        raise FileNotFoundError(msg)

    required_keys = ("task_id", "prompt", "tests")
    items: list[dict[str, str]] = []
    with path.open("r", encoding="utf-8") as fp:
        for lineno, raw_line in enumerate(fp, start=1):
            line = raw_line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as exc:
                # Same exception type as before, now with file + line context.
                msg = f"{path}:{lineno}: invalid JSON in fixture row: {exc.msg}"
                raise json.JSONDecodeError(msg, exc.doc, exc.pos) from exc
            if not isinstance(obj, dict):
                continue
            if any(key not in obj for key in required_keys):
                continue
            items.append(
                {
                    "task_id": str(obj["task_id"]),
                    "prompt": str(obj["prompt"]),
                    "tests": str(obj["tests"]),
                    "canonical_solution": str(obj.get("canonical_solution", "")),
                    "entry_point": str(obj.get("entry_point", "")),
                }
            )
    if not items:
        msg = f"fixture is empty: {path}"
        raise ValueError(msg)
    return items
437
+
438
+ # ---------------------------------------------------------- envelope #
439
def _build_envelope(
    self,
    spec: BenchmarkSpec,
    context: RunContext,
    *,
    samples: list[Sample],
    passed_flags: list[bool],
    timeout_flags: list[bool],
    dataset_hash: str,
    energy: EnergyReport | None = None,
) -> Envelope:
    """Aggregate per-row results into metrics and build the (unsigned) envelope.

    Args:
        spec: Benchmark spec; supplies the suite id/version, dataset id and
            SLO template.
        context: Run context; supplies model and engine identification and
            the optional quantization format.
        samples: One ``Sample`` per fixture row.
        passed_flags: Per-row pass flag, the basis for ``pass_at_1``.
        timeout_flags: Per-row timeout flag, the basis for ``timeout_rate``.
        dataset_hash: Hash recorded for the dataset in the envelope.
        energy: Optional telemetry summary; when ``None`` no power/energy
            metrics are emitted.

    Returns:
        The envelope produced by ``EnvelopeBuilder.build()``.

    Several metrics are conditional: the ``pass_at_1*`` keys need at least
    one flag, ``timeout_rate`` needs at least one timeout flag, and the
    latency, token, cost and energy keys only appear when there is data for
    them.
    """
    hw = collect_hardware_fingerprint()
    sw = collect_software_provenance()

    metrics: dict[str, float | int | str | None] = {}

    # "ok" means the model call returned, not that the tests passed.
    ok_samples = [s for s in samples if s.ok]
    n_total = len(samples)
    metrics["n_samples"] = float(n_total)
    metrics["n_ok"] = float(len(ok_samples))
    metrics["ok_rate"] = float(len(ok_samples)) / float(n_total) if n_total else 0.0

    if passed_flags:
        # Each row scores 1.0 (passed) or 0.0, so the mean is pass@1.
        scores = [1.0 if p else 0.0 for p in passed_flags]
        mean_pass = sum(scores) / len(scores)
        metrics["pass_at_1"] = mean_pass
        if len(scores) >= 2:
            pcts = Percentiles(scores, percentiles=(5.0, 50.0, 95.0))
            metrics["pass_at_1_p05"] = pcts.p5
            metrics["pass_at_1_p50"] = pcts.p50
            metrics["pass_at_1_p95"] = pcts.p95
        else:
            # With a single score every percentile is reported as that score.
            metrics["pass_at_1_p05"] = mean_pass
            metrics["pass_at_1_p50"] = mean_pass
            metrics["pass_at_1_p95"] = mean_pass

    if timeout_flags:
        metrics["timeout_rate"] = float(sum(timeout_flags)) / float(len(timeout_flags))

    # Latency aggregates — useful for "quality at what cost" comparisons.
    # Non-finite latencies are excluded before taking the median.
    ttft_vals = [s.ttft_ms for s in ok_samples if math.isfinite(s.ttft_ms)]
    total_vals = [s.total_ms for s in ok_samples if math.isfinite(s.total_ms)]
    if ttft_vals:
        metrics["ttft_p50_ms"] = Percentiles(ttft_vals).p50
    if total_vals:
        metrics["total_p50_ms"] = Percentiles(total_vals).p50

    # Energy / power summary from telemetry (None on plugins that haven't
    # threaded a TelemetryWindow through yet). Mirrors llm-inference.
    if energy is not None:
        if energy.gpu_power_avg_w > 0:
            metrics["power_avg_w"] = energy.gpu_power_avg_w
            metrics["power_peak_w"] = energy.gpu_power_peak_w
        if energy.total_energy_joules > 0:
            metrics["energy_joules_total"] = energy.total_energy_joules
        # Self-comparison is False only for NaN, so this skips NaN values.
        if energy.joules_per_token == energy.joules_per_token:  # not NaN
            metrics["joules_per_token"] = energy.joules_per_token

    tokens_out_total = sum(s.tokens_out for s in ok_samples)
    if tokens_out_total:
        metrics["tokens_out_total"] = float(tokens_out_total)

    cost_total = sum(s.cost_usd for s in ok_samples)
    if tokens_out_total and cost_total > 0:
        # Cost is normalised by *output* tokens only.
        metrics["cost_usd_per_million_tokens"] = (cost_total / tokens_out_total) * 1e6
        metrics["cost_source"] = "provider"

    builder = EnvelopeBuilder(
        suite_id=spec.benchmark_id,
        suite_version=spec.suite_version,
        model=ModelConfig(
            id=context.model_id,
            revision=context.model_revision,
            provider=context.engine_kind.value,
            # NOTE(review): 64 zero characters — presumably a placeholder
            # until a real endpoint hash is computed; confirm.
            endpoint_hash="0" * 64,
        ),
        engine=EngineConfig(
            name=context.engine_kind.value,
            version=context.engine_version or "unknown",
            # NOTE(review): same all-zero value as endpoint_hash; confirm.
            config_hash="0" * 64,
        ),
        hardware_fingerprint=hw,
        software_provenance=sw,
        dataset=EnvDatasetSpec(id=spec.dataset.id, hash=dataset_hash),
        seed=0,
        # Quantization is only recorded when the context names a format.
        quantization=(
            Quantization(format=context.quantization_format)
            if context.quantization_format
            else None
        ),
        metrics=metrics,
        slo_template=spec.slo_template,
    )
    return builder.build()
File without changes
@@ -0,0 +1,144 @@
1
+ """Subprocess-based unit-test runner for the code-generation plugin.
2
+
3
+ Executes a model-generated Python solution alongside fixture unit tests in
4
+ an isolated ``python -I`` subprocess with a wall-clock timeout. **This is
5
+ not a real sandbox.** See ``README.md`` for the safety boundary; the
6
+ shortlist of forbidden-import heuristics here is defence-in-depth, not
7
+ defence-in-full.
8
+
9
+ Each :func:`run_unit_tests` invocation writes solution + tests to a
10
+ temporary file, runs it under ``subprocess.run`` with the supplied
11
+ ``timeout_s``, captures stdout/stderr, and returns a :class:`RunResult`.
12
+ The temp file is always deleted via ``try/finally`` even on timeout or
13
+ unhandled errors.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import os
19
+ import subprocess
20
+ import sys
21
+ import tempfile
22
+ import time
23
+ from dataclasses import dataclass
24
+
25
+ # Substrings that immediately disqualify a solution. Cheap pre-scan, not
26
+ # a parser — there are dozens of ways around it (eval, __import__, etc).
27
+ # That's fine; this is one of several layers, and the bundled fixtures
28
+ # never exercise stdlib edges.
29
+ _FORBIDDEN_IMPORTS: tuple[str, ...] = (
30
+ "subprocess",
31
+ "os.system",
32
+ "socket",
33
+ "urllib",
34
+ "multiprocessing",
35
+ "ctypes",
36
+ )
37
+
38
+
39
@dataclass(frozen=True, slots=True)
class RunResult:
    """Outcome of one subprocess execution.

    ``passed`` is True only when the subprocess exited 0 within the wall
    clock. ``timeout`` is True when ``subprocess.TimeoutExpired`` fired; in
    that case ``duration_s`` is the elapsed time measured up to the point
    the timeout was reported (so roughly the timeout budget), and the
    streams hold whatever output was captured, possibly nothing.
    ``stdout`` and ``stderr`` are UTF-8 decoded with undecodable bytes
    replaced.

    A solution rejected by the forbidden-import pre-scan is never executed:
    ``run_unit_tests`` reports it with ``passed=False``, ``duration_s=0.0``
    and a ``stderr`` starting with ``forbidden_import:``.
    """

    # True only for exit code 0 without timeout.
    passed: bool
    # Captured standard output of the child process.
    stdout: str
    # Captured standard error, or the synthetic ``forbidden_import: ...`` line.
    stderr: str
    # True when the wall-clock budget was exceeded.
    timeout: bool
    # Measured wall time in seconds (0.0 when the solution was never run).
    duration_s: float
54
+
55
+
56
def _scan_forbidden(solution: str) -> str | None:
    """Return the first entry of ``_FORBIDDEN_IMPORTS`` that occurs in ``solution``.

    The check is a plain case-sensitive substring test, evaluated in the
    order the tokens are listed; ``None`` means no token matched.
    """
    hits = (token for token in _FORBIDDEN_IMPORTS if token in solution)
    return next(hits, None)
66
+
67
+
68
def run_unit_tests(
    solution: str,
    tests: str,
    *,
    timeout_s: float = 5.0,
) -> RunResult:
    """Execute ``solution + tests`` in an isolated subprocess and report pass/fail.

    The code runs under ``python -I`` in a private temporary directory that
    is removed afterwards. Because the solution is model-generated, the
    child is additionally denied three things it has no need for:

    * the parent's environment — only a short allow-list of non-secret
      variables is passed through, so API keys and tokens exported for the
      benchmark client are not visible to the generated code;
    * the parent's stdin — it is connected to the null device, so code that
      reads input gets EOF and cannot block on the terminal;
    * the caller's working directory — the child's cwd is the private temp
      directory.

    This is still best-effort hardening, **not** a sandbox.

    Arguments:
        solution: Python code defining the function under test.
        tests: Python code that references the function (already in scope
            because solution + tests are concatenated into one file) and
            exercises it with ``assert`` statements.
        timeout_s: Wall-clock budget. The subprocess is killed when it
            elapses and :class:`RunResult` is returned with ``timeout=True``.

    Returns:
        :class:`RunResult` with ``passed``, captured streams, timeout flag,
        and observed duration in seconds. A solution containing a forbidden
        token is not executed at all; it yields ``passed=False`` with a
        ``forbidden_import: ...`` stderr and ``duration_s=0.0``.
    """
    forbidden = _scan_forbidden(solution)
    if forbidden is not None:
        return RunResult(
            passed=False,
            stdout="",
            stderr=f"forbidden_import: solution references {forbidden!r}",
            timeout=False,
            duration_s=0.0,
        )

    body = solution + "\n\n# --- tests ---\n" + tests + "\n"

    # Variables an interpreter may need to start; none of them carry
    # credentials. Everything else in the parent environment is withheld.
    passthrough = (
        "PATH",
        "SYSTEMROOT",
        "TEMP",
        "TMP",
        "TMPDIR",
        "LANG",
        "LC_ALL",
        "LD_LIBRARY_PATH",
        "DYLD_LIBRARY_PATH",
    )
    child_env = {name: os.environ[name] for name in passthrough if name in os.environ}

    # The directory (script plus anything the child wrote into its cwd) is
    # removed on every exit path, including timeout and KeyboardInterrupt.
    with tempfile.TemporaryDirectory(
        prefix="inferencebench-code-", ignore_cleanup_errors=True
    ) as workdir:
        script_path = os.path.join(workdir, "candidate.py")
        with open(script_path, "w", encoding="utf-8") as fp:
            fp.write(body)

        start = time.perf_counter()
        try:
            completed = subprocess.run(
                [sys.executable, "-I", script_path],
                capture_output=True,
                stdin=subprocess.DEVNULL,
                env=child_env,
                cwd=workdir,
                timeout=timeout_s,
                check=False,
            )
        except subprocess.TimeoutExpired as exc:
            duration = time.perf_counter() - start
            # Partial output may be missing entirely on timeout.
            stdout_bytes = exc.stdout or b""
            stderr_bytes = exc.stderr or b""
            return RunResult(
                passed=False,
                stdout=stdout_bytes.decode("utf-8", errors="replace"),
                stderr=stderr_bytes.decode("utf-8", errors="replace"),
                timeout=True,
                duration_s=duration,
            )
        duration = time.perf_counter() - start
        return RunResult(
            passed=completed.returncode == 0,
            stdout=completed.stdout.decode("utf-8", errors="replace"),
            stderr=completed.stderr.decode("utf-8", errors="replace"),
            timeout=False,
            duration_s=duration,
        )
@@ -0,0 +1,93 @@
1
+ """Pydantic schemas for code-generation benchmark specs + run context."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from enum import StrEnum
6
+ from pathlib import Path
7
+ from typing import Annotated, Literal
8
+
9
+ from pydantic import BaseModel, ConfigDict, Field
10
+
11
+
12
class EngineKind(StrEnum):
    """Engines this plugin can drive.

    Code-generation scoring is dominated by per-prompt API calls (one model
    invocation per fixture row, then local execution of the response). We
    surface the same four engine kinds the rest of the suite uses so the
    plugin slots into existing cross-vendor comparisons unchanged.

    The member values are plain lowercase strings; the plugin writes them
    into the envelope as the provider / engine name.
    """

    # The plugin treats vLLM and SGLang as self-hosted OpenAI-compatible
    # servers: they need a ``base_url`` and get the ``openai/`` routing prefix.
    VLLM = "vllm"
    SGLANG = "sglang"
    # Provider-hosted engines: the model id is passed through unchanged.
    OPENAI = "openai"
    ANTHROPIC = "anthropic"
25
+
26
+
27
class DatasetConfig(BaseModel):
    """Dataset under evaluation.

    For the code-generation plugin the dataset is a small bundled JSONL
    fixture; each line is one HumanEval-style task with ``task_id``,
    ``prompt``, ``tests``, ``canonical_solution`` and ``entry_point`` keys.

    Unknown keys are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")
    # Dataset identifier recorded in the result envelope; must be non-empty.
    id: Annotated[str, Field(min_length=1)]
    # Besides a path relative to datasets/, the plugin's path resolver also
    # accepts ``fixtures://<key>``, which it maps to ``<key>.jsonl`` under
    # the fixtures cache root.
    path: Annotated[
        str,
        Field(
            min_length=1,
            description=("Path to the fixture JSONL relative to the plugin's datasets/ directory."),
        ),
    ]
44
+
45
+
46
class WarmupConfig(BaseModel):
    """Warmup parameters.

    Code-generation runs are per-task and order-independent so the default
    is zero discarded runs. Knob retained for future revisions (warm-up of
    a JIT-compiled model or sandbox cold-start, etc.).

    Unknown keys are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")
    # Number of initial runs to discard; must be >= 0.
    # NOTE(review): no visible plugin code reads this value — confirm it is
    # intentionally unused for now.
    discard_runs: Annotated[int, Field(ge=0)] = 0
56
+
57
+
58
class BenchmarkSpec(BaseModel):
    """One code-generation benchmark — fixture + scoring strategy + metadata.

    Instances are built from the YAML files in the plugin's benchmarks
    directory. Unknown keys are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")
    # Looked up by ``get_benchmark`` and written to the envelope as suite id.
    benchmark_id: Annotated[str, Field(min_length=1)]
    # ``MAJOR.MINOR.PATCH`` with an optional ``-suffix`` of word chars / dots.
    suite_version: Annotated[str, Field(pattern=r"^\d+\.\d+\.\d+(-[\w.]+)?$")]
    description: str = ""
    # Fixed discriminators: this schema only describes code / generation.
    modality: Literal["code"] = "code"
    kind: Literal["generation"] = "generation"
    dataset: DatasetConfig
    slo_template: str = "code.generation.standard"
    warmup: WarmupConfig = Field(default_factory=WarmupConfig)
    language: Literal["python"] = "python"
    # NOTE(review): ``pass_at_k`` and ``k`` are accepted here, but the plugin
    # code visible alongside this schema only aggregates pass@1 and does not
    # read ``scoring`` or ``k`` — confirm before relying on k > 1.
    scoring: Literal["pass_at_1", "pass_at_k"] = "pass_at_1"
    k: Annotated[int, Field(ge=1)] = 1
    # Wall-clock budget (seconds, > 0) for executing one task's unit tests.
    timeout_s: Annotated[float, Field(gt=0.0)] = 5.0
74
+
75
+
76
class RunContext(BaseModel):
    """Per-invocation context (where to send requests, where to write results).

    Mirrors the llm-quality plugin so cross-plugin tooling can reuse the
    same context object shape.

    Unknown keys are rejected (``extra="forbid"``). Empty strings act as
    "not set" for the optional string fields.
    """

    model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)
    model_id: Annotated[str, Field(min_length=1)]
    # 7-40 characters; the default placeholder is 9 characters long.
    # NOTE(review): the bounds look like abbreviated-to-full git SHA lengths
    # — confirm.
    model_revision: Annotated[str, Field(min_length=7, max_length=40)] = "unknown00"
    engine_kind: EngineKind
    # Recorded in the envelope; the plugin substitutes "unknown" when empty.
    engine_version: str = ""
    # Required by the plugin's validation for self-hosted engines.
    base_url: str = ""
    api_key: str = ""
    quantization_format: str = ""
    hardware_class: str = ""
    # Directory that receives the diagnostic ``samples-<ts>.jsonl`` dump.
    output_dir: Path
    # Free-form options. The plugin reads ``signing_mode`` (default "dev")
    # and ``dev_key_path`` from here when signing the envelope.
    extra: dict[str, str | int | float | bool] = Field(default_factory=dict)
@@ -0,0 +1,79 @@
1
+ """Pure scoring helpers for the code-generation plugin.
2
+
3
+ Two helpers:
4
+
5
+ * :func:`extract_python_code` extracts the first fenced ``python`` block
6
+ from a model response. The HumanEval prompting convention is to ask the
7
+ model to return code in a markdown fence; we strip the fence and return
8
+ the inner code. When no fence is present we treat the whole response
9
+ as code (some smaller models skip the fences when given a function
10
+ signature stub).
11
+
12
+ * :func:`compute_pass_at_k` returns the unbiased pass@k estimator from
13
+ the HumanEval paper (Chen et al. 2021): for ``n`` samples with ``c``
14
+ passing, ``pass@k = 1 - C(n-c, k) / C(n, k)``. For Phase 1 we only
15
+ run ``k=1`` (so this collapses to the mean) but the helper ships so
16
+ future revisions can compute pass@10/100 from richer sampling runs.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import math
22
+ import re
23
+
24
# Fenced python block. Accepted opening fences: ```python, ```py, their
# ``3``-suffixed variants (```python3 / ```py3), any letter case, and an
# optional run of spaces/tabs between the backticks and the tag
# (``` python). Only the inner body is captured. The closing fence is
# intentionally optional (the ``\Z`` alternative) so truncated or partially
# streamed responses still parse.
#
# Without the ``3`` variants a "```python3" response missed this pattern and
# the bare-fence fallback below could latch onto the *closing* fence,
# producing an empty "solution".
_PY_FENCE = re.compile(
    r"```[ \t]*(?:python3?|py3?)\s*\n(.*?)(?:```|\Z)",
    re.IGNORECASE | re.DOTALL,
)
# Bare ``` fence (no language tag) — accepted as a fallback when the
# model omits the language hint.
_BARE_FENCE = re.compile(r"```\s*\n(.*?)(?:```|\Z)", re.DOTALL)


def extract_python_code(text: str) -> str:
    """Extract the first fenced ``python`` block from ``text``.

    Lookup order:

    1. the first fence tagged ``python`` / ``py`` / ``python3`` / ``py3``
       (case-insensitive);
    2. otherwise the first bare triple-backtick fence;
    3. otherwise the whole response.

    Multi-fence responses return the **first** matching block — HumanEval-style
    prompts ask for one solution; later fences usually contain test
    repetition or example I/O.

    Args:
        text: raw model response.

    Returns:
        The selected code with leading/trailing whitespace stripped. An
        unterminated fence yields everything after the opening fence.
    """
    # Tagged fences take priority over bare ones regardless of position.
    for fence in (_PY_FENCE, _BARE_FENCE):
        match = fence.search(text)
        if match is not None:
            return match.group(1).strip()
    # No fence at all: some models answer a signature stub with raw code.
    return text.strip()
54
+
55
+
56
def compute_pass_at_k(results: list[bool], k: int) -> float:
    """Estimate pass@k with the unbiased estimator of Chen et al. (2021).

    With ``n = len(results)`` samples of which ``c`` passed, the estimate is
    ``1 - C(n - c, k) / C(n, k)``: one minus the probability that a random
    k-subset of the samples contains only failures.

    Args:
        results: one pass/fail flag per sampled model attempt.
        k: number of samples that would be drawn. Values larger than ``n``
            are clipped to ``n`` because the estimator is only defined for
            ``k <= n``.

    Returns:
        The pass@k estimate in ``[0.0, 1.0]``. Returns ``0.0`` when
        ``results`` is empty or ``k < 1``, and ``1.0`` when there are fewer
        than ``k`` failures (every k-subset must then include a pass).
    """
    total = len(results)
    if total == 0 or k < 1:
        return 0.0

    draw = k if k <= total else total
    failures = total - sum(bool(flag) for flag in results)
    if failures < draw:
        return 1.0

    # math.comb returns exact ints; true division produces the float ratio.
    all_fail_subsets = math.comb(failures, draw)
    all_subsets = math.comb(total, draw)
    return 1.0 - all_fail_subsets / all_subsets
@@ -0,0 +1,68 @@
1
+ Metadata-Version: 2.4
2
+ Name: inferencebench-code
3
+ Version: 0.0.2
4
+ Summary: Code-generation plugin for InferenceBench Suite (HumanEval-style execution-based scoring).
5
+ Project-URL: Homepage, https://github.com/yobitelcomm/bench
6
+ Author-email: Yobitel Communications <bench@yobitel.com>
7
+ License: Apache-2.0
8
+ Keywords: ai,benchmark,code-generation,humaneval,llm,ml
9
+ Classifier: Development Status :: 2 - Pre-Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Requires-Python: >=3.12
17
+ Requires-Dist: inferencebench-envelope
18
+ Requires-Dist: inferencebench-harness
19
+ Requires-Dist: pydantic~=2.9
20
+ Requires-Dist: pyyaml~=6.0
21
+ Description-Content-Type: text/markdown
22
+
23
+ # inferencebench-code
24
+
25
+ Code-generation plugin for the InferenceBench Suite.
26
+
27
+ HumanEval-style execution-based benchmarks: the plugin sends a function-signature
28
+ prompt to the model, extracts the Python code from its response, executes it
29
+ against bundled unit tests in a subprocess, and reports `pass_at_1`.
30
+
31
+ Suite ID: `code.generation`
32
+
33
+ Bundled benchmarks:
34
+
35
+ - `code.generation.humaneval-mini` — 5 stdlib-only Python tasks, `pass_at_1`
36
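+ scoring with a 5-second per-task wall-clock timeout.
+ - `code.generation.mbpp-mini` — MBPP-style stdlib-only Python tasks, `pass_at_1`
+ scoring with a 5-second per-task wall-clock timeout.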
37
+
38
+ ## SAFETY WARNING — read before running
39
+
40
+ **This plugin executes model-generated code.** Every run prints a yellow banner
41
+ reminding you of that. The execution layer is *best-effort* defence-in-depth,
42
+ not a real sandbox:
43
+
44
+ - Each task's solution + tests are written to a temp file and invoked with
45
+ `python -I` (isolated mode) under a `subprocess.run(timeout=...)` wall clock.
46
+ - A cheap substring pre-scan refuses any solution that imports `subprocess`,
47
+ `os.system`, `socket`, `urllib`, `multiprocessing`, or `ctypes`.
48
+ - The bundled fixtures are stdlib-only, no I/O, no network.
49
+
50
+ This is **deliberately not airtight**. Phase 2 adds real isolation (firejail /
51
+ nsjail / container-per-task). Until then: only run code-generation benchmarks
52
+ against models you trust, on machines you can afford to throw away, and never
53
+ swap the bundled fixtures for untrusted input.
54
+
55
+ ## Metrics
56
+
57
+ The envelope's `metrics` block includes:
58
+
59
+ | Metric | Direction | Meaning |
60
+ | ------------------ | --------------- | ----------------------------------------- |
61
+ | `pass_at_1` | higher is better | mean of per-task passed booleans |
62
+ | `pass_at_1_p05/50/95` | higher is better | bootstrap quantiles of per-sample scores |
63
+ | `timeout_rate` | lower is better | fraction of tasks that hit the wall clock |
64
+ | `ttft_p50_ms` | - | model time-to-first-token, median |
65
+ | `total_p50_ms` | - | model total request time, median |
66
+ | `tokens_out_total` | - | total generated tokens across the run |
67
+ | `ok_rate` | - | fraction of model calls that succeeded |
68
+ | `n_samples` | - | fixture row count |
@@ -0,0 +1,14 @@
1
+ inferencebench_code/__init__.py,sha256=T29BdO0kbnz2t1ejan52jWySJme5kWFUIDxxTbi7-Wo,326
2
+ inferencebench_code/plugin.py,sha256=UGJm52c4EfUawk_lHdlbwtj8aOB_LNQQm6i5FBdB0OY,21096
3
+ inferencebench_code/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ inferencebench_code/runner.py,sha256=i2VxybIFNiIJaME1QCUz-EEVLk3GSAmKndd03nIL4ww,4759
5
+ inferencebench_code/schemas.py,sha256=PQIAlenv2bnw1zB-1rI2HX7U0U0ljJGVIAF1q5EMvbk,3099
6
+ inferencebench_code/scoring.py,sha256=uhfJ_zX3-sE6oFv-dAzrOLj8OhG6FwJK5SLRuP5Eyec,2968
7
+ inferencebench_code/benchmarks/humaneval-mini.yaml,sha256=PW1_C6cK0d0FMr52gVkR_4__2YwkxHefoocygADQo1o,373
8
+ inferencebench_code/benchmarks/mbpp-mini.yaml,sha256=DuJx0YMNpj6C0kBP9aRh4DPoUcRXsbNXDT_Fl3pcQG4,388
9
+ inferencebench_code/datasets/humaneval-mini.jsonl,sha256=x48LbK_KWBhxzXgpy_v0UdpJlqMC3d0gxXKatOKbnoo,2455
10
+ inferencebench_code/datasets/mbpp-mini.jsonl,sha256=5iGqpXO3K_Lkj_55gqjmrZAe6-liofdstz5weYZjTic,1944
11
+ inferencebench_code-0.0.2.dist-info/METADATA,sha256=-8HBN4l892swGJR9JReVn_D7netgrWkpSCb7Eb-qgXE,3207
12
+ inferencebench_code-0.0.2.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
13
+ inferencebench_code-0.0.2.dist-info/entry_points.txt,sha256=I7YPRKPTPjYYbctMhLt9NarFlxhsmQfS5lIkCZLW5_A,91
14
+ inferencebench_code-0.0.2.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [inferencebench.plugins]
2
+ code.generation = inferencebench_code.plugin:CodeGenerationPlugin