inferencebench-code 0.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,137 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ share/python-wheels/
20
+ *.egg-info/
21
+ .installed.cfg
22
+ *.egg
23
+ MANIFEST
24
+
25
+ # uv / virtualenv
26
+ .venv/
27
+ venv/
28
+ env/
29
+ ENV/
30
+ uv.lock.tmp
31
+ .python-version
32
+
33
+ # Testing / coverage
34
+ .tox/
35
+ .nox/
36
+ .coverage
37
+ .coverage.*
38
+ .cache
39
+ nosetests.xml
40
+ coverage.xml
41
+ *.cover
42
+ *.py,cover
43
+ .hypothesis/
44
+ .pytest_cache/
45
+ cover/
46
+ htmlcov/
47
+
48
+ # Type checking
49
+ .mypy_cache/
50
+ .dmypy.json
51
+ dmypy.json
52
+ .pyre/
53
+ .pytype/
54
+
55
+ # Ruff
56
+ .ruff_cache/
57
+
58
+ # IDE / editor
59
+ .idea/
60
+ .vscode/
61
+ *.swp
62
+ *.swo
63
+ *~
64
+ .DS_Store
65
+
66
+ # OS
67
+ Thumbs.db
68
+ desktop.ini
69
+
70
+ # Secrets / env
71
+ .env
72
+ .env.*
73
+ !.env.example
74
+ .envrc
75
+
76
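+ # Bench-specific local caches (note: git does not expand `~`, so the first pattern below only matches a literal `~` directory inside the repository, not the home directory)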
77
+ ~/.cache/inferencebench/
78
+ .cache/inferencebench/
79
+ .inferencebench/
80
+
81
+ # Sigstore dev keys (never commit private keys)
82
+ cosign.key
83
+ cosign-*.key
84
+ cosign-*.pub
85
+ .bench/*.key
86
+ # Local benchmark working dirs (kept local; published outputs land under validation-runs/)
87
+ envelopes-voice/
88
+ envelopes-*/
89
+ *.pem
90
+ !tests/fixtures/**/*.pem
91
+
92
+ # Real-GPU validation artifacts (kept locally, never pushed)
93
+ # Use slash-star (not trailing slash) so individual subpaths can be re-included below.
94
+ validation-runs/*
95
+ # ...except the canonical published marathon corpus — small, public, used by docs + CI
96
+ !validation-runs/2026-05-18-multi-vendor-marathon
97
+ validation-runs/2026-05-18-multi-vendor-marathon/*
98
+ !validation-runs/2026-05-18-multi-vendor-marathon/marathon
99
+ validation-runs/2026-05-18-multi-vendor-marathon/marathon/*
100
+ !validation-runs/2026-05-18-multi-vendor-marathon/marathon/all
101
+ !validation-runs/2026-05-18-multi-vendor-marathon/marathon/all/*.json
102
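+ # Voice ASR validation envelopes (small, public, used by leaderboard build). NOTE: re-including a directory un-ignores everything inside it, so the `*.json` negations below are no-ops unless `<dir>/*` is re-ignored first (as done for the marathon corpus above).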
103
+ !validation-runs/2026-05-25-voice-rtx4000ada
104
+ !validation-runs/2026-05-25-voice-rtx4000ada/*.json
105
+ !validation-runs/2026-05-29-voice-testbm-h100
106
+ !validation-runs/2026-05-29-voice-testbm-h100/*.json
107
+
108
+ # Model weights / datasets (use Git LFS or S3)
109
+ *.bin
110
+ *.safetensors
111
+ *.pt
112
+ *.pth
113
+ *.gguf
114
+ *.onnx
115
+ *.parquet
116
+ !tests/fixtures/**/*.parquet
117
+
118
+ # Logs
119
+ *.log
120
+ logs/
121
+
122
+ # Documentation build
123
+ docs/_build/
124
+ site/
125
+
126
+ # Internal-only files (Claude Code context + planning) — kept locally, not pushed
127
+ /CLAUDE.md
128
+ /INDEX.md
129
+ /PROJECT_PLAN.md
130
+ /CONVENTIONS.md
131
+ /HUMAN_REVIEW_GATES.md
132
+ **/CLAUDE.md
133
+ memory/
134
+ skills/
135
+ agents/
136
+ .claude/
137
+ TICKETS/
@@ -0,0 +1,68 @@
1
+ Metadata-Version: 2.4
2
+ Name: inferencebench-code
3
+ Version: 0.0.2
4
+ Summary: Code-generation plugin for InferenceBench Suite (HumanEval-style execution-based scoring).
5
+ Project-URL: Homepage, https://github.com/yobitelcomm/bench
6
+ Author-email: Yobitel Communications <bench@yobitel.com>
7
+ License: Apache-2.0
8
+ Keywords: ai,benchmark,code-generation,humaneval,llm,ml
9
+ Classifier: Development Status :: 2 - Pre-Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Requires-Python: >=3.12
17
+ Requires-Dist: inferencebench-envelope
18
+ Requires-Dist: inferencebench-harness
19
+ Requires-Dist: pydantic~=2.9
20
+ Requires-Dist: pyyaml~=6.0
21
+ Description-Content-Type: text/markdown
22
+
23
+ # inferencebench-code
24
+
25
+ Code-generation plugin for the InferenceBench Suite.
26
+
27
+ HumanEval-style execution-based benchmarks: the plugin sends a function-signature
28
+ prompt to the model, extracts the Python code from its response, executes it
29
+ against bundled unit tests in a subprocess, and reports `pass_at_1`.
30
+
31
+ Suite ID: `code.generation`
32
+
33
+ Bundled benchmarks:
34
+
35
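+ - `code.generation.humaneval-mini` — 5 stdlib-only Python tasks, `pass_at_1` scoring with a 5-second per-task wall-clock timeout.
36
+ - `code.generation.mbpp-mini` — 5 MBPP-style stdlib-only Python tasks, `pass_at_1` scoring with a 5-second per-task wall-clock timeout.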
37
+
38
+ ## SAFETY WARNING — read before running
39
+
40
+ **This plugin executes model-generated code.** Every run prints a yellow banner
41
+ reminding you of that. The execution layer is *best-effort* defence-in-depth,
42
+ not a real sandbox:
43
+
44
+ - Each task's solution + tests are written to a temp file and invoked with
45
+ `python -I` (isolated mode) under a `subprocess.run(timeout=...)` wall clock.
46
+ - A cheap substring pre-scan refuses any solution that references `subprocess`,
47
+ `os.system`, `socket`, `urllib`, `multiprocessing`, or `ctypes`.
48
+ - The bundled fixtures are stdlib-only, no I/O, no network.
49
+
50
+ This is **deliberately not airtight**. Phase 2 adds real isolation (firejail /
51
+ nsjail / container-per-task). Until then: only run code-generation benchmarks
52
+ against models you trust, on machines you can afford to throw away, and never
53
+ with the bundled fixtures replaced by untrusted input.
54
+
55
+ ## Metrics
56
+
57
+ The envelope's `metrics` block includes:
58
+
59
+ | Metric | Direction | Meaning |
60
+ | ------------------ | --------------- | ----------------------------------------- |
61
+ | `pass_at_1` | higher is better | mean of per-task passed booleans |
62
+ | `pass_at_1_p05/50/95` | higher is better | bootstrap quantiles of per-sample scores |
63
+ | `timeout_rate` | lower is better | fraction of tasks that hit the wall clock |
64
+ | `ttft_p50_ms` | - | model time-to-first-token, median |
65
+ | `total_p50_ms` | - | model total request time, median |
66
+ | `tokens_out_total` | - | total generated tokens across the run |
67
+ | `ok_rate` | - | fraction of model calls that succeeded |
68
+ | `n_samples` | - | fixture row count |
@@ -0,0 +1,46 @@
1
+ # inferencebench-code
2
+
3
+ Code-generation plugin for the InferenceBench Suite.
4
+
5
+ HumanEval-style execution-based benchmarks: the plugin sends a function-signature
6
+ prompt to the model, extracts the Python code from its response, executes it
7
+ against bundled unit tests in a subprocess, and reports `pass_at_1`.
8
+
9
+ Suite ID: `code.generation`
10
+
11
+ Bundled benchmarks:
12
+
13
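+ - `code.generation.humaneval-mini` — 5 stdlib-only Python tasks, `pass_at_1` scoring with a 5-second per-task wall-clock timeout.
14
+ - `code.generation.mbpp-mini` — 5 MBPP-style stdlib-only Python tasks, `pass_at_1` scoring with a 5-second per-task wall-clock timeout.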
15
+
16
+ ## SAFETY WARNING — read before running
17
+
18
+ **This plugin executes model-generated code.** Every run prints a yellow banner
19
+ reminding you of that. The execution layer is *best-effort* defence-in-depth,
20
+ not a real sandbox:
21
+
22
+ - Each task's solution + tests are written to a temp file and invoked with
23
+ `python -I` (isolated mode) under a `subprocess.run(timeout=...)` wall clock.
24
+ - A cheap substring pre-scan refuses any solution that references `subprocess`,
25
+ `os.system`, `socket`, `urllib`, `multiprocessing`, or `ctypes`.
26
+ - The bundled fixtures are stdlib-only, no I/O, no network.
27
+
28
+ This is **deliberately not airtight**. Phase 2 adds real isolation (firejail /
29
+ nsjail / container-per-task). Until then: only run code-generation benchmarks
30
+ against models you trust, on machines you can afford to throw away, and never
31
+ with the bundled fixtures replaced by untrusted input.
32
+
33
+ ## Metrics
34
+
35
+ The envelope's `metrics` block includes:
36
+
37
+ | Metric | Direction | Meaning |
38
+ | ------------------ | --------------- | ----------------------------------------- |
39
+ | `pass_at_1` | higher is better | mean of per-task passed booleans |
40
+ | `pass_at_1_p05/50/95` | higher is better | bootstrap quantiles of per-sample scores |
41
+ | `timeout_rate` | lower is better | fraction of tasks that hit the wall clock |
42
+ | `ttft_p50_ms` | - | model time-to-first-token, median |
43
+ | `total_p50_ms` | - | model total request time, median |
44
+ | `tokens_out_total` | - | total generated tokens across the run |
45
+ | `ok_rate` | - | fraction of model calls that succeeded |
46
+ | `n_samples` | - | fixture row count |
@@ -0,0 +1,43 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "inferencebench-code"
7
+ version = "0.0.2"
8
+ description = "Code-generation plugin for InferenceBench Suite (HumanEval-style execution-based scoring)."
9
+ readme = "README.md"
10
+ requires-python = ">=3.12"
11
+ license = { text = "Apache-2.0" }
12
+ authors = [
13
+ { name = "Yobitel Communications", email = "bench@yobitel.com" },
14
+ ]
15
+ keywords = ["benchmark", "llm", "code-generation", "humaneval", "ai", "ml"]
16
+ classifiers = [
17
+ "Development Status :: 2 - Pre-Alpha",
18
+ "Intended Audience :: Developers",
19
+ "Intended Audience :: Science/Research",
20
+ "License :: OSI Approved :: Apache Software License",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
24
+ ]
25
+ dependencies = [
26
+ "inferencebench-envelope",
27
+ "inferencebench-harness",
28
+ "pydantic~=2.9",
29
+ "pyyaml~=6.0",
30
+ ]
31
+
32
+ [project.entry-points."inferencebench.plugins"]
33
+ "code.generation" = "inferencebench_code.plugin:CodeGenerationPlugin"
34
+
35
+ [project.urls]
36
+ Homepage = "https://github.com/yobitelcomm/bench"
37
+
38
+ [tool.hatch.build.targets.wheel]
39
+ packages = ["src/inferencebench_code"]
40
+
41
+ [tool.uv.sources]
42
+ inferencebench-envelope = { workspace = true }
43
+ inferencebench-harness = { workspace = true }
@@ -0,0 +1,12 @@
1
+ """InferenceBench code-generation plugin."""
2
+
3
+ from inferencebench_code.plugin import EXPECTED_METRICS, CodeGenerationPlugin
4
+ from inferencebench_code.schemas import BenchmarkSpec, EngineKind, RunContext
5
+
6
+ __all__ = [
7
+ "EXPECTED_METRICS",
8
+ "BenchmarkSpec",
9
+ "CodeGenerationPlugin",
10
+ "EngineKind",
11
+ "RunContext",
12
+ ]
@@ -0,0 +1,15 @@
1
+ benchmark_id: code.generation.humaneval-mini
2
+ suite_version: 1.0.0
3
+ description: Five stdlib-only Python tasks, pass@1 with a 5-second wall-clock timeout.
4
+ modality: code
5
+ kind: generation
6
+ dataset:
7
+ id: builtin-humaneval-mini
8
+ path: humaneval-mini.jsonl
9
+ slo_template: code.generation.standard
10
+ warmup:
11
+ discard_runs: 0
12
+ language: python
13
+ scoring: pass_at_1
14
+ k: 1
15
+ timeout_s: 5.0
@@ -0,0 +1,15 @@
1
+ benchmark_id: code.generation.mbpp-mini
2
+ suite_version: 1.0.0
3
+ description: Mostly Basic Python Problems (MBPP)-style stdlib-only tasks, pass@1 with a 5-second wall-clock timeout.
4
+ modality: code
5
+ kind: generation
6
+ dataset:
7
+ id: builtin-mbpp-mini
8
+ path: mbpp-mini.jsonl
9
+ slo_template: code.generation.standard
10
+ warmup:
11
+ discard_runs: 0
12
+ language: python
13
+ scoring: pass_at_1
14
+ k: 1
15
+ timeout_s: 5.0
@@ -0,0 +1,5 @@
1
+ {"task_id": "add_two_numbers", "entry_point": "add", "prompt": "def add(a, b):\n \"\"\"Return the sum of a and b.\"\"\"\n", "canonical_solution": "def add(a, b):\n return a + b\n", "tests": "assert add(1, 2) == 3\nassert add(0, 0) == 0\nassert add(-1, 1) == 0\nassert add(-5, -7) == -12\nassert add(100, 250) == 350\nassert add(2.5, 0.5) == 3.0\nassert add(-100, 100) == 0\n"}
2
+ {"task_id": "reverse_string", "entry_point": "reverse_string", "prompt": "def reverse_string(s):\n \"\"\"Return the reverse of the string s.\"\"\"\n", "canonical_solution": "def reverse_string(s):\n return s[::-1]\n", "tests": "assert reverse_string('') == ''\nassert reverse_string('a') == 'a'\nassert reverse_string('hello') == 'olleh'\nassert reverse_string('abcde') == 'edcba'\nassert reverse_string(' spaces ') == ' secaps '\nassert reverse_string('racecar') == 'racecar'\nassert reverse_string('AbC') == 'CbA'\n"}
3
+ {"task_id": "fibonacci_iter", "entry_point": "fib", "prompt": "def fib(n):\n \"\"\"Return the n-th Fibonacci number (0-indexed: fib(0)=0, fib(1)=1).\"\"\"\n", "canonical_solution": "def fib(n):\n a, b = 0, 1\n for _ in range(n):\n a, b = b, a + b\n return a\n", "tests": "assert fib(0) == 0\nassert fib(1) == 1\nassert fib(2) == 1\nassert fib(3) == 2\nassert fib(5) == 5\nassert fib(10) == 55\nassert fib(15) == 610\n"}
4
+ {"task_id": "count_vowels", "entry_point": "count_vowels", "prompt": "def count_vowels(s):\n \"\"\"Count the number of vowels (a, e, i, o, u) in the lowercase string s.\"\"\"\n", "canonical_solution": "def count_vowels(s):\n return sum(1 for c in s if c in 'aeiou')\n", "tests": "assert count_vowels('') == 0\nassert count_vowels('bcdfg') == 0\nassert count_vowels('aeiou') == 5\nassert count_vowels('hello') == 2\nassert count_vowels('python') == 1\nassert count_vowels('queue') == 4\nassert count_vowels('rhythm') == 0\n"}
5
+ {"task_id": "is_palindrome", "entry_point": "is_palindrome", "prompt": "def is_palindrome(s):\n \"\"\"Return True if s is a palindrome (case-insensitive). Empty string is a palindrome.\"\"\"\n", "canonical_solution": "def is_palindrome(s):\n t = s.lower()\n return t == t[::-1]\n", "tests": "assert is_palindrome('') is True\nassert is_palindrome('a') is True\nassert is_palindrome('racecar') is True\nassert is_palindrome('Racecar') is True\nassert is_palindrome('hello') is False\nassert is_palindrome('Level') is True\nassert is_palindrome('abba') is True\n"}
@@ -0,0 +1,5 @@
1
+ {"task_id": "mbpp-001", "prompt": "Write a function sum_list(items) that returns the sum of a list of integers.", "tests": "assert sum_list([1,2,3]) == 6\nassert sum_list([]) == 0\nassert sum_list([-1, -2, 3]) == 0\nassert sum_list([10, 20, 30, 40]) == 100\n", "canonical_solution": "def sum_list(items):\n return sum(items)\n", "entry_point": "sum_list"}
2
+ {"task_id": "mbpp-002", "prompt": "Write a function max_of_three(a, b, c) that returns the largest of three numbers.", "tests": "assert max_of_three(1, 2, 3) == 3\nassert max_of_three(5, 2, 4) == 5\nassert max_of_three(-1, -2, -3) == -1\nassert max_of_three(7, 7, 7) == 7\n", "canonical_solution": "def max_of_three(a, b, c):\n return max(a, b, c)\n", "entry_point": "max_of_three"}
3
+ {"task_id": "mbpp-003", "prompt": "Write a function count_evens(nums) that returns the count of even integers in the list nums.", "tests": "assert count_evens([1, 2, 3, 4]) == 2\nassert count_evens([]) == 0\nassert count_evens([2, 4, 6, 8]) == 4\nassert count_evens([1, 3, 5]) == 0\n", "canonical_solution": "def count_evens(nums):\n return sum(1 for n in nums if n % 2 == 0)\n", "entry_point": "count_evens"}
4
+ {"task_id": "mbpp-004", "prompt": "Write a function gcd(a, b) that returns the greatest common divisor of two non-negative integers.", "tests": "assert gcd(12, 18) == 6\nassert gcd(7, 5) == 1\nassert gcd(100, 75) == 25\nassert gcd(0, 9) == 9\n", "canonical_solution": "def gcd(a, b):\n while b:\n a, b = b, a % b\n return a\n", "entry_point": "gcd"}
5
+ {"task_id": "mbpp-005", "prompt": "Write a function unique_sorted(items) that returns a sorted list of the unique values in items.", "tests": "assert unique_sorted([3, 1, 2, 3, 1]) == [1, 2, 3]\nassert unique_sorted([]) == []\nassert unique_sorted([5]) == [5]\nassert unique_sorted([2, 2, 2, 2]) == [2]\n", "canonical_solution": "def unique_sorted(items):\n return sorted(set(items))\n", "entry_point": "unique_sorted"}