benchleak 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- benchleak-0.1.0/.claude/settings.local.json +26 -0
- benchleak-0.1.0/.gitignore +231 -0
- benchleak-0.1.0/LICENSE +21 -0
- benchleak-0.1.0/PKG-INFO +156 -0
- benchleak-0.1.0/README.md +129 -0
- benchleak-0.1.0/benchleak/__init__.py +3 -0
- benchleak-0.1.0/benchleak/cli.py +83 -0
- benchleak-0.1.0/benchleak/core.py +100 -0
- benchleak-0.1.0/benchleak/data/__init__.py +43 -0
- benchleak-0.1.0/benchleak/data/reference.txt +41 -0
- benchleak-0.1.0/benchleak/detectors/__init__.py +0 -0
- benchleak-0.1.0/benchleak/detectors/pretrain.py +100 -0
- benchleak-0.1.0/benchleak/detectors/rl.py +1 -0
- benchleak-0.1.0/benchleak/detectors/sft.py +1 -0
- benchleak-0.1.0/benchleak/loading.py +144 -0
- benchleak-0.1.0/benchleak/report.py +48 -0
- benchleak-0.1.0/docs/how-it-works.md +170 -0
- benchleak-0.1.0/pyproject.toml +51 -0
- benchleak-0.1.0/scripts/smoke.py +106 -0
- benchleak-0.1.0/tests/__init__.py +0 -0
- benchleak-0.1.0/tests/test_core.py +108 -0
- benchleak-0.1.0/tests/test_data.py +27 -0
- benchleak-0.1.0/tests/test_loading.py +46 -0
- benchleak-0.1.0/tests/test_pretrain.py +135 -0
- benchleak-0.1.0/tests/test_report.py +32 -0
- benchleak-0.1.0/tests/test_rl.py +0 -0
- benchleak-0.1.0/tests/test_sft.py +0 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"Bash(python3 -m venv .venv)",
|
|
5
|
+
"Bash(.venv/bin/python -m pip install --upgrade pip -q)",
|
|
6
|
+
"Bash(python3 -c \"import pypdf\")",
|
|
7
|
+
"Bash(python3 -c \"import pdfminer\")",
|
|
8
|
+
"Bash(.venv/bin/pip install *)",
|
|
9
|
+
"Bash(.venv/bin/python -c ' *)",
|
|
10
|
+
"Bash(.venv/bin/python *)",
|
|
11
|
+
"Bash(git add *)",
|
|
12
|
+
"Bash(git commit *)",
|
|
13
|
+
"Bash(git push *)",
|
|
14
|
+
"Bash(echo \"exit code: $?\")",
|
|
15
|
+
"Bash(brew --prefix xz)",
|
|
16
|
+
"Bash(pyenv --version)",
|
|
17
|
+
"Bash(echo \"venv python -> $\\(readlink -f .venv/bin/python 2>/dev/null || .venv/bin/python -c 'import sys;print\\(sys.executable\\)'\\)\")",
|
|
18
|
+
"Bash(LDFLAGS=\"-L/opt/homebrew/opt/xz/lib\" CPPFLAGS=\"-I/opt/homebrew/opt/xz/include\" pyenv install -f 3.10.11)",
|
|
19
|
+
"Bash(/Users/mouadbouchnaf/.pyenv/versions/3.10.11/bin/python3.10 -c \"import lzma; print\\('base lzma ok'\\)\")",
|
|
20
|
+
"Bash(grep -E \"reference.txt|data/|\\\\.py$|RECORD\")",
|
|
21
|
+
"Bash(.venv/bin/twine check *)",
|
|
22
|
+
"WebFetch(domain:pypi.org)",
|
|
23
|
+
"Bash(unzip -p dist/*.whl '*/METADATA')"
|
|
24
|
+
]
|
|
25
|
+
}
|
|
26
|
+
}
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
# Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
# uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
# poetry.lock
|
|
109
|
+
# poetry.toml
|
|
110
|
+
|
|
111
|
+
# pdm
|
|
112
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
113
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
114
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
115
|
+
# pdm.lock
|
|
116
|
+
# pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# pixi
|
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
122
|
+
# pixi.lock
|
|
123
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
|
124
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
125
|
+
.pixi
|
|
126
|
+
|
|
127
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
128
|
+
__pypackages__/
|
|
129
|
+
|
|
130
|
+
# Celery stuff
|
|
131
|
+
celerybeat-schedule
|
|
132
|
+
celerybeat.pid
|
|
133
|
+
|
|
134
|
+
# Redis
|
|
135
|
+
*.rdb
|
|
136
|
+
*.aof
|
|
137
|
+
*.pid
|
|
138
|
+
|
|
139
|
+
# RabbitMQ
|
|
140
|
+
mnesia/
|
|
141
|
+
rabbitmq/
|
|
142
|
+
rabbitmq-data/
|
|
143
|
+
|
|
144
|
+
# ActiveMQ
|
|
145
|
+
activemq-data/
|
|
146
|
+
|
|
147
|
+
# SageMath parsed files
|
|
148
|
+
*.sage.py
|
|
149
|
+
|
|
150
|
+
# Environments
|
|
151
|
+
.env
|
|
152
|
+
.envrc
|
|
153
|
+
.venv
|
|
154
|
+
env/
|
|
155
|
+
venv/
|
|
156
|
+
ENV/
|
|
157
|
+
env.bak/
|
|
158
|
+
venv.bak/
|
|
159
|
+
|
|
160
|
+
# Spyder project settings
|
|
161
|
+
.spyderproject
|
|
162
|
+
.spyproject
|
|
163
|
+
|
|
164
|
+
# Rope project settings
|
|
165
|
+
.ropeproject
|
|
166
|
+
|
|
167
|
+
# mkdocs documentation
|
|
168
|
+
/site
|
|
169
|
+
|
|
170
|
+
# mypy
|
|
171
|
+
.mypy_cache/
|
|
172
|
+
.dmypy.json
|
|
173
|
+
dmypy.json
|
|
174
|
+
|
|
175
|
+
# Pyre type checker
|
|
176
|
+
.pyre/
|
|
177
|
+
|
|
178
|
+
# pytype static type analyzer
|
|
179
|
+
.pytype/
|
|
180
|
+
|
|
181
|
+
# Cython debug symbols
|
|
182
|
+
cython_debug/
|
|
183
|
+
|
|
184
|
+
# PyCharm
|
|
185
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
186
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
187
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
188
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
189
|
+
# .idea/
|
|
190
|
+
|
|
191
|
+
# Abstra
|
|
192
|
+
# Abstra is an AI-powered process automation framework.
|
|
193
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
194
|
+
# Learn more at https://abstra.io/docs
|
|
195
|
+
.abstra/
|
|
196
|
+
|
|
197
|
+
# Visual Studio Code
|
|
198
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
199
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
200
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
201
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
202
|
+
# .vscode/
|
|
203
|
+
# Temporary file for partial code execution
|
|
204
|
+
tempCodeRunnerFile.py
|
|
205
|
+
|
|
206
|
+
# Ruff stuff:
|
|
207
|
+
.ruff_cache/
|
|
208
|
+
|
|
209
|
+
# PyPI configuration file
|
|
210
|
+
.pypirc
|
|
211
|
+
|
|
212
|
+
# Marimo
|
|
213
|
+
marimo/_static/
|
|
214
|
+
marimo/_lsp/
|
|
215
|
+
__marimo__/
|
|
216
|
+
|
|
217
|
+
# Streamlit
|
|
218
|
+
.streamlit/secrets.toml
|
|
219
|
+
|
|
220
|
+
# benchleak
|
|
221
|
+
# JetBrains IDE config
|
|
222
|
+
.idea/
|
|
223
|
+
# Model weights and HuggingFace caches
|
|
224
|
+
*.safetensors
|
|
225
|
+
*.bin
|
|
226
|
+
.cache/huggingface/
|
|
227
|
+
# Reference papers (kept local, not committed)
|
|
228
|
+
papers/
|
|
229
|
+
# Generated contamination reports
|
|
230
|
+
reports/
|
|
231
|
+
*.DS_Store
|
benchleak-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 bouchnam
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
benchleak-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: benchleak
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Detect benchmark contamination in large language models
|
|
5
|
+
Project-URL: Homepage, https://github.com/bouchnam/benchleak
|
|
6
|
+
Project-URL: Repository, https://github.com/bouchnam/benchleak
|
|
7
|
+
Author: Mouad Bouchnaf
|
|
8
|
+
License: Apache-2.0
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: benchmark,contamination,evaluation,llm,memorization
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Requires-Dist: datasets>=2.18
|
|
18
|
+
Requires-Dist: numpy>=1.24
|
|
19
|
+
Requires-Dist: rich>=13.0
|
|
20
|
+
Requires-Dist: scipy>=1.11
|
|
21
|
+
Requires-Dist: torch>=2.0
|
|
22
|
+
Requires-Dist: transformers>=4.40
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
25
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# benchleak 🔍
|
|
29
|
+
|
|
30
|
+
> **Did this model train on the test set? Find out in one command.**
|
|
31
|
+
|
|
32
|
+
When a model scores 90% on GSM8K or MATH, was it genuinely capable or did it
|
|
33
|
+
memorise the benchmark during training? Benchleak answers that with a
|
|
34
|
+
mathematical membership-inference test on the model's own token probabilities.
|
|
35
|
+
No LLM judges, no API calls, runs locally on any HuggingFace causal LM.
|
|
36
|
+
|
|
37
|
+
## Status
|
|
38
|
+
|
|
39
|
+
🚧 **Early alpha.** The **pre-training** detector (Min-K% Prob) is implemented and
|
|
40
|
+
runnable end to end. The SFT and RL-post-training detectors are planned but **not
|
|
41
|
+
yet built**, so they are not available in this release.
|
|
42
|
+
|
|
43
|
+
## Install
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install benchleak
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
This pulls in `torch`, `transformers`, and `datasets`. To work from a clone
|
|
50
|
+
instead, run `pip install -e .` in the repo root.
|
|
51
|
+
|
|
52
|
+
## Usage
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
benchleak --model Qwen/Qwen2.5-0.5B --benchmark gsm8k
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
The model can be any HuggingFace-format causal LM, given as a Hub id or a local
|
|
59
|
+
checkpoint directory. GGUF, llama.cpp, and Ollama formats are not supported.
|
|
60
|
+
|
|
61
|
+
### Private or gated models
|
|
62
|
+
|
|
63
|
+
For a repository that requires authentication, provide a HuggingFace token. Any
|
|
64
|
+
of these work:
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
huggingface-cli login # cached credential, picked up automatically
|
|
68
|
+
export HF_TOKEN=hf_xxx # environment variable
|
|
69
|
+
benchleak --model my/private-model --benchmark gsm8k --hf-token hf_xxx
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Speed
|
|
73
|
+
|
|
74
|
+
Scoring runs one forward pass per sample. On a CPU-only machine the default
|
|
75
|
+
`--limit 200` can take many minutes; use a smaller `--limit` for a quick look, or
|
|
76
|
+
`--device cuda` / `--device mps` to use a GPU.
|
|
77
|
+
|
|
78
|
+
A completed run prints a report like this:

```
|
|
79
|
+
benchleak: pre-training contamination report
|
|
80
|
+
====================================================
|
|
81
|
+
Model: Qwen/Qwen2.5-0.5B
|
|
82
|
+
Benchmark: gsm8k
|
|
83
|
+
Detector: min-k% prob
|
|
84
|
+
Samples: 200 benchmark vs 200 reference
|
|
85
|
+
|
|
86
|
+
Separation (AUC): 0.71 [HIGH]
|
|
87
|
+
Significance (p): 3.2e-08
|
|
88
|
+
Flag thresholds: AUC >= 0.6, p < 0.05
|
|
89
|
+
|
|
90
|
+
Verdict: LIKELY CONTAMINATED
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
Known benchmarks (`gsm8k`, `math`, `arc-challenge`, `truthfulqa`) work by name. For
|
|
94
|
+
any other Hub dataset, pass the path plus its text column(s):
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
benchleak --model my/model --benchmark some/dataset --field question --field answer
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## How it works
|
|
101
|
+
|
|
102
|
+
The benchmark is scored against a **reference set** of text the model is not
|
|
103
|
+
expected to have memorised. Min-K% Prob assigns each text the mean log-probability
|
|
104
|
+
of its least-likely *k%* of tokens. Memorised text has fewer surprising tokens
|
|
105
|
+
and scores higher. The tool then measures how strongly the benchmark's scores separate
|
|
106
|
+
from the reference's, reported as an AUC (`U / nm` from a Mann-Whitney test) with
|
|
107
|
+
a significance p-value. AUC ≈ 0.5 means the benchmark looks like fresh data; AUC
|
|
108
|
+
well above 0.5 is the memorisation signature of contamination.
|
|
109
|
+
|
|
110
|
+
A small reference set ships with the tool so it runs out of the box. For the
|
|
111
|
+
cleanest signal, supply your own domain-matched reference data with
|
|
112
|
+
`--reference my_reference.txt` (one passage per line).
|
|
113
|
+
|
|
114
|
+
For the full reasoning (why a reference set is needed, the choice of test, and
|
|
115
|
+
where the method can mislead), see [docs/how-it-works.md](docs/how-it-works.md).
|
|
116
|
+
|
|
117
|
+
| Phase | Method | Status |
|
|
118
|
+
|-------|--------|--------|
|
|
119
|
+
| Pre-training | Min-K% probability (Shi et al. 2024) | ✅ implemented |
|
|
120
|
+
| SFT | Self-prompt calibration (Fu et al. 2024) | ⏳ planned |
|
|
121
|
+
| RL post-training | Self-Critique entropy (Tao et al. 2025) | ⏳ planned |
|
|
122
|
+
|
|
123
|
+
## Caveats
|
|
124
|
+
|
|
125
|
+
- A verdict needs **≥ 5 samples per side**; the significance test cannot reach
|
|
126
|
+
p < 0.05 below that.
|
|
127
|
+
- The bundled reference is general-domain prose. Comparing it against a
|
|
128
|
+
narrow-domain benchmark (e.g. math) can confound *domain* with *memorisation*;
|
|
129
|
+
prefer a domain-matched `--reference` for results you intend to publish.
|
|
130
|
+
|
|
131
|
+
## Troubleshooting
|
|
132
|
+
|
|
133
|
+
**`ModuleNotFoundError: No module named '_lzma'`** when loading a benchmark. The
|
|
134
|
+
`datasets` library needs Python's `lzma` module, which is absent from some Python
|
|
135
|
+
builds (commonly pyenv on macOS compiled without the `xz` library). Install `xz`
|
|
136
|
+
and rebuild Python:
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
brew install xz
|
|
140
|
+
LDFLAGS="-L$(brew --prefix xz)/lib" CPPFLAGS="-I$(brew --prefix xz)/include" \
|
|
141
|
+
pyenv install -f <your-python-version>
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
Then reuse or recreate your virtual environment. benchleak detects this case and
|
|
145
|
+
prints the same guidance.
|
|
146
|
+
|
|
147
|
+
## Citation
|
|
148
|
+
|
|
149
|
+
Implements methods from:
|
|
150
|
+
- Shi et al., 2024. *Detecting Pretraining Data from Large Language Models* (arXiv:2310.16789)
|
|
151
|
+
- Fu et al., 2024. *Membership Inference via Self-Prompt Calibration*
|
|
152
|
+
- Tao et al., 2025. *Detecting Data Contamination from RL Post-training* (arXiv:2510.09259)
|
|
153
|
+
|
|
154
|
+
## License
|
|
155
|
+
|
|
156
|
+
Apache-2.0
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# benchleak 🔍
|
|
2
|
+
|
|
3
|
+
> **Did this model train on the test set? Find out in one command.**
|
|
4
|
+
|
|
5
|
+
When a model scores 90% on GSM8K or MATH, was it genuinely capable or did it
|
|
6
|
+
memorise the benchmark during training? Benchleak answers that with a
|
|
7
|
+
mathematical membership-inference test on the model's own token probabilities.
|
|
8
|
+
No LLM judges, no API calls, runs locally on any HuggingFace causal LM.
|
|
9
|
+
|
|
10
|
+
## Status
|
|
11
|
+
|
|
12
|
+
🚧 **Early alpha.** The **pre-training** detector (Min-K% Prob) is implemented and
|
|
13
|
+
runnable end to end. The SFT and RL-post-training detectors are planned but **not
|
|
14
|
+
yet built**, so they are not available in this release.
|
|
15
|
+
|
|
16
|
+
## Install
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install benchleak
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
This pulls in `torch`, `transformers`, and `datasets`. To work from a clone
|
|
23
|
+
instead, run `pip install -e .` in the repo root.
|
|
24
|
+
|
|
25
|
+
## Usage
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
benchleak --model Qwen/Qwen2.5-0.5B --benchmark gsm8k
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
The model can be any HuggingFace-format causal LM, given as a Hub id or a local
|
|
32
|
+
checkpoint directory. GGUF, llama.cpp, and Ollama formats are not supported.
|
|
33
|
+
|
|
34
|
+
### Private or gated models
|
|
35
|
+
|
|
36
|
+
For a repository that requires authentication, provide a HuggingFace token. Any
|
|
37
|
+
of these work:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
huggingface-cli login # cached credential, picked up automatically
|
|
41
|
+
export HF_TOKEN=hf_xxx # environment variable
|
|
42
|
+
benchleak --model my/private-model --benchmark gsm8k --hf-token hf_xxx
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Speed
|
|
46
|
+
|
|
47
|
+
Scoring runs one forward pass per sample. On a CPU-only machine the default
|
|
48
|
+
`--limit 200` can take many minutes; use a smaller `--limit` for a quick look, or
|
|
49
|
+
`--device cuda` / `--device mps` to use a GPU.
|
|
50
|
+
|
|
51
|
+
A completed run prints a report like this:

```
|
|
52
|
+
benchleak: pre-training contamination report
|
|
53
|
+
====================================================
|
|
54
|
+
Model: Qwen/Qwen2.5-0.5B
|
|
55
|
+
Benchmark: gsm8k
|
|
56
|
+
Detector: min-k% prob
|
|
57
|
+
Samples: 200 benchmark vs 200 reference
|
|
58
|
+
|
|
59
|
+
Separation (AUC): 0.71 [HIGH]
|
|
60
|
+
Significance (p): 3.2e-08
|
|
61
|
+
Flag thresholds: AUC >= 0.6, p < 0.05
|
|
62
|
+
|
|
63
|
+
Verdict: LIKELY CONTAMINATED
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Known benchmarks (`gsm8k`, `math`, `arc-challenge`, `truthfulqa`) work by name. For
|
|
67
|
+
any other Hub dataset, pass the path plus its text column(s):
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
benchleak --model my/model --benchmark some/dataset --field question --field answer
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## How it works
|
|
74
|
+
|
|
75
|
+
The benchmark is scored against a **reference set** of text the model is not
|
|
76
|
+
expected to have memorised. Min-K% Prob assigns each text the mean log-probability
|
|
77
|
+
of its least-likely *k%* of tokens. Memorised text has fewer surprising tokens
|
|
78
|
+
and scores higher. The tool then measures how strongly the benchmark's scores separate
|
|
79
|
+
from the reference's, reported as an AUC (`U / nm` from a Mann-Whitney test) with
|
|
80
|
+
a significance p-value. AUC ≈ 0.5 means the benchmark looks like fresh data; AUC
|
|
81
|
+
well above 0.5 is the memorisation signature of contamination.
|
|
82
|
+
|
|
83
|
+
A small reference set ships with the tool so it runs out of the box. For the
|
|
84
|
+
cleanest signal, supply your own domain-matched reference data with
|
|
85
|
+
`--reference my_reference.txt` (one passage per line).
|
|
86
|
+
|
|
87
|
+
For the full reasoning (why a reference set is needed, the choice of test, and
|
|
88
|
+
where the method can mislead), see [docs/how-it-works.md](docs/how-it-works.md).
|
|
89
|
+
|
|
90
|
+
| Phase | Method | Status |
|
|
91
|
+
|-------|--------|--------|
|
|
92
|
+
| Pre-training | Min-K% probability (Shi et al. 2024) | ✅ implemented |
|
|
93
|
+
| SFT | Self-prompt calibration (Fu et al. 2024) | ⏳ planned |
|
|
94
|
+
| RL post-training | Self-Critique entropy (Tao et al. 2025) | ⏳ planned |
|
|
95
|
+
|
|
96
|
+
## Caveats
|
|
97
|
+
|
|
98
|
+
- A verdict needs **≥ 5 samples per side**; the significance test cannot reach
|
|
99
|
+
p < 0.05 below that.
|
|
100
|
+
- The bundled reference is general-domain prose. Comparing it against a
|
|
101
|
+
narrow-domain benchmark (e.g. math) can confound *domain* with *memorisation*;
|
|
102
|
+
prefer a domain-matched `--reference` for results you intend to publish.
|
|
103
|
+
|
|
104
|
+
## Troubleshooting
|
|
105
|
+
|
|
106
|
+
**`ModuleNotFoundError: No module named '_lzma'`** when loading a benchmark. The
|
|
107
|
+
`datasets` library needs Python's `lzma` module, which is absent from some Python
|
|
108
|
+
builds (commonly pyenv on macOS compiled without the `xz` library). Install `xz`
|
|
109
|
+
and rebuild Python:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
brew install xz
|
|
113
|
+
LDFLAGS="-L$(brew --prefix xz)/lib" CPPFLAGS="-I$(brew --prefix xz)/include" \
|
|
114
|
+
pyenv install -f <your-python-version>
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
Then reuse or recreate your virtual environment. benchleak detects this case and
|
|
118
|
+
prints the same guidance.
|
|
119
|
+
|
|
120
|
+
## Citation
|
|
121
|
+
|
|
122
|
+
Implements methods from:
|
|
123
|
+
- Shi et al., 2024. *Detecting Pretraining Data from Large Language Models* (arXiv:2310.16789)
|
|
124
|
+
- Fu et al., 2024. *Membership Inference via Self-Prompt Calibration*
|
|
125
|
+
- Tao et al., 2025. *Detecting Data Contamination from RL Post-training* (arXiv:2510.09259)
|
|
126
|
+
|
|
127
|
+
## License
|
|
128
|
+
|
|
129
|
+
Apache-2.0
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Command line interface for benchleak.
|
|
2
|
+
|
|
3
|
+
benchleak --model Qwen/Qwen2.5-0.5B --benchmark gsm8k
|
|
4
|
+
|
|
5
|
+
Loads a HuggingFace model and a benchmark, scores both the benchmark and a
|
|
6
|
+
reference set with the Min-K% pre-training detector, and prints a contamination
|
|
7
|
+
report. Exits non-zero when the benchmark is flagged as likely contaminated.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import os
|
|
14
|
+
import sys
|
|
15
|
+
|
|
16
|
+
from . import __version__
|
|
17
|
+
from .core import scan
|
|
18
|
+
from .data import load_reference
|
|
19
|
+
from .detectors.pretrain import DEFAULT_K, MinKProbDetector
|
|
20
|
+
from .loading import load_benchmark, load_model, resolve_spec
|
|
21
|
+
from .report import format_report
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the ``benchleak`` command.

    ``--model`` and ``--benchmark`` are the only required options. The
    ``--hf-token`` default is read from the ``HF_TOKEN`` environment variable
    at the moment this function is called, and ``--field`` may be repeated,
    with the values collected into ``args.fields``.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    cli = argparse.ArgumentParser(
        description="Detect pre-training benchmark contamination in an LLM (Min-K% Prob).",
        prog="benchleak",
    )
    add = cli.add_argument

    # What to scan: the model and the benchmark are mandatory.
    add(
        "--model",
        help="HuggingFace model id, e.g. Qwen/Qwen2.5-0.5B",
        required=True,
    )
    add(
        "--benchmark",
        help="benchmark name (gsm8k, math, ...) or a Hub dataset path",
        required=True,
    )
    add(
        "--reference",
        help="path to a reference-text file (one passage per line); defaults to the bundled set",
    )

    # How to read the benchmark dataset.
    add(
        "--field",
        dest="fields",
        action="append",
        help="benchmark text column(s); repeatable. Required for an unknown benchmark",
    )
    add("--config", help="dataset config/subset name")
    add("--split", help="dataset split (default depends on the benchmark)")

    # Scoring knobs.
    add(
        "--limit",
        default=200,
        type=int,
        help="max samples per side (default: 200)",
    )
    add(
        "--k",
        default=DEFAULT_K,
        type=float,
        # "%%" is an escaped percent sign: argparse %-formats help strings.
        help=f"Min-K%% percentage (default: {DEFAULT_K})",
    )
    add(
        "--max-length",
        default=2048,
        type=int,
        help="truncate texts to this many tokens (default: 2048)",
    )

    # Model placement and authentication.
    add("--device", help="device to place the model on, e.g. cuda or mps")
    add(
        "--dtype",
        default="auto",
        help="model dtype passed to transformers (default: auto)",
    )
    add(
        "--hf-token",
        default=os.environ.get("HF_TOKEN"),
        help="HuggingFace token for private/gated models; defaults to the HF_TOKEN env var or a cached huggingface-cli login",
    )

    add("--version", action="version", version=f"benchleak {__version__}")
    return cli
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def run(args: argparse.Namespace) -> int:
    """Run one contamination scan and print its report.

    The benchmark spec is resolved first, then the model, the benchmark texts
    and the reference texts are loaded in that order. Both text sets are
    scored with a ``MinKProbDetector`` via ``scan``. Progress messages go to
    stderr; the formatted report goes to stdout.

    Args:
        args: Parsed command-line options as produced by ``build_parser``.

    Returns:
        ``1`` when the scan result is flagged as contaminated, otherwise ``0``.
    """

    def status(message: str) -> None:
        # Progress goes to stderr so stdout carries only the report.
        print(message, file=sys.stderr)

    spec = resolve_spec(
        args.benchmark,
        config=args.config,
        split=args.split,
        fields=args.fields,
    )

    status(f"Loading model {args.model} ...")
    model, tokenizer = load_model(
        args.model,
        device=args.device,
        dtype=args.dtype,
        token=args.hf_token,
    )

    status(f"Loading benchmark {spec.path} ({spec.split}) ...")
    # The same --limit caps both sides of the comparison.
    bench_samples = load_benchmark(spec, limit=args.limit)
    ref_samples = load_reference(args.reference, limit=args.limit)

    scorer = MinKProbDetector(
        model,
        tokenizer,
        k=args.k,
        max_length=args.max_length,
    )
    status(f"Scoring {len(bench_samples)} benchmark + {len(ref_samples)} reference samples ...")
    outcome = scan(
        scorer,
        bench_samples,
        ref_samples,
        detector_name="min-k% prob",
        benchmark_name=args.benchmark,
    )

    print(format_report(outcome, model_id=args.model))
    if outcome.contaminated:
        return 1
    return 0
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse the command line, run the scan, and return an exit status.

    Args:
        argv: Argument list handed to ``parse_args``; ``None`` lets argparse
            fall back to the process arguments.

    Returns:
        The status returned by ``run``, or ``2`` when ``run`` raises an
        ``Exception``. In that case a one-line ``error: ...`` message is
        written to stderr instead of a traceback.
    """
    # Parsing stays outside the try block: argparse reports bad usage itself.
    options = build_parser().parse_args(argv)
    try:
        status = run(options)
    except Exception as failure:  # surface a clean message instead of a traceback
        print(f"error: {failure}", file=sys.stderr)
        status = 2
    return status
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
if __name__ == "__main__":
|
|
83
|
+
raise SystemExit(main())
|