benchleak 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,26 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(python3 -m venv .venv)",
5
+ "Bash(.venv/bin/python -m pip install --upgrade pip -q)",
6
+ "Bash(python3 -c \"import pypdf\")",
7
+ "Bash(python3 -c \"import pdfminer\")",
8
+ "Bash(.venv/bin/pip install *)",
9
+ "Bash(.venv/bin/python -c ' *)",
10
+ "Bash(.venv/bin/python *)",
11
+ "Bash(git add *)",
12
+ "Bash(git commit *)",
13
+ "Bash(git push *)",
14
+ "Bash(echo \"exit code: $?\")",
15
+ "Bash(brew --prefix xz)",
16
+ "Bash(pyenv --version)",
17
+ "Bash(echo \"venv python -> $\\(readlink -f .venv/bin/python 2>/dev/null || .venv/bin/python -c 'import sys;print\\(sys.executable\\)'\\)\")",
18
+ "Bash(LDFLAGS=\"-L/opt/homebrew/opt/xz/lib\" CPPFLAGS=\"-I/opt/homebrew/opt/xz/include\" pyenv install -f 3.10.11)",
19
+ "Bash(/Users/mouadbouchnaf/.pyenv/versions/3.10.11/bin/python3.10 -c \"import lzma; print\\('base lzma ok'\\)\")",
20
+ "Bash(grep -E \"reference.txt|data/|\\\\.py$|RECORD\")",
21
+ "Bash(.venv/bin/twine check *)",
22
+ "WebFetch(domain:pypi.org)",
23
+ "Bash(unzip -p dist/*.whl '*/METADATA')"
24
+ ]
25
+ }
26
+ }
@@ -0,0 +1,231 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ # Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ # poetry.lock
109
+ # poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ # pdm.lock
116
+ # pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ # pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # Redis
135
+ *.rdb
136
+ *.aof
137
+ *.pid
138
+
139
+ # RabbitMQ
140
+ mnesia/
141
+ rabbitmq/
142
+ rabbitmq-data/
143
+
144
+ # ActiveMQ
145
+ activemq-data/
146
+
147
+ # SageMath parsed files
148
+ *.sage.py
149
+
150
+ # Environments
151
+ .env
152
+ .envrc
153
+ .venv
154
+ env/
155
+ venv/
156
+ ENV/
157
+ env.bak/
158
+ venv.bak/
159
+
160
+ # Spyder project settings
161
+ .spyderproject
162
+ .spyproject
163
+
164
+ # Rope project settings
165
+ .ropeproject
166
+
167
+ # mkdocs documentation
168
+ /site
169
+
170
+ # mypy
171
+ .mypy_cache/
172
+ .dmypy.json
173
+ dmypy.json
174
+
175
+ # Pyre type checker
176
+ .pyre/
177
+
178
+ # pytype static type analyzer
179
+ .pytype/
180
+
181
+ # Cython debug symbols
182
+ cython_debug/
183
+
184
+ # PyCharm
185
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
186
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
187
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
188
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
189
+ # .idea/
190
+
191
+ # Abstra
192
+ # Abstra is an AI-powered process automation framework.
193
+ # Ignore directories containing user credentials, local state, and settings.
194
+ # Learn more at https://abstra.io/docs
195
+ .abstra/
196
+
197
+ # Visual Studio Code
198
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
199
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
200
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
201
+ # you could uncomment the following to ignore the entire vscode folder
202
+ # .vscode/
203
+ # Temporary file for partial code execution
204
+ tempCodeRunnerFile.py
205
+
206
+ # Ruff stuff:
207
+ .ruff_cache/
208
+
209
+ # PyPI configuration file
210
+ .pypirc
211
+
212
+ # Marimo
213
+ marimo/_static/
214
+ marimo/_lsp/
215
+ __marimo__/
216
+
217
+ # Streamlit
218
+ .streamlit/secrets.toml
219
+
220
+ # benchleak
221
+ # JetBrains IDE config
222
+ .idea/
223
+ # Model weights and HuggingFace caches
224
+ *.safetensors
225
+ *.bin
226
+ .cache/huggingface/
227
+ # Reference papers (kept local, not committed)
228
+ papers/
229
+ # Generated contamination reports
230
+ reports/
231
+ *.DS_Store
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 bouchnam
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,156 @@
1
+ Metadata-Version: 2.4
2
+ Name: benchleak
3
+ Version: 0.1.0
4
+ Summary: Detect benchmark contamination in large language models
5
+ Project-URL: Homepage, https://github.com/bouchnam/benchleak
6
+ Project-URL: Repository, https://github.com/bouchnam/benchleak
7
+ Author: Mouad Bouchnaf
8
+ License: Apache-2.0
9
+ License-File: LICENSE
10
+ Keywords: benchmark,contamination,evaluation,llm,memorization
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: Apache Software License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Requires-Python: >=3.10
17
+ Requires-Dist: datasets>=2.18
18
+ Requires-Dist: numpy>=1.24
19
+ Requires-Dist: rich>=13.0
20
+ Requires-Dist: scipy>=1.11
21
+ Requires-Dist: torch>=2.0
22
+ Requires-Dist: transformers>=4.40
23
+ Provides-Extra: dev
24
+ Requires-Dist: pytest>=8.0; extra == 'dev'
25
+ Requires-Dist: ruff>=0.4; extra == 'dev'
26
+ Description-Content-Type: text/markdown
27
+
28
+ # benchleak 🔍
29
+
30
+ > **Did this model train on the test set? Find out in one command.**
31
+
32
+ When a model scores 90% on GSM8K or MATH, was it genuinely capable or did it
33
+ memorise the benchmark during training? Benchleak answers that with a
34
+ mathematical membership-inference test on the model's own token probabilities.
35
+ No LLM judges, no API calls, runs locally on any HuggingFace causal LM.
36
+
37
+ ## Status
38
+
39
+ 🚧 **Early alpha.** The **pre-training** detector (Min-K% Prob) is implemented and
40
+ runnable end to end. The SFT and RL-post-training detectors are planned but **not
41
+ yet built**, so only pre-training contamination can be checked for now.
42
+
43
+ ## Install
44
+
45
+ ```bash
46
+ pip install benchleak
47
+ ```
48
+
49
+ This pulls in `torch`, `transformers`, and `datasets`. To work from a clone
50
+ instead, run `pip install -e .` in the repo root.
51
+
52
+ ## Usage
53
+
54
+ ```bash
55
+ benchleak --model Qwen/Qwen2.5-0.5B --benchmark gsm8k
56
+ ```
57
+
58
+ The model can be any HuggingFace-format causal LM, given as a Hub id or a local
59
+ checkpoint directory. GGUF, llama.cpp, and Ollama formats are not supported.
60
+
61
+ ### Private or gated models
62
+
63
+ For a repository that requires authentication, provide a HuggingFace token. Any
64
+ of these work:
65
+
66
+ ```bash
67
+ huggingface-cli login # cached credential, picked up automatically
68
+ export HF_TOKEN=hf_xxx # environment variable
69
+ benchleak --model my/private-model --benchmark gsm8k --hf-token hf_xxx
70
+ ```
71
+
72
+ ### Speed
73
+
74
+ Scoring runs one forward pass per sample. On a CPU-only machine the default
75
+ `--limit 200` can take many minutes; use a smaller `--limit` for a quick look, or
76
+ `--device cuda` / `--device mps` to use a GPU.
77
+
78
+ ```
79
+ benchleak: pre-training contamination report
80
+ ====================================================
81
+ Model: Qwen/Qwen2.5-0.5B
82
+ Benchmark: gsm8k
83
+ Detector: min-k% prob
84
+ Samples: 200 benchmark vs 200 reference
85
+
86
+ Separation (AUC): 0.71 [HIGH]
87
+ Significance (p): 3.2e-08
88
+ Flag thresholds: AUC >= 0.6, p < 0.05
89
+
90
+ Verdict: LIKELY CONTAMINATED
91
+ ```
92
+
93
+ Known benchmarks (`gsm8k`, `math`, `arc-challenge`, `truthfulqa`) work by name. For
94
+ any other Hub dataset, pass the path plus its text column(s):
95
+
96
+ ```bash
97
+ benchleak --model my/model --benchmark some/dataset --field question --field answer
98
+ ```
99
+
100
+ ## How it works
101
+
102
+ The benchmark is scored against a **reference set** of text the model is not
103
+ expected to have memorised. Min-K% Prob assigns each text the mean log-probability
104
+ of its least-likely *k%* of tokens. Memorised text has fewer surprising tokens
105
+ and scores higher. The tool then measures how strongly the benchmark's scores separate
106
+ from the reference's, reported as an AUC (`U / nm` from a Mann-Whitney test) with
107
+ a significance p-value. AUC ≈ 0.5 means the benchmark looks like fresh data; AUC
108
+ well above 0.5 is the memorisation signature of contamination.
109
+
110
+ A small reference set ships with the tool so it runs out of the box. For the
111
+ cleanest signal, supply your own domain-matched reference data with
112
+ `--reference my_reference.txt` (one passage per line).
113
+
114
+ For the full reasoning (why a reference set is needed, the choice of test, and
115
+ where the method can mislead), see [docs/how-it-works.md](docs/how-it-works.md).
116
+
117
+ | Phase | Method | Status |
118
+ |-------|--------|--------|
119
+ | Pre-training | Min-K% probability (Shi et al. 2024) | ✅ implemented |
120
+ | SFT | Self-prompt calibration (Fu et al. 2024) | ⏳ planned |
121
+ | RL post-training | Self-Critique entropy (Tao et al. 2025) | ⏳ planned |
122
+
123
+ ## Caveats
124
+
125
+ - A verdict needs **≥ 5 samples per side**; the significance test cannot reach
126
+ p < 0.05 below that.
127
+ - The bundled reference is general-domain prose. Comparing it against a
128
+ narrow-domain benchmark (e.g. math) can confound *domain* with *memorisation*;
129
+ prefer a domain-matched `--reference` for results you intend to publish.
130
+
131
+ ## Troubleshooting
132
+
133
+ **`ModuleNotFoundError: No module named '_lzma'`** when loading a benchmark. The
134
+ `datasets` library needs Python's `lzma` module, which is absent from some Python
135
+ builds (commonly pyenv on macOS compiled without the `xz` library). Install `xz`
136
+ and rebuild Python:
137
+
138
+ ```bash
139
+ brew install xz
140
+ LDFLAGS="-L$(brew --prefix xz)/lib" CPPFLAGS="-I$(brew --prefix xz)/include" \
141
+ pyenv install -f <your-python-version>
142
+ ```
143
+
144
+ Then reuse or recreate your virtual environment. benchleak detects this case and
145
+ prints the same guidance.
146
+
147
+ ## Citation
148
+
149
+ Implements methods from:
150
+ - Shi et al., 2024. *Detecting Pretraining Data from Large Language Models* (arXiv:2310.16789)
151
+ - Fu et al., 2024. *Membership Inference via Self-Prompt Calibration*
152
+ - Tao et al., 2025. *Detecting Data Contamination from RL Post-training* (arXiv:2510.09259)
153
+
154
+ ## License
155
+
156
+ Apache-2.0
@@ -0,0 +1,129 @@
1
+ # benchleak 🔍
2
+
3
+ > **Did this model train on the test set? Find out in one command.**
4
+
5
+ When a model scores 90% on GSM8K or MATH, was it genuinely capable or did it
6
+ memorise the benchmark during training? Benchleak answers that with a
7
+ mathematical membership-inference test on the model's own token probabilities.
8
+ No LLM judges, no API calls, runs locally on any HuggingFace causal LM.
9
+
10
+ ## Status
11
+
12
+ 🚧 **Early alpha.** The **pre-training** detector (Min-K% Prob) is implemented and
13
+ runnable end to end. The SFT and RL-post-training detectors are planned but **not
14
+ yet built**, so only pre-training contamination can be checked for now.
15
+
16
+ ## Install
17
+
18
+ ```bash
19
+ pip install benchleak
20
+ ```
21
+
22
+ This pulls in `torch`, `transformers`, and `datasets`. To work from a clone
23
+ instead, run `pip install -e .` in the repo root.
24
+
25
+ ## Usage
26
+
27
+ ```bash
28
+ benchleak --model Qwen/Qwen2.5-0.5B --benchmark gsm8k
29
+ ```
30
+
31
+ The model can be any HuggingFace-format causal LM, given as a Hub id or a local
32
+ checkpoint directory. GGUF, llama.cpp, and Ollama formats are not supported.
33
+
34
+ ### Private or gated models
35
+
36
+ For a repository that requires authentication, provide a HuggingFace token. Any
37
+ of these work:
38
+
39
+ ```bash
40
+ huggingface-cli login # cached credential, picked up automatically
41
+ export HF_TOKEN=hf_xxx # environment variable
42
+ benchleak --model my/private-model --benchmark gsm8k --hf-token hf_xxx
43
+ ```
44
+
45
+ ### Speed
46
+
47
+ Scoring runs one forward pass per sample. On a CPU-only machine the default
48
+ `--limit 200` can take many minutes; use a smaller `--limit` for a quick look, or
49
+ `--device cuda` / `--device mps` to use a GPU.
50
+
51
+ ```
52
+ benchleak: pre-training contamination report
53
+ ====================================================
54
+ Model: Qwen/Qwen2.5-0.5B
55
+ Benchmark: gsm8k
56
+ Detector: min-k% prob
57
+ Samples: 200 benchmark vs 200 reference
58
+
59
+ Separation (AUC): 0.71 [HIGH]
60
+ Significance (p): 3.2e-08
61
+ Flag thresholds: AUC >= 0.6, p < 0.05
62
+
63
+ Verdict: LIKELY CONTAMINATED
64
+ ```
65
+
66
+ Known benchmarks (`gsm8k`, `math`, `arc-challenge`, `truthfulqa`) work by name. For
67
+ any other Hub dataset, pass the path plus its text column(s):
68
+
69
+ ```bash
70
+ benchleak --model my/model --benchmark some/dataset --field question --field answer
71
+ ```
72
+
73
+ ## How it works
74
+
75
+ The benchmark is scored against a **reference set** of text the model is not
76
+ expected to have memorised. Min-K% Prob assigns each text the mean log-probability
77
+ of its least-likely *k%* of tokens. Memorised text has fewer surprising tokens
78
+ and scores higher. The tool then measures how strongly the benchmark's scores separate
79
+ from the reference's, reported as an AUC (`U / nm` from a Mann-Whitney test) with
80
+ a significance p-value. AUC ≈ 0.5 means the benchmark looks like fresh data; AUC
81
+ well above 0.5 is the memorisation signature of contamination.
82
+
83
+ A small reference set ships with the tool so it runs out of the box. For the
84
+ cleanest signal, supply your own domain-matched reference data with
85
+ `--reference my_reference.txt` (one passage per line).
86
+
87
+ For the full reasoning (why a reference set is needed, the choice of test, and
88
+ where the method can mislead), see [docs/how-it-works.md](docs/how-it-works.md).
89
+
90
+ | Phase | Method | Status |
91
+ |-------|--------|--------|
92
+ | Pre-training | Min-K% probability (Shi et al. 2024) | ✅ implemented |
93
+ | SFT | Self-prompt calibration (Fu et al. 2024) | ⏳ planned |
94
+ | RL post-training | Self-Critique entropy (Tao et al. 2025) | ⏳ planned |
95
+
96
+ ## Caveats
97
+
98
+ - A verdict needs **≥ 5 samples per side**; the significance test cannot reach
99
+ p < 0.05 below that.
100
+ - The bundled reference is general-domain prose. Comparing it against a
101
+ narrow-domain benchmark (e.g. math) can confound *domain* with *memorisation*;
102
+ prefer a domain-matched `--reference` for results you intend to publish.
103
+
104
+ ## Troubleshooting
105
+
106
+ **`ModuleNotFoundError: No module named '_lzma'`** when loading a benchmark. The
107
+ `datasets` library needs Python's `lzma` module, which is absent from some Python
108
+ builds (commonly pyenv on macOS compiled without the `xz` library). Install `xz`
109
+ and rebuild Python:
110
+
111
+ ```bash
112
+ brew install xz
113
+ LDFLAGS="-L$(brew --prefix xz)/lib" CPPFLAGS="-I$(brew --prefix xz)/include" \
114
+ pyenv install -f <your-python-version>
115
+ ```
116
+
117
+ Then reuse or recreate your virtual environment. benchleak detects this case and
118
+ prints the same guidance.
119
+
120
+ ## Citation
121
+
122
+ Implements methods from:
123
+ - Shi et al., 2024. *Detecting Pretraining Data from Large Language Models* (arXiv:2310.16789)
124
+ - Fu et al., 2024. *Membership Inference via Self-Prompt Calibration*
125
+ - Tao et al., 2025. *Detecting Data Contamination from RL Post-training* (arXiv:2510.09259)
126
+
127
+ ## License
128
+
129
+ Apache-2.0
@@ -0,0 +1,3 @@
1
+ """benchleak: detect benchmark contamination in large language models."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,83 @@
1
+ """Command line interface for benchleak.
2
+
3
+ benchleak --model Qwen/Qwen2.5-0.5B --benchmark gsm8k
4
+
5
+ Loads a HuggingFace model and a benchmark, scores both the benchmark and a
6
+ reference set with the Min-K% pre-training detector, and prints a contamination
7
+ report. Exits non-zero when the benchmark is flagged as likely contaminated.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import os
14
+ import sys
15
+
16
+ from . import __version__
17
+ from .core import scan
18
+ from .data import load_reference
19
+ from .detectors.pretrain import DEFAULT_K, MinKProbDetector
20
+ from .loading import load_benchmark, load_model, resolve_spec
21
+ from .report import format_report
22
+
23
+
24
def _positive_int(value: str) -> int:
    """argparse ``type=`` converter that accepts only integers >= 1.

    Raises:
        argparse.ArgumentTypeError: if *value* is not an integer or is below 1.
    """
    try:
        number = int(value)
    except ValueError:
        # Keep argparse's stock wording for unparsable input.
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def _percentage(value: str) -> float:
    """argparse ``type=`` converter for a percentage in the range (0, 100].

    Raises:
        argparse.ArgumentTypeError: if *value* is not a number or is out of range.
    """
    try:
        number = float(value)
    except ValueError:
        # Keep argparse's stock wording for unparsable input.
        raise argparse.ArgumentTypeError(f"invalid float value: {value!r}") from None
    # Written as a negated chain so NaN (which fails every comparison) is rejected too.
    if not 0 < number <= 100:
        raise argparse.ArgumentTypeError(f"must be greater than 0 and at most 100, got {value}")
    return number


def build_parser() -> argparse.ArgumentParser:
    """Build the ``benchleak`` command-line argument parser.

    ``--limit`` and ``--max-length`` must be positive integers and ``--k`` must
    lie in (0, 100]; anything else is rejected at parse time with a normal
    argparse usage error instead of reaching the model-loading code.

    Returns:
        A parser whose namespace exposes ``model``, ``benchmark``, ``reference``,
        ``fields``, ``config``, ``split``, ``limit``, ``k``, ``max_length``,
        ``device``, ``dtype`` and ``hf_token``.
    """
    parser = argparse.ArgumentParser(
        prog="benchleak",
        description="Detect pre-training benchmark contamination in an LLM (Min-K% Prob).",
    )
    parser.add_argument("--model", required=True, help="HuggingFace model id, e.g. Qwen/Qwen2.5-0.5B")
    parser.add_argument("--benchmark", required=True, help="benchmark name (gsm8k, math, ...) or a Hub dataset path")
    parser.add_argument("--reference", help="path to a reference-text file (one passage per line); defaults to the bundled set")
    parser.add_argument("--field", action="append", dest="fields", help="benchmark text column(s); repeatable. Required for an unknown benchmark")
    parser.add_argument("--config", help="dataset config/subset name")
    parser.add_argument("--split", help="dataset split (default depends on the benchmark)")
    parser.add_argument("--limit", type=_positive_int, default=200, help="max samples per side (default: 200)")
    # "%%" is argparse's escape for a literal percent sign in help text.
    parser.add_argument("--k", type=_percentage, default=DEFAULT_K, help=f"Min-K%% percentage (default: {DEFAULT_K})")
    parser.add_argument("--max-length", type=_positive_int, default=2048, help="truncate texts to this many tokens (default: 2048)")
    parser.add_argument("--device", help="device to place the model on, e.g. cuda or mps")
    parser.add_argument("--dtype", default="auto", help="model dtype passed to transformers (default: auto)")
    parser.add_argument(
        "--hf-token",
        # Read when the parser is built, so an explicit --hf-token overrides the env var.
        default=os.environ.get("HF_TOKEN"),
        help="HuggingFace token for private/gated models; defaults to the HF_TOKEN env var or a cached huggingface-cli login",
    )
    parser.add_argument("--version", action="version", version=f"benchleak {__version__}")
    return parser
47
+
48
+
49
def run(args: argparse.Namespace) -> int:
    """Carry out one contamination scan described by parsed CLI arguments.

    Progress lines go to stderr; the formatted report goes to stdout.

    Args:
        args: namespace produced by the parser from ``build_parser()``.

    Returns:
        ``1`` when the scan result is flagged as contaminated, otherwise ``0``.
    """

    def status(message: str) -> None:
        # Progress goes to stderr so stdout carries only the report.
        print(message, file=sys.stderr)

    spec = resolve_spec(
        args.benchmark,
        config=args.config,
        split=args.split,
        fields=args.fields,
    )

    status(f"Loading model {args.model} ...")
    model, tokenizer = load_model(
        args.model,
        device=args.device,
        dtype=args.dtype,
        token=args.hf_token,
    )

    status(f"Loading benchmark {spec.path} ({spec.split}) ...")
    bench_samples = load_benchmark(spec, limit=args.limit)
    ref_samples = load_reference(args.reference, limit=args.limit)

    detector = MinKProbDetector(model, tokenizer, k=args.k, max_length=args.max_length)
    status(f"Scoring {len(bench_samples)} benchmark + {len(ref_samples)} reference samples ...")
    outcome = scan(
        detector,
        bench_samples,
        ref_samples,
        detector_name="min-k% prob",
        benchmark_name=args.benchmark,
    )

    print(format_report(outcome, model_id=args.model))
    if outcome.contaminated:
        return 1
    return 0
71
+
72
+
73
def main(argv: list[str] | None = None) -> int:
    """Parse arguments, run the scan, and translate failures into exit codes.

    Args:
        argv: argument strings to parse; ``None`` lets argparse read the
            process command line.

    Returns:
        The status returned by ``run`` (``0`` or ``1``), ``2`` if ``run``
        raised an exception, or ``130`` if the user interrupted it.
    """
    args = build_parser().parse_args(argv)
    try:
        return run(args)
    except KeyboardInterrupt:
        # Scoring can take minutes; Ctrl-C should end quietly, not with a traceback.
        # 130 is the conventional shell status for termination by SIGINT (128 + 2).
        print("interrupted", file=sys.stderr)
        return 130
    except Exception as exc:  # surface a clean message instead of a traceback
        # Some exceptions carry no message; fall back to the class name so the
        # user never sees a bare "error: ".
        detail = str(exc) or type(exc).__name__
        print(f"error: {detail}", file=sys.stderr)
        return 2
80
+
81
+
82
# Script entry point: hand main()'s return value to the interpreter as the exit status.
if __name__ == "__main__":
    sys.exit(main())