lean_probe-0.2.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lean_probe-0.2.0/LICENSE +21 -0
- lean_probe-0.2.0/PKG-INFO +641 -0
- lean_probe-0.2.0/README.md +604 -0
- lean_probe-0.2.0/pyproject.toml +68 -0
- lean_probe-0.2.0/setup.cfg +4 -0
- lean_probe-0.2.0/src/lean_probe/__init__.py +25 -0
- lean_probe-0.2.0/src/lean_probe/benchmark.py +1540 -0
- lean_probe-0.2.0/src/lean_probe/cli.py +312 -0
- lean_probe-0.2.0/src/lean_probe/core.py +1377 -0
- lean_probe-0.2.0/src/lean_probe/mcp_server.py +270 -0
- lean_probe-0.2.0/src/lean_probe/py.typed +0 -0
- lean_probe-0.2.0/src/lean_probe.egg-info/PKG-INFO +641 -0
- lean_probe-0.2.0/src/lean_probe.egg-info/SOURCES.txt +20 -0
- lean_probe-0.2.0/src/lean_probe.egg-info/dependency_links.txt +1 -0
- lean_probe-0.2.0/src/lean_probe.egg-info/entry_points.txt +2 -0
- lean_probe-0.2.0/src/lean_probe.egg-info/requires.txt +14 -0
- lean_probe-0.2.0/src/lean_probe.egg-info/top_level.txt +1 -0
- lean_probe-0.2.0/tests/test_benchmark.py +325 -0
- lean_probe-0.2.0/tests/test_cli.py +213 -0
- lean_probe-0.2.0/tests/test_core.py +726 -0
- lean_probe-0.2.0/tests/test_integration.py +37 -0
- lean_probe-0.2.0/tests/test_mcp_server.py +133 -0
lean_probe-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Lazar Milikic
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
lean_probe-0.2.0/PKG-INFO
ADDED
@@ -0,0 +1,641 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lean-probe
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Fast Lean 4 proof feedback for agents, powered by LeanInteract.
|
|
5
|
+
Author: LeanProbe contributors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Lemmy00/LeanProbe
|
|
8
|
+
Project-URL: Documentation, https://github.com/Lemmy00/LeanProbe#readme
|
|
9
|
+
Project-URL: Issues, https://github.com/Lemmy00/LeanProbe/issues
|
|
10
|
+
Project-URL: Source, https://github.com/Lemmy00/LeanProbe
|
|
11
|
+
Keywords: Lean,Lean 4,theorem proving,MCP,agents,LeanInteract
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Mathematics
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: lean-interact<1,>=0.11.0
|
|
25
|
+
Provides-Extra: mcp
|
|
26
|
+
Requires-Dist: mcp<2,>=1.2.0; extra == "mcp"
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: build; extra == "dev"
|
|
29
|
+
Requires-Dist: mcp<2,>=1.2.0; extra == "dev"
|
|
30
|
+
Requires-Dist: mypy; extra == "dev"
|
|
31
|
+
Requires-Dist: pre-commit; extra == "dev"
|
|
32
|
+
Requires-Dist: pytest; extra == "dev"
|
|
33
|
+
Requires-Dist: pytest-xdist; extra == "dev"
|
|
34
|
+
Requires-Dist: ruff; extra == "dev"
|
|
35
|
+
Requires-Dist: twine; extra == "dev"
|
|
36
|
+
Dynamic: license-file
|
|
37
|
+
|
|
38
|
+
# LeanProbe
|
|
39
|
+
|
|
40
|
+
LeanProbe is a standalone Python package, CLI, and MCP server for fast Lean 4
|
|
41
|
+
feedback when a tool repeatedly checks declarations in the same Lean project.
|
|
42
|
+
It uses [LeanInteract](https://github.com/augustepoiroux/LeanInteract) as its
|
|
43
|
+
execution backend, keeps a Lean REPL warm, reuses elaborated imports and prior
|
|
44
|
+
declarations, and checks a named target declaration or replacement chunk.
|
|
45
|
+
|
|
46
|
+
LeanProbe returns Lean diagnostics, warnings, `sorry` detection, tactic
|
|
47
|
+
metadata, goal states, and inline `feedback_lean`. The result is a real Lean
|
|
48
|
+
response for the checked chunk and prepared environment. Use `lake env lean
|
|
49
|
+
File.lean`, `lake build`, or CI when you need whole-file or whole-project
|
|
50
|
+
acceptance.
|
|
51
|
+
|
|
52
|
+
## MCP Tools
|
|
53
|
+
|
|
54
|
+
LeanProbe exposes the MCP server name `lean-probe` and the tools
|
|
55
|
+
`lean_probe_prepare`, `lean_probe_check`, `lean_probe_feedback`,
|
|
56
|
+
`lean_probe_state`, `lean_probe_step`, and `lean_probe_close_state`.
|
|
57
|
+
|
|
58
|
+
For MCP parameter details, result-field semantics, and `feedback_lean` examples,
|
|
59
|
+
see [AGENT.md](AGENT.md).
|
|
60
|
+
|
|
61
|
+
## Why It Is Faster
|
|
62
|
+
|
|
63
|
+
Many Lean workflows perform several related checks in one file: check a
|
|
64
|
+
candidate declaration, inspect diagnostics or proof state, try another
|
|
65
|
+
candidate, then move to a nearby declaration. A repeated full-file terminal
|
|
66
|
+
check pays import, header, and prior-declaration elaboration cost each time.
|
|
67
|
+
|
|
68
|
+
LeanProbe separates that cost:
|
|
69
|
+
|
|
70
|
+
```text
|
|
71
|
+
prepare header/imports/prior declarations -> env before target
|
|
72
|
+
env before target + checked declaration -> diagnostics/proof states
|
|
73
|
+
env before target + next checked declaration -> diagnostics/proof states
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
For sequential same-file checks, "environment" means Lean's elaborated state
|
|
77
|
+
after processing some prefix of the file. It is not just the import/header
|
|
78
|
+
state. The state grows only when a declaration is accepted:
|
|
79
|
+
|
|
80
|
+
```text
|
|
81
|
+
imports/header -> env0
|
|
82
|
+
env0 + declaration t1 -> env1 # env1 contains imports/header and t1
|
|
83
|
+
env1 + declaration t2 -> env2 # env2 contains imports/header, t1, and t2
|
|
84
|
+
env2 + declaration t3 -> env3
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
If a tool is trying several replacements for `t2`, each attempt should reuse
|
|
88
|
+
`env1`; failed attempts do not advance the environment. Once the complete `t2`
|
|
89
|
+
is accepted, LeanProbe can use `env2` for later declarations instead of
|
|
90
|
+
rechecking imports, `t1`, and `t2` from scratch.
|
|
91
|
+
|
|
92
|
+
The benchmark suite measures two cases:
|
|
93
|
+
|
|
94
|
+
- repeated target checks: prepare the environment before one declaration, then
|
|
95
|
+
repeatedly check replacements for that declaration;
|
|
96
|
+
- sequential same-file checks: prepare a header once, then advance declaration by
|
|
97
|
+
declaration with env reuse.
|
|
98
|
+
|
|
99
|
+
## Install
|
|
100
|
+
|
|
101
|
+
LeanProbe is a Python package that talks to Lean through LeanInteract. `pip`
|
|
102
|
+
installs LeanProbe's Python dependencies, including `lean-interact`. It does
|
|
103
|
+
not install Lean, Lake, or Mathlib; those belong to the Lean toolchain and the
|
|
104
|
+
Lake project being checked. `lake` must be available on `PATH` or passed with
|
|
105
|
+
`--lake-path`.
|
|
106
|
+
|
|
107
|
+
Required:
|
|
108
|
+
|
|
109
|
+
- Python 3.10 or newer.
|
|
110
|
+
- Lean 4 and Lake installed through
|
|
111
|
+
[elan](https://github.com/leanprover/elan).
|
|
112
|
+
- `git`, used by Lean/Lake dependency workflows.
|
|
113
|
+
- A Lean/Lake project to run checks in. For the bundled examples, that project
|
|
114
|
+
must have Mathlib available because the examples start with `import Mathlib`.
|
|
115
|
+
- A built Lean project, or `--auto-build` when you want LeanInteract to build it
|
|
116
|
+
before checking.
|
|
117
|
+
|
|
118
|
+
Install the CLI and Python package:
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
python -m pip install lean-probe
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
That command installs the required Python runtime dependencies. If
|
|
125
|
+
`python -c "import lean_probe, lean_interact"` fails, run the install command in
|
|
126
|
+
the same Python environment that will launch LeanProbe.
|
|
127
|
+
|
|
128
|
+
Install MCP support when you want to run the MCP server:
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
python -m pip install "lean-probe[mcp]"
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
Editable checkout for development:
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
python -m venv .venv
|
|
138
|
+
source .venv/bin/activate
|
|
139
|
+
python -m pip install -U pip
|
|
140
|
+
python -m pip install -e ".[dev]"
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Check the Python package and CLI:
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
python -c "import lean_probe, lean_interact; print('ok')"
|
|
147
|
+
lean-probe --version # lean-probe 0.2.0
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
Check that Lean/Lake are visible:
|
|
151
|
+
|
|
152
|
+
```bash
|
|
153
|
+
lake --version
|
|
154
|
+
lean --version
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
Run LeanProbe by pointing `--cwd` at a Lake project that can import the
|
|
158
|
+
dependencies used by the file being checked:
|
|
159
|
+
|
|
160
|
+
```bash
|
|
161
|
+
lean-probe check examples/lean/number_theory_nat.lean nat_mul_pos_bench \
|
|
162
|
+
--cwd /path/to/mathlib-lake-project \
|
|
163
|
+
--pretty
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
If the target project does not already have LeanInteract's REPL support built,
|
|
167
|
+
either let LeanInteract build it with `--auto-build` or pass an existing REPL
|
|
168
|
+
checkout with `--local-repl-path`.
|
|
169
|
+
If `--cwd` is supplied, it must be inside a Lake project; otherwise LeanProbe
|
|
170
|
+
returns `error_code="no_project_root"`.
|
|
171
|
+
|
|
172
|
+
For MCP use, configure the MCP client to run `lean-probe mcp` from this same
|
|
173
|
+
Python environment. If the client launches servers outside your activated
|
|
174
|
+
shell, use the absolute path to `.venv/bin/lean-probe` in the MCP
|
|
175
|
+
configuration.
|
|
176
|
+
Set `LEAN_PROBE_LAKE_PATH`, `LEAN_PROBE_LOCAL_REPL_PATH`,
|
|
177
|
+
`LEAN_PROBE_AUTO_BUILD`, or `LEAN_PROBE_VERBOSE` to configure the MCP server
|
|
178
|
+
without CLI flags.
|
|
179
|
+
|
|
180
|
+
After an editable development install, run the package tests from the
|
|
181
|
+
repository with `python -m pytest -q`.
|
|
182
|
+
|
|
183
|
+
## CLI
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
lean-probe prepare /path/to/File.lean --cwd /path/to/lake-project --theorem-id my_theorem
|
|
187
|
+
|
|
188
|
+
lean-probe check /path/to/File.lean my_theorem \
|
|
189
|
+
--cwd /path/to/lake-project \
|
|
190
|
+
--replacement-file /tmp/candidate.lean \
|
|
191
|
+
--pretty
|
|
192
|
+
|
|
193
|
+
lean-probe feedback /path/to/File.lean my_theorem \
|
|
194
|
+
--cwd /path/to/lake-project \
|
|
195
|
+
--pretty
|
|
196
|
+
|
|
197
|
+
lean-probe benchmark /path/to/File.lean my_theorem \
|
|
198
|
+
--cwd /path/to/lake-project \
|
|
199
|
+
--runs 5 --warmups 1 --include-feedback --include-no-cache \
|
|
200
|
+
--external-command 'lake-direct=lake env lean {file}' \
|
|
201
|
+
--pretty
|
|
202
|
+
|
|
203
|
+
lean-probe benchmark-suite \
|
|
204
|
+
--cases-file examples/benchmark_cases.json \
|
|
205
|
+
--cwd /path/to/mathlib-lake-project \
|
|
206
|
+
--runs 5 --warmups 1 --include-feedback --include-no-cache \
|
|
207
|
+
--pretty
|
|
208
|
+
|
|
209
|
+
lean-probe benchmark-file /path/to/File.lean \
|
|
210
|
+
--cwd /path/to/lake-project \
|
|
211
|
+
--runs 3 \
|
|
212
|
+
--pretty
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
`--include-no-cache` times a fresh LeanProbe/LeanInteract server per attempt.
|
|
216
|
+
Use it to quantify the cost of running without persistent environment reuse.
|
|
217
|
+
|
|
218
|
+
Use `--external-command NAME=COMMAND` to time another verifier or wrapper
|
|
219
|
+
against the same candidate files. The command runs from `--cwd`; placeholders are
|
|
220
|
+
`{file}` for the temp full file, `{original}` for the source file, `{cwd}` for
|
|
221
|
+
the Lake project root, and `{theorem}` for the target declaration.
|
|
222
|
+
|
|
223
|
+
## Python
|
|
224
|
+
|
|
225
|
+
```python
|
|
226
|
+
from lean_probe import LeanProbe
|
|
227
|
+
|
|
228
|
+
probe = LeanProbe()
|
|
229
|
+
probe.prepare_file("/path/to/File.lean", cwd="/path/to/lake-project", theorem_id="my_theorem")
|
|
230
|
+
|
|
231
|
+
result = probe.check_target(
|
|
232
|
+
"/path/to/File.lean",
|
|
233
|
+
cwd="/path/to/lake-project",
|
|
234
|
+
theorem_id="my_theorem",
|
|
235
|
+
replacement="""
|
|
236
|
+
theorem my_theorem : True := by
|
|
237
|
+
trivial
|
|
238
|
+
""",
|
|
239
|
+
)
|
|
240
|
+
print(result["ok"], result["elapsed_s"])
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
For tactic-by-tactic exploration:
|
|
244
|
+
|
|
245
|
+
```python
|
|
246
|
+
state = probe.proof_state_from_code("theorem ex (n : Nat) : n = n := by sorry")
|
|
247
|
+
proof_state = state["sorries"][0]["proof_state"]
|
|
248
|
+
step = probe.tactic_step(state["session_id"], proof_state, "rfl")
|
|
249
|
+
print(step["proof_status"])
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
## MCP
|
|
253
|
+
|
|
254
|
+
Run the MCP server over stdio:
|
|
255
|
+
|
|
256
|
+
```bash
|
|
257
|
+
lean-probe mcp
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
Example MCP configuration:
|
|
261
|
+
|
|
262
|
+
```json
|
|
263
|
+
{
|
|
264
|
+
"mcpServers": {
|
|
265
|
+
"lean-probe": {
|
|
266
|
+
"command": "lean-probe",
|
|
267
|
+
"args": ["mcp"]
|
|
268
|
+
}
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
Example MCP configuration with LeanProbe environment variables:
|
|
274
|
+
|
|
275
|
+
```json
|
|
276
|
+
{
|
|
277
|
+
"mcpServers": {
|
|
278
|
+
"lean-probe": {
|
|
279
|
+
"command": "lean-probe",
|
|
280
|
+
"args": ["mcp"],
|
|
281
|
+
"env": {
|
|
282
|
+
"LEAN_PROBE_LAKE_PATH": "/opt/homebrew/bin/lake",
|
|
283
|
+
"LEAN_PROBE_AUTO_BUILD": "1"
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
}
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
Use `lean_probe_prepare` before repeated checks in the same file, then call
|
|
291
|
+
`lean_probe_check` for concrete target declarations or replacements. When
|
|
292
|
+
ordinary diagnostics are not enough, call `lean_probe_feedback` and inspect
|
|
293
|
+
`messages`, `tactics`, and `feedback_lean`. See [AGENT.md](AGENT.md) for the
|
|
294
|
+
full MCP contract.
|
|
295
|
+
|
|
296
|
+
## Benchmark Files
|
|
297
|
+
|
|
298
|
+
LeanProbe ships standalone Mathlib benchmark examples under `examples/lean/`.
|
|
299
|
+
The compact files are hand-written smoke and micro-benchmark cases. The
|
|
300
|
+
`tcs_*` files are longer extracts from the
|
|
301
|
+
[CodaBench TCS Proving competition](https://www.codabench.org/competitions/16161/).
|
|
302
|
+
The concrete Lean source was taken from the public companion repository
|
|
303
|
+
[epfl-lara/icml-26-lean-challenges](https://github.com/epfl-lara/icml-26-lean-challenges),
|
|
304
|
+
with source headers retained. These files exercise more realistic algorithm and
|
|
305
|
+
graph-development code without adding a runtime dependency on either source.
|
|
306
|
+
Run all examples from any existing Mathlib Lake project by passing that project
|
|
307
|
+
as `--cwd`.
|
|
308
|
+
|
|
309
|
+
| File | Targets |
|
|
310
|
+
| --- | --- |
|
|
311
|
+
| `examples/lean/analysis_real.lean` | `abs_sub_le_abs_add_abs`, `abs_abs_sub_abs_le_abs_sub`, `dist_triangle_real`, `lipschitz_abs_one`, `continuous_shifted_square` |
|
|
312
|
+
| `examples/lean/algebra_order.lean` | `sq_add_sq_nonneg`, `two_mul_le_sq_add_sq`, `sq_sub_sq_factor`, `cube_add_expansion`, `square_le_self_on_unit_interval` |
|
|
313
|
+
| `examples/lean/sets_functions.lean` | `preimage_inter_eq`, `preimage_subset_preimage`, `image_subset_of_mapsTo`, `injective_from_left_inverse`, `surjective_from_right_inverse` |
|
|
314
|
+
| `examples/lean/number_theory_nat.lean` | `nat_add_cancel_bench`, `nat_mul_pos_bench`, `nat_mod_lt_bench`, `nat_square_eq_mul`, `nat_dvd_trans_bench` |
|
|
315
|
+
| `examples/lean/tcs_binary_heap.lean` | selected binary heap definitions such as `heapify`, `extract_min`, `insert`, `merge`, and `remove` |
|
|
316
|
+
| `examples/lean/tcs_treap_analysis.lean` | `uniform_prob_sum_one`, `perm_prob_sum_one` |
|
|
317
|
+
| `examples/lean/tcs_weighted_graph_prefix.lean` | selected weighted graph helpers and definitions through `Sym2order` |
|
|
318
|
+
|
|
319
|
+
The suite file `examples/benchmark_cases.json` lists all 40 targets with labels,
|
|
320
|
+
groups, sizes, and descriptions. Use `--results-dir` when you want to save raw
|
|
321
|
+
benchmark JSON for later analysis.
|
|
322
|
+
|
|
323
|
+
## Verification Surfaces
|
|
324
|
+
|
|
325
|
+
The built-in benchmarks compare these verification surfaces:
|
|
326
|
+
|
|
327
|
+
- terminal `lake env lean`: canonical full-file verification of a temp file
|
|
328
|
+
containing the candidate replacement;
|
|
329
|
+
- Probe prepare: wall-clock time to build env before the target;
|
|
330
|
+
- Probe cached check: target declaration only, using cached env before target;
|
|
331
|
+
- Probe cached feedback: same target check with tactic/proof-state metadata;
|
|
332
|
+
- Probe fresh check: fresh LeanProbe/LeanInteract server per attempt;
|
|
333
|
+
- same-file Lake growing-prefix checks: for each partial/full scenario, temp
|
|
334
|
+
file with header plus accepted prior declarations plus the current
|
|
335
|
+
declaration;
|
|
336
|
+
- same-file Lake full-file checks: for each partial/full scenario, temp file
|
|
337
|
+
containing the whole source file with only the current declaration replaced;
|
|
338
|
+
- same-file Probe cached checks: one LeanInteract server reuses header and
|
|
339
|
+
prior declaration environments across partial/full scenarios;
|
|
340
|
+
- same-file Probe fresh checks: fresh LeanProbe/LeanInteract server per
|
|
341
|
+
scenario;
|
|
342
|
+
- optional external command: any user-provided shell verifier or wrapper timed
|
|
343
|
+
with the same temp full file.
|
|
344
|
+
|
|
345
|
+
Lean LSP, MCP, and proof-context tools are diagnostic surfaces. Compare
|
|
346
|
+
project-specific wrappers through `--external-command` or an out-of-tree adapter
|
|
347
|
+
that exits nonzero on hard failure; LeanProbe itself stays independent.
|
|
348
|
+
|
|
349
|
+
## Benchmark Experiments
|
|
350
|
+
|
|
351
|
+
The README reports two benchmark shapes. They answer different questions.
|
|
352
|
+
|
|
353
|
+
### Repeated Target Checks
|
|
354
|
+
|
|
355
|
+
Measures repeated complete-replacement checks for one declaration after the
|
|
356
|
+
environment before that declaration has been prepared.
|
|
357
|
+
|
|
358
|
+
Per target, the benchmark does this:
|
|
359
|
+
|
|
360
|
+
1. Build a temporary full file containing the candidate replacement and time
|
|
361
|
+
`lake env lean`.
|
|
362
|
+
2. Start LeanProbe, prepare the environment before the target declaration, and
|
|
363
|
+
report that time as `Probe prepare avg` or `Probe prepare env`.
|
|
364
|
+
3. Check the target replacement against that cached environment and report that
|
|
365
|
+
time as `Probe cached check avg` or `Probe cached check`.
|
|
366
|
+
4. Optionally request tactic/proof-state metadata and report that as
|
|
367
|
+
`Probe cached feedback avg` or `Probe cached feedback`.
|
|
368
|
+
5. Repeat the LeanProbe check with a fresh server and no cache reuse to show
|
|
369
|
+
what LeanInteract costs without persistent state.
|
|
370
|
+
|
|
371
|
+
The important total for repeated attempts is:
|
|
372
|
+
|
|
373
|
+
```text
|
|
374
|
+
Probe total for n attempts = prepare time + n * cached check time
|
|
375
|
+
Lake total for n attempts = n * full-file Lake time
|
|
376
|
+
```
|
|
377
|
+
|
|
378
|
+
`Attempts to beat Lake` is the smallest integer `n` where the Probe total is
|
|
379
|
+
lower than the Lake total. `Amortized speedup, 3 attempts` and
|
|
380
|
+
`Amortized speedup, 10 attempts` use the same formula at fixed attempt counts.
|
|
381
|
+
|
|
382
|
+
### Sequential Same-File Checks
|
|
383
|
+
|
|
384
|
+
Measures repeated checks across nearby declarations in one file, where a checker
|
|
385
|
+
can reuse the file-local environment instead of starting over for each scenario.
|
|
386
|
+
|
|
387
|
+
For each targetable declaration in the file, the benchmark checks the complete
|
|
388
|
+
declaration. When the declaration has a `:= by` proof body, it also checks a
|
|
389
|
+
partial scenario:
|
|
390
|
+
|
|
391
|
+
1. a partial version containing `sorry`, which should be accepted by Lean with
|
|
392
|
+
`sorry` detected;
|
|
393
|
+
2. the complete version, which must be accepted without `sorry`.
|
|
394
|
+
|
|
395
|
+
LeanProbe advances from `env before this declaration` to `env after this
|
|
396
|
+
declaration` only after the complete version succeeds. A partial or failing
|
|
397
|
+
scenario is reported, but it is not added to the cached state used for later
|
|
398
|
+
declarations. The Lake baselines rerun terminal checks for each scenario:
|
|
399
|
+
|
|
400
|
+
- `Lake growing-prefix total`: `lake env lean` on a temp file containing the
|
|
401
|
+
header, already accepted prior declarations, and the current scenario.
|
|
402
|
+
- `Lake full-file total`: `lake env lean` on a temp full file where only the
|
|
403
|
+
current declaration is replaced by the current scenario.
|
|
404
|
+
- `Probe cached total`: one LeanProbe/LeanInteract server reusing header and
|
|
405
|
+
prior-declaration environments across all scenarios.
|
|
406
|
+
- `Probe fresh total`: a fresh LeanProbe/LeanInteract server for every
|
|
407
|
+
scenario, showing the cost when there is no cache reuse.
|
|
408
|
+
|
|
409
|
+
## Current Results
|
|
410
|
+
|
|
411
|
+
Snapshot refreshed: May 13, 2026.
|
|
412
|
+
|
|
413
|
+
- Lean used for this snapshot: `4.30.0-rc2`
|
|
414
|
+
(`3dc1a088b6d2d8eafe25a7cd7ec7b58d731bd7cc`). Treat the tables as benchmark
|
|
415
|
+
context for that toolchain snapshot; rerun the benchmark suite for exact
|
|
416
|
+
numbers on a different Lean release or machine.
|
|
417
|
+
- In the tables below, `Probe` means LeanProbe.
|
|
418
|
+
|
|
419
|
+
| Environment | Machine | CPU / SoC | Cores / threads | Memory | Runtime and CPU details |
|
|
420
|
+
| --- | --- | --- | ---: | ---: | --- |
|
|
421
|
+
| macOS | MacBook Pro `Mac16,7` | Apple M4 Pro | 14 cores, no SMT reported; 10 performance + 4 efficiency | 24 GB unified memory | Darwin 25.4.0, arm64, Python 3.12.12 |
|
|
422
|
+
| Linux workstation | single-socket workstation | Intel Core i7-14700KF | 20 cores / 28 threads | 62 GiB RAM, 8 GiB swap | max 5.6 GHz, L2 28 MiB, L3 33 MiB, Linux 6.8.0-111-generic, Python 3.13.9 |
|
|
423
|
+
|
|
424
|
+
Run policy for repeated-target tables: 1 measured run per target, 0 benchmark
|
|
425
|
+
warmups, warm Lake caches from prior example validation, feedback enabled, and
|
|
426
|
+
fresh-server baseline enabled. Prepare time is shown separately and included in
|
|
427
|
+
break-even and amortized speedups. The Lake baseline writes a temp full file and
|
|
428
|
+
runs `lake env lean`.
|
|
429
|
+
|
|
430
|
+
### Repeated Target Checks
|
|
431
|
+
|
|
432
|
+
macOS:
|
|
433
|
+
|
|
434
|
+
| Example group | Targets | Lake full-file avg | Probe prepare avg | Probe cached check avg | Probe cached feedback avg | Probe fresh check avg | Fresh check / cached check |
|
|
435
|
+
| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
|
|
436
|
+
| `analysis_real` | 5 | 3.893s | 6.024s | 0.039s | 0.022s | 4.014s | 139.7x |
|
|
437
|
+
| `algebra_order` | 5 | 3.900s | 3.683s | 0.048s | 0.039s | 3.987s | 106.7x |
|
|
438
|
+
| `sets_functions` | 5 | 3.708s | 3.502s | 0.008s | 0.007s | 3.766s | 454.7x |
|
|
439
|
+
| `number_theory_nat` | 5 | 3.731s | 3.478s | 0.011s | 0.006s | 3.776s | 420.0x |
|
|
440
|
+
|
|
441
|
+
Linux:
|
|
442
|
+
|
|
443
|
+
| Example group | Targets | Lake full-file avg | Probe prepare avg | Probe cached check avg | Probe cached feedback avg | Probe fresh check avg | Fresh check / cached check |
|
|
444
|
+
| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
|
|
445
|
+
| `analysis_real` | 5 | 2.276s | 2.315s | 0.025s | 0.024s | 2.412s | 103.2x |
|
|
446
|
+
| `algebra_order` | 5 | 2.301s | 2.317s | 0.046s | 0.043s | 2.516s | 78.2x |
|
|
447
|
+
| `sets_functions` | 5 | 2.233s | 2.257s | 0.011s | 0.009s | 2.383s | 245.8x |
|
|
448
|
+
| `number_theory_nat` | 5 | 2.199s | 2.217s | 0.009s | 0.008s | 2.390s | 322.0x |
|
|
449
|
+
|
|
450
|
+
Column guide for repeated-target summary tables:
|
|
451
|
+
|
|
452
|
+
- `Lake full-file avg`: average wall time to write a temp full file with the
|
|
453
|
+
target declaration replaced, then run `lake env lean` on that file.
|
|
454
|
+
- `Probe prepare avg`: average wall time for `lean_probe_prepare`; this
|
|
455
|
+
warms imports/header and declarations before the target.
|
|
456
|
+
- `Probe cached check avg`: average `lean_probe_check` time after prepare,
|
|
457
|
+
checking only the target declaration against the cached environment.
|
|
458
|
+
- `Probe cached feedback avg`: average `lean_probe_feedback` time after prepare,
|
|
459
|
+
including diagnostics plus tactic/proof-state metadata.
|
|
460
|
+
- `Probe fresh check avg`: average time for the same target check with a new
|
|
461
|
+
LeanProbe/LeanInteract server and no prior cache reuse.
|
|
462
|
+
- `Fresh check / cached check`: `Probe fresh check avg / Probe cached check avg`;
|
|
463
|
+
larger values mean cache reuse matters more.
|
|
464
|
+
|
|
465
|
+
Per-target repeated-check rows are in [BENCHMARKS.md](BENCHMARKS.md).
|
|
466
|
+
|
|
467
|
+
### TCS Challenge Repeated Target Checks
|
|
468
|
+
|
|
469
|
+
Run policy: same as the compact repeated-target tables above. These rows cover
|
|
470
|
+
the 20 longer examples derived from the CodaBench TCS Proving source material.
|
|
471
|
+
|
|
472
|
+
Grouped summary:
|
|
473
|
+
|
|
474
|
+
| Platform | Example group | Targets | Lake full-file avg | Probe prepare avg | Probe cached check avg | Probe cached feedback avg | Probe fresh check avg | Fresh check / cached check |
|
|
475
|
+
| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
|
|
476
|
+
| macOS | `tcs_binary_heap` | 9 | 2.576s | 2.931s | 0.049s | 0.042s | 2.589s | 155.9x |
|
|
477
|
+
| macOS | `tcs_treap_analysis` | 2 | 2.082s | 2.219s | 0.034s | 0.034s | 2.181s | 77.8x |
|
|
478
|
+
| macOS | `tcs_weighted_graph` | 9 | 2.617s | 2.461s | 0.031s | 0.028s | 2.603s | 194.9x |
|
|
479
|
+
| Linux | `tcs_binary_heap` | 9 | 1.886s | 1.807s | 0.054s | 0.051s | 1.877s | 103.2x |
|
|
480
|
+
| Linux | `tcs_treap_analysis` | 2 | 1.495s | 1.441s | 0.036s | 0.040s | 1.560s | 53.1x |
|
|
481
|
+
| Linux | `tcs_weighted_graph` | 9 | 1.771s | 1.683s | 0.032s | 0.034s | 1.761s | 127.5x |
|
|
482
|
+
|
|
483
|
+
Per-target TCS rows are in [BENCHMARKS.md](BENCHMARKS.md).
|
|
484
|
+
|
|
485
|
+
### Sequential Same-File Checks
|
|
486
|
+
|
|
487
|
+
Run policy: 1 measured run per file, sequential execution, 5 declarations per
|
|
488
|
+
file. This benchmark models a file-level checking session:
|
|
489
|
+
|
|
490
|
+
1. check imports/header;
|
|
491
|
+
2. for each targetable declaration with a `:= by` proof body, check a partial
|
|
492
|
+
`sorry` version and confirm `sorry` is detected without hard errors;
|
|
493
|
+
3. check the full declaration and require valid-without-sorry;
|
|
494
|
+
4. advance the cached environment only after the full declaration succeeds.
|
|
495
|
+
|
|
496
|
+
| Platform | File | Declarations | Scenarios | Lake growing-prefix total | Lake full-file total | Probe cached total | Probe fresh total | Speedup vs growing-prefix Lake | Speedup vs full-file Lake | Speedup vs fresh Probe |
|
|
497
|
+
| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
|
|
498
|
+
| macOS | `analysis_real.lean` | 5 | 10 | 67.992s | 40.216s | 4.775s | 44.699s | 14.24x | 8.42x | 9.36x |
|
|
499
|
+
| macOS | `algebra_order.lean` | 5 | 10 | 46.870s | 48.163s | 4.339s | 40.953s | 10.80x | 11.10x | 9.44x |
|
|
500
|
+
| macOS | `sets_functions.lean` | 5 | 10 | 45.421s | 42.237s | 3.916s | 37.797s | 11.60x | 10.79x | 9.65x |
|
|
501
|
+
| macOS | `number_theory_nat.lean` | 5 | 10 | 36.474s | 36.648s | 3.789s | 36.736s | 9.63x | 9.67x | 9.70x |
|
|
502
|
+
| Linux | `analysis_real.lean` | 5 | 10 | 22.765s | 22.958s | 2.515s | 24.890s | 9.05x | 9.13x | 9.90x |
|
|
503
|
+
| Linux | `algebra_order.lean` | 5 | 10 | 23.186s | 23.489s | 2.547s | 25.147s | 9.10x | 9.22x | 9.87x |
|
|
504
|
+
| Linux | `sets_functions.lean` | 5 | 10 | 22.679s | 22.567s | 2.384s | 24.195s | 9.51x | 9.47x | 10.15x |
|
|
505
|
+
| Linux | `number_theory_nat.lean` | 5 | 10 | 22.593s | 22.829s | 2.301s | 24.086s | 9.82x | 9.92x | 10.47x |
|
|
506
|
+
|
|
507
|
+
Column guide for sequential same-file tables:
|
|
508
|
+
|
|
509
|
+
- `Declarations`: number of declarations walked in that file.
|
|
510
|
+
- `Scenarios`: number of checks performed. Here each declaration contributes two
|
|
511
|
+
scenarios: a partial declaration containing `sorry`, then the complete
|
|
512
|
+
declaration.
|
|
513
|
+
- `Lake growing-prefix total`: total terminal time for `lake env lean` on temp
|
|
514
|
+
prefix files containing header + already accepted prior declarations +
|
|
515
|
+
current scenario.
|
|
516
|
+
- `Lake full-file total`: total terminal time for `lake env lean` on temp full
|
|
517
|
+
files where only the current declaration is replaced by the scenario text.
|
|
518
|
+
- `Probe cached total`: total time for one LeanProbe/LeanInteract server
|
|
519
|
+
walking the same scenarios while reusing the same-file environment.
|
|
520
|
+
- `Probe fresh total`: total time for LeanProbe checks with a fresh
|
|
521
|
+
LeanProbe/LeanInteract server per scenario.
|
|
522
|
+
- `Speedup vs growing-prefix Lake`: `Lake growing-prefix total /
|
|
523
|
+
Probe cached total`.
|
|
524
|
+
- `Speedup vs full-file Lake`: `Lake full-file total / Probe cached total`.
|
|
525
|
+
- `Speedup vs fresh Probe`: `Probe fresh total / Probe cached total`;
|
|
526
|
+
this isolates the value of cache reuse within LeanProbe itself.
|
|
527
|
+
|
|
528
|
+
## Reproduce
|
|
529
|
+
|
|
530
|
+
Validate the standalone example files:
|
|
531
|
+
|
|
532
|
+
```bash
|
|
533
|
+
lake env lean /path/to/LeanProbe/examples/lean/analysis_real.lean
|
|
534
|
+
lake env lean /path/to/LeanProbe/examples/lean/algebra_order.lean
|
|
535
|
+
lake env lean /path/to/LeanProbe/examples/lean/sets_functions.lean
|
|
536
|
+
lake env lean /path/to/LeanProbe/examples/lean/number_theory_nat.lean
|
|
537
|
+
lake env lean /path/to/LeanProbe/examples/lean/tcs_binary_heap.lean
|
|
538
|
+
lake env lean /path/to/LeanProbe/examples/lean/tcs_treap_analysis.lean
|
|
539
|
+
lake env lean /path/to/LeanProbe/examples/lean/tcs_weighted_graph_prefix.lean
|
|
540
|
+
```
|
|
541
|
+
|
|
542
|
+
Run the target suite:
|
|
543
|
+
|
|
544
|
+
```bash
|
|
545
|
+
lean-probe benchmark-suite \
|
|
546
|
+
--cases-file examples/benchmark_cases.json \
|
|
547
|
+
--cwd /path/to/mathlib-lake-project \
|
|
548
|
+
--runs 1 --warmups 0 --include-feedback --include-no-cache \
|
|
549
|
+
--pretty
|
|
550
|
+
```
|
|
551
|
+
|
|
552
|
+
Run one sequential same-file benchmark. By default this includes terminal Lake
|
|
553
|
+
prefix checks, terminal Lake full-file checks, cached LeanProbe checks, and
|
|
554
|
+
no-cache LeanProbe checks:
|
|
555
|
+
|
|
556
|
+
```bash
|
|
557
|
+
lean-probe benchmark-file \
|
|
558
|
+
examples/lean/analysis_real.lean \
|
|
559
|
+
--cwd /path/to/mathlib-lake-project \
|
|
560
|
+
--runs 1 \
|
|
561
|
+
--pretty
|
|
562
|
+
```
|
|
563
|
+
|
|
564
|
+
To compare another verifier, pass it as a shell command. The `{file}` placeholder expands to the temp
|
|
565
|
+
full candidate file for the current partial/full scenario:
|
|
566
|
+
|
|
567
|
+
```bash
|
|
568
|
+
lean-probe benchmark-file \
|
|
569
|
+
examples/lean/analysis_real.lean \
|
|
570
|
+
--cwd /path/to/mathlib-lake-project \
|
|
571
|
+
--runs 1 \
|
|
572
|
+
--external-command 'custom-verify=/path/to/verify-file.sh {file}' \
|
|
573
|
+
--pretty
|
|
574
|
+
```
|
|
575
|
+
|
|
576
|
+
MCP tools are usually not invocable as shell commands, so benchmark them through a small
|
|
577
|
+
adapter script that calls the MCP tool for `{file}`, exits with a nonzero status on hard
|
|
578
|
+
failure, and prints a final JSON line with `success`, `ok`, `has_errors`, and
|
|
579
|
+
`has_sorry`.
|
|
580
|
+
|
|
581
|
+
Run Python tests:
|
|
582
|
+
|
|
583
|
+
```bash
|
|
584
|
+
python -m pytest -q
|
|
585
|
+
```
|
|
586
|
+
|
|
587
|
+
Run the optional real LeanInteract smoke test:
|
|
588
|
+
|
|
589
|
+
```bash
|
|
590
|
+
LEAN_PROBE_RUN_INTEGRATION=1 python -m pytest tests/test_integration.py -q
|
|
591
|
+
```
|
|
592
|
+
|
|
593
|
+
Validation for the May 13, 2026 numbers:
|
|
594
|
+
|
|
595
|
+
- every benchmark source file passed `lake env lean`;
|
|
596
|
+
- all compact and TCS repeated-target benchmark cases returned
|
|
597
|
+
`success=true`;
|
|
598
|
+
- all sequential same-file benchmark rows completed with matching Lake and
|
|
599
|
+
LeanProbe success status for the expected partial-sorry and full-declaration
|
|
600
|
+
scenarios;
|
|
601
|
+
- one intentionally broken replacement for `nat_mul_pos_bench` returned
|
|
602
|
+
`ok=false`, `has_errors=true`, a type-mismatch diagnostic, and non-empty
|
|
603
|
+
`feedback_lean`.
|
|
604
|
+
|
|
605
|
+
## Output Shape
|
|
606
|
+
|
|
607
|
+
`lean_probe_check` and `lean_probe_feedback` return JSON-compatible dictionaries:
|
|
608
|
+
|
|
609
|
+
- `success`: false for tool/project/backend failures;
|
|
610
|
+
- `ok`: true only when Lean accepts the target without `sorry`;
|
|
611
|
+
- `error_code`: stable machine-readable failure code when `success=false`;
|
|
612
|
+
- `timed_out`: true when the backend failure was classified as a timeout;
|
|
613
|
+
- `messages`: Lean diagnostics with both chunk-local and file-global positions;
|
|
614
|
+
- `tactics`: tactic text, ranges, goals, proof states, and used constants;
|
|
615
|
+
- `feedback_lean`: target declaration with inline feedback comments;
|
|
616
|
+
- `cache`: header/prior-declaration environment reuse metadata;
|
|
617
|
+
- `elapsed_s`: wall-clock time for the check.
|
|
618
|
+
|
|
619
|
+
Current `error_code` values include `no_project_root`, `file_not_found`,
|
|
620
|
+
`target_not_found`, `lean_interact_unavailable`, `lean_interact_start_failed`,
|
|
621
|
+
`header_failed`, `prior_decl_failed`, `dead_server`, `session_dead`,
|
|
622
|
+
`unknown_session`, `timeout`, and `backend_error`.
|
|
623
|
+
|
|
624
|
+
See [AGENT.md](AGENT.md) for the complete MCP output contract, including
|
|
625
|
+
`success` versus `ok`, proof-state stepping, and `feedback_lean`.
|
|
626
|
+
|
|
627
|
+
Declarations inside `mutual ... end` blocks are included as prior context for
|
|
628
|
+
later targets, but the individual declarations inside the mutual block are not
|
|
629
|
+
separate LeanProbe targets. If a requested target is found inside such a block,
|
|
630
|
+
LeanProbe returns `target_not_found` with a hint that explains the limitation.
|
|
631
|
+
|
|
632
|
+
## Backend Dependency
|
|
633
|
+
|
|
634
|
+
[LeanInteract](https://github.com/augustepoiroux/LeanInteract) is LeanProbe's
|
|
635
|
+
primary backend dependency. LeanInteract provides the Lean REPL process,
|
|
636
|
+
incremental elaboration, command responses, proof states, tactic stepping, and
|
|
637
|
+
the low-level interaction API.
|
|
638
|
+
|
|
639
|
+
LeanProbe builds on that backend with file segmentation, same-file declaration
|
|
640
|
+
targeting, warm prior environments, replacement checks, feedback annotation,
|
|
641
|
+
CLI commands, MCP tools, and reproducible benchmark harnesses.
|