levi-evolve 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. levi_evolve-0.1.0/.gitignore +220 -0
  2. levi_evolve-0.1.0/LICENSE +21 -0
  3. levi_evolve-0.1.0/PKG-INFO +203 -0
  4. levi_evolve-0.1.0/README.md +131 -0
  5. levi_evolve-0.1.0/levi/__init__.py +124 -0
  6. levi_evolve-0.1.0/levi/artifacts/__init__.py +7 -0
  7. levi_evolve-0.1.0/levi/artifacts/base.py +80 -0
  8. levi_evolve-0.1.0/levi/artifacts/code.py +231 -0
  9. levi_evolve-0.1.0/levi/artifacts/prompt.py +609 -0
  10. levi_evolve-0.1.0/levi/behavior/__init__.py +8 -0
  11. levi_evolve-0.1.0/levi/behavior/extractor.py +215 -0
  12. levi_evolve-0.1.0/levi/behavior/features.py +139 -0
  13. levi_evolve-0.1.0/levi/clients/__init__.py +14 -0
  14. levi_evolve-0.1.0/levi/clients/_cli_common.py +31 -0
  15. levi_evolve-0.1.0/levi/clients/base.py +45 -0
  16. levi_evolve-0.1.0/levi/clients/claude_code.py +122 -0
  17. levi_evolve-0.1.0/levi/clients/codex.py +135 -0
  18. levi_evolve-0.1.0/levi/clients/lm.py +225 -0
  19. levi_evolve-0.1.0/levi/config/__init__.py +35 -0
  20. levi_evolve-0.1.0/levi/config/models.py +260 -0
  21. levi_evolve-0.1.0/levi/core/__init__.py +12 -0
  22. levi_evolve-0.1.0/levi/core/evaluation.py +31 -0
  23. levi_evolve-0.1.0/levi/core/program.py +25 -0
  24. levi_evolve-0.1.0/levi/core/types.py +11 -0
  25. levi_evolve-0.1.0/levi/demos/__init__.py +15 -0
  26. levi_evolve-0.1.0/levi/demos/aime.py +121 -0
  27. levi_evolve-0.1.0/levi/demos/circle_packing.py +119 -0
  28. levi_evolve-0.1.0/levi/equilibrium/__init__.py +11 -0
  29. levi_evolve-0.1.0/levi/equilibrium/equilibrium.py +511 -0
  30. levi_evolve-0.1.0/levi/equilibrium/prompts.py +187 -0
  31. levi_evolve-0.1.0/levi/init/__init__.py +6 -0
  32. levi_evolve-0.1.0/levi/init/diversifier.py +888 -0
  33. levi_evolve-0.1.0/levi/init/proxy_benchmark.py +223 -0
  34. levi_evolve-0.1.0/levi/methods/__init__.py +9 -0
  35. levi_evolve-0.1.0/levi/methods/levi.py +568 -0
  36. levi_evolve-0.1.0/levi/pipeline/__init__.py +16 -0
  37. levi_evolve-0.1.0/levi/pipeline/consumer.py +300 -0
  38. levi_evolve-0.1.0/levi/pipeline/producer.py +157 -0
  39. levi_evolve-0.1.0/levi/pipeline/runner.py +432 -0
  40. levi_evolve-0.1.0/levi/pipeline/state.py +553 -0
  41. levi_evolve-0.1.0/levi/pool/__init__.py +10 -0
  42. levi_evolve-0.1.0/levi/pool/cvt_map_elites.py +772 -0
  43. levi_evolve-0.1.0/levi/pool/protocol.py +72 -0
  44. levi_evolve-0.1.0/levi/prompt_opt/__init__.py +5 -0
  45. levi_evolve-0.1.0/levi/prompt_opt/optimizer.py +597 -0
  46. levi_evolve-0.1.0/levi/prompts/__init__.py +19 -0
  47. levi_evolve-0.1.0/levi/prompts/builder.py +155 -0
  48. levi_evolve-0.1.0/levi/prompts/bundle.py +188 -0
  49. levi_evolve-0.1.0/levi/selection/__init__.py +17 -0
  50. levi_evolve-0.1.0/levi/selection/component.py +242 -0
  51. levi_evolve-0.1.0/levi/utils/__init__.py +19 -0
  52. levi_evolve-0.1.0/levi/utils/code_extraction.py +73 -0
  53. levi_evolve-0.1.0/levi/utils/evaluation.py +161 -0
  54. levi_evolve-0.1.0/levi/utils/ids.py +8 -0
  55. levi_evolve-0.1.0/levi/utils/preflight.py +77 -0
  56. levi_evolve-0.1.0/levi/utils/resilient_pool.py +165 -0
  57. levi_evolve-0.1.0/pyproject.toml +103 -0
@@ -0,0 +1,220 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # OS-generated files
30
+ .DS_Store
31
+ .AppleDouble
32
+ .LSOverride
33
+ ._*
34
+ Thumbs.db
35
+ Desktop.ini
36
+
37
+ # PyInstaller
38
+ # Usually these files are written by a python script from a template
39
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
40
+ *.manifest
41
+ *.spec
42
+
43
+ # Installer logs
44
+ pip-log.txt
45
+ pip-delete-this-directory.txt
46
+
47
+ # Unit test / coverage reports
48
+ htmlcov/
49
+ .tox/
50
+ .nox/
51
+ .coverage
52
+ .coverage.*
53
+ .cache
54
+ nosetests.xml
55
+ coverage.xml
56
+ *.cover
57
+ *.py.cover
58
+ .hypothesis/
59
+ .pytest_cache/
60
+ cover/
61
+
62
+ # Translations
63
+ *.mo
64
+ *.pot
65
+
66
+ # Django stuff:
67
+ *.log
68
+ local_settings.py
69
+ db.sqlite3
70
+ db.sqlite3-journal
71
+
72
+ # Flask stuff:
73
+ instance/
74
+ .webassets-cache
75
+
76
+ # Scrapy stuff:
77
+ .scrapy
78
+
79
+ # Sphinx documentation
80
+ docs/_build/
81
+
82
+ # PyBuilder
83
+ .pybuilder/
84
+ target/
85
+
86
+ # Jupyter Notebook
87
+ .ipynb_checkpoints
88
+
89
+ # IPython
90
+ profile_default/
91
+ ipython_config.py
92
+
93
+ # pyenv
94
+ # For a library or package, you might want to ignore these files since the code is
95
+ # intended to run in multiple environments; otherwise, check them in:
96
+ # .python-version
97
+
98
+ # pipenv
99
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
100
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
101
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
102
+ # install all needed dependencies.
103
+ #Pipfile.lock
104
+
105
+ # UV
106
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
107
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
108
+ # commonly ignored for libraries.
109
+ #uv.lock
110
+
111
+ # poetry
112
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
113
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
114
+ # commonly ignored for libraries.
115
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
116
+ #poetry.lock
117
+ #poetry.toml
118
+
119
+ # pdm
120
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
121
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
122
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
123
+ #pdm.lock
124
+ #pdm.toml
125
+ .pdm-python
126
+ .pdm-build/
127
+
128
+ # pixi
129
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
130
+ #pixi.lock
131
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
132
+ # in the .venv directory. It is recommended not to include this directory in version control.
133
+ .pixi
134
+
135
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
136
+ __pypackages__/
137
+
138
+ # Celery stuff
139
+ celerybeat-schedule
140
+ celerybeat.pid
141
+
142
+ # SageMath parsed files
143
+ *.sage.py
144
+
145
+ # Environments
146
+ .env
147
+ .envrc
148
+ .venv
149
+ env/
150
+ venv/
151
+ ENV/
152
+ env.bak/
153
+ venv.bak/
154
+
155
+ # Spyder project settings
156
+ .spyderproject
157
+ .spyproject
158
+
159
+ # Rope project settings
160
+ .ropeproject
161
+
162
+ # mkdocs documentation
163
+ /site
164
+
165
+ # mypy
166
+ .mypy_cache/
167
+ .dmypy.json
168
+ dmypy.json
169
+
170
+ # Pyre type checker
171
+ .pyre/
172
+
173
+ # pytype static type analyzer
174
+ .pytype/
175
+
176
+ # Cython debug symbols
177
+ cython_debug/
178
+
179
+ # PyCharm
180
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
181
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
182
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
183
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
184
+ #.idea/
185
+
186
+ # Abstra
187
+ # Abstra is an AI-powered process automation framework.
188
+ # Ignore directories containing user credentials, local state, and settings.
189
+ # Learn more at https://abstra.io/docs
190
+ .abstra/
191
+
192
+ # Visual Studio Code
193
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
194
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
195
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
196
+ # you could uncomment the following to ignore the entire vscode folder
197
+ # .vscode/
198
+
199
+ # Ruff stuff:
200
+ .ruff_cache/
201
+
202
+ # PyPI configuration file
203
+ .pypirc
204
+
205
+ # Cursor
206
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
207
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
208
+ # refer to https://docs.cursor.com/context/ignore-files
209
+ .cursorignore
210
+ .cursorindexingignore
211
+
212
+ # Marimo
213
+ marimo/_static/
214
+ marimo/_lsp/
215
+ __marimo__/
216
+
217
+ # Levi run data and traces
218
+ runs/
219
+ **/traces/
220
+ *.swp
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Temoor Tanveer
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,203 @@
1
+ Metadata-Version: 2.4
2
+ Name: levi-evolve
3
+ Version: 0.1.0
4
+ Summary: Levi: Evolutionary optimization framework for algorithms and prompts
5
+ License: MIT License
6
+
7
+ Copyright (c) 2025 Temoor Tanveer
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ of this software and associated documentation files (the "Software"), to deal
11
+ in the Software without restriction, including without limitation the rights
12
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ copies of the Software, and to permit persons to whom the Software is
14
+ furnished to do so, subject to the following conditions:
15
+
16
+ The above copyright notice and this permission notice shall be included in all
17
+ copies or substantial portions of the Software.
18
+
19
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25
+ SOFTWARE.
26
+ License-File: LICENSE
27
+ Requires-Python: <3.13,>=3.11
28
+ Requires-Dist: colorama>=0.4.6
29
+ Requires-Dist: dspy>=3.1.2
30
+ Requires-Dist: litellm>=1.81.16
31
+ Requires-Dist: numpy>=2.3.5
32
+ Requires-Dist: scikit-learn>=1.4.0
33
+ Provides-Extra: demos
34
+ Requires-Dist: datasets>=3.0; extra == 'demos'
35
+ Provides-Extra: dev
36
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
37
+ Requires-Dist: ruff>=0.11.0; extra == 'dev'
38
+ Provides-Extra: example-cloudcast
39
+ Requires-Dist: networkx>=3.0; extra == 'example-cloudcast'
40
+ Provides-Extra: example-eplb
41
+ Requires-Dist: torch>=2.9.1; extra == 'example-eplb'
42
+ Provides-Extra: example-hover
43
+ Requires-Dist: bm25s>=0.3.6; extra == 'example-hover'
44
+ Requires-Dist: datasets<4; extra == 'example-hover'
45
+ Requires-Dist: pystemmer>=3.0.0; extra == 'example-hover'
46
+ Provides-Extra: example-ifbench
47
+ Requires-Dist: datasets>=3.0; extra == 'example-ifbench'
48
+ Requires-Dist: emoji>=2.0; extra == 'example-ifbench'
49
+ Requires-Dist: immutabledict>=4.0; extra == 'example-ifbench'
50
+ Requires-Dist: langdetect>=1.0.9; extra == 'example-ifbench'
51
+ Requires-Dist: litellm>=1.50; extra == 'example-ifbench'
52
+ Requires-Dist: nltk>=3.9; extra == 'example-ifbench'
53
+ Requires-Dist: packaging>=24.0; extra == 'example-ifbench'
54
+ Requires-Dist: spacy>=3.7; extra == 'example-ifbench'
55
+ Requires-Dist: syllapy>=0.7; extra == 'example-ifbench'
56
+ Provides-Extra: example-llm-sql
57
+ Requires-Dist: pandas>=2.3.3; extra == 'example-llm-sql'
58
+ Provides-Extra: examples
59
+ Requires-Dist: datasets>=3.0; extra == 'examples'
60
+ Requires-Dist: emoji>=2.0; extra == 'examples'
61
+ Requires-Dist: immutabledict>=4.0; extra == 'examples'
62
+ Requires-Dist: langdetect>=1.0.9; extra == 'examples'
63
+ Requires-Dist: litellm>=1.50; extra == 'examples'
64
+ Requires-Dist: networkx>=3.0; extra == 'examples'
65
+ Requires-Dist: nltk>=3.9; extra == 'examples'
66
+ Requires-Dist: packaging>=24.0; extra == 'examples'
67
+ Requires-Dist: pandas>=2.3.3; extra == 'examples'
68
+ Requires-Dist: spacy>=3.7; extra == 'examples'
69
+ Requires-Dist: syllapy>=0.7; extra == 'examples'
70
+ Requires-Dist: torch>=2.9.1; extra == 'examples'
71
+ Description-Content-Type: text/markdown
72
+
73
+ <p align="center">
74
+ <img src="assets/logos/levi_logo_dark.svg#gh-dark-mode-only" width="25%" alt="LEVI" />
75
+ <img src="assets/logos/levi_logo_light.svg#gh-light-mode-only" width="25%" alt="LEVI" />
76
+ </p>
77
+
78
+ <p align="center"><strong>AlphaEvolve Performance for a Fraction of the Cost</strong></p>
79
+
80
+ <p align="center">
81
+ <a href="https://github.com/ttanv/levi/actions/workflows/ci.yml?query=branch%3Amain"><img src="https://github.com/ttanv/levi/actions/workflows/ci.yml/badge.svg?branch=main" alt="CI"></a>
82
+ <a href="https://www.python.org/downloads/"><img src="https://img.shields.io/badge/python-3.11%20%7C%203.12-blue.svg" alt="Python 3.11 | 3.12"></a>
83
+ <a href="LICENSE"><img src="https://img.shields.io/badge/License-MIT-green.svg" alt="License: MIT"></a>
84
+ <a href="https://arxiv.org/abs/2605.09764"><img src="https://img.shields.io/badge/arXiv-2605.09764-b31b1b.svg" alt="arXiv"></a>
85
+ </p>
86
+
87
+ ---
88
+
89
+ LEVI is an LLM-guided evolutionary framework for **code** and **prompts**. Point it at a scoring function and a budget, and LEVI evolves the artifact for you using API models, a local server, or your Claude Code / Codex CLI subscription. Across a variety of problems, **$5 with a local Qwen 30B improves on what other frameworks need $30 and Claude Opus to achieve**.
90
+
91
+ ## Why LEVI
92
+
93
+ Existing frameworks couple performance tightly to model capability. Drop to a smaller model and results degrade sharply. LEVI decouples the two by making **diversity an architectural concern** rather than a model concern, and by matching model capacity to task demand.
94
+
95
+ Cheap models handle the bulk of mutation work. A behavioral archive keeps structurally different strategies alive, preventing premature convergence. Periodic paradigm shifts from a stronger model inject genuinely new ideas. The result: you spend less and get more.
96
+
97
+
98
+ <p align="center">
99
+ <img src="assets/plots/figure_front_page.png" width="100%" alt="LEVI vs baselines on code and prompt optimization" />
100
+ </p>
101
+ <p align="center"><em>LEVI on code optimization (exceeds every baseline's final score within 1/15th of the evaluations) and prompt optimization (outperforms GEPA at less than half the rollouts).</em></p>
102
+
103
+ ## Quickstart
104
+
105
+ ```bash
106
+ # Install uv first: https://docs.astral.sh/uv/getting-started/installation/
107
+ git clone https://github.com/ttanv/levi.git
108
+ cd levi
109
+ uv sync
110
+ ```
111
+
112
+ Pick whichever path matches what you have access to — each is a single self-contained file under [`examples/quickstart/`](examples/quickstart/) that runs in a couple of minutes:
113
+
114
+ | You have… | Run | Evolves | Costs you |
115
+ | ---------------------------------------- | --------------------------------------------------------- | ------- | ------------------------ |
116
+ | a Claude Code or Codex CLI subscription | `uv run python examples/quickstart/quickstart_claude.py` (or `quickstart_codex.py`) | code | $0 (subscription quota) |
117
+ | an API key (OpenAI / Anthropic / …) | `uv run python examples/quickstart/quickstart_api.py` | code | ~$0.05–0.10 |
118
+ | an API key, and you want to tune prompts | `uv run python examples/quickstart/quickstart_prompts.py` | prompts | ~$0.05–0.10 |
119
+
120
+ The CLI rows use your existing Claude or Codex subscription — no API key needed. For the API rows, set `OPENAI_API_KEY` (or change `MODEL` at the top of the file to another [litellm provider](https://docs.litellm.ai/docs/providers) and set the matching key) before running.
121
+
122
+ A minimal LEVI program looks like this:
123
+
124
+ ```python
125
+ import levi
126
+
127
+ result = levi.evolve_code(
128
+ "Place 16 non-overlapping circles in the unit square. Maximize sum of radii.",
129
+ function_signature="def run_packing() -> tuple[np.ndarray, np.ndarray, float]: ...",
130
+ score_fn=score_fn, # see levi.demos.circle_packing
131
+ model="openai/gpt-4o-mini",
132
+ budget_dollars=0.10,
133
+ )
134
+ print(result.best_score, result.best_program)
135
+ ```
136
+
137
+ See [`examples/quickstart/quickstart_api.py`](examples/quickstart/quickstart_api.py) for a runnable version. Output snapshots write to `./runs/<timestamp>/` relative to your CWD; override with `output_dir="path/to/dir"`.
138
+
139
+ ## Going further
140
+
141
+ - `examples/quickstart/` — the four single-file starters above (three code, one prompt). The three code starters evolve a mini n=16 circle-packing function; the prompt starter tunes a prompt for AIME math on a small Qwen.
142
+ - `examples/benchmarks/code/` — `evolve_code` at paper scale: circle packing (n=26, $15 budget, paradigm + mutation models) and the seven ADRS Leaderboard problems from the paper.
143
+ - `examples/benchmarks/prompts/` — `evolve_prompts` benchmarks comparing against GEPA: HotpotQA, HoVer, PUPA, IFBench.
144
+ - See [`examples/benchmarks/README.md`](examples/benchmarks/README.md) for datasets, keys, and per-problem setup.
145
+
146
+ ## Results
147
+
148
+ LEVI holds the **highest average score (76.5)** across all seven [ADRS Leaderboard](https://ucbskyadrs.github.io/) problems, ahead of GEPA (71.9), OpenEvolve (70.6), and ShinkaEvolve (67.4). Six of the seven problems were solved on a **$4.50 budget**.
149
+
150
+ | Problem | LEVI | Best Other Framework | Saving |
151
+ |---------|------|----------------------|--------|
152
+ | Spot Single-Reg | **51.7** | GEPA 51.4 | 6.7x cheaper |
153
+ | Spot Multi-Reg | **72.4** | OpenEvolve 66.7 | 5.6x cheaper |
154
+ | LLM-SQL | **78.3** | OpenEvolve 72.5 | 4.4x cheaper |
155
+ | Cloudcast | **100.0** | GEPA 96.6 | 3.3x cheaper |
156
+ | Prism | **87.4** | GEPA / OpenEvolve / ShinkaEvolve 87.4 | 3.3x cheaper |
157
+ | EPLB | **74.6** | GEPA 70.2 | 3.3x cheaper |
158
+ | Txn Scheduling | **71.1** | OpenEvolve 70.0 | 1.5x cheaper |
159
+
160
+ <p align="center">
161
+ <img src="assets/plots/circle_packing_best.png#gh-dark-mode-only" width="50%" alt="Circle Packing" />
162
+ <img src="assets/plots/circle_packing_best_light.png#gh-light-mode-only" width="50%" alt="Circle Packing" />
163
+ </p>
164
+
165
+ LEVI reached a **sum of radii of 2.6359+** on the n=26 circle packing benchmark, with a local model handling the majority of mutations. See [`examples/benchmarks/code/circle_packing`](examples/benchmarks/code/circle_packing) for the full setup.
166
+
167
+ For advanced routing, pass a `levi.LM(...)` directly:
168
+
169
+ ```python
170
+ local_qwen = levi.LM(
171
+ "Qwen/Qwen3-30B-A3B-Instruct-2507",
172
+ api_base="http://localhost:8000/v1",
173
+ api_key="unused",
174
+ input_cost_per_token=0.0000001,
175
+ output_cost_per_token=0.0000004,
176
+ )
177
+ ```
178
+
179
+ ## How It Works
180
+
181
+ 1. **Seed & score.** You provide a starting program and a scoring function. LEVI generates diverse variants to populate a behavioral archive.
182
+ 2. **Evolve.** Cheap models mutate and refine solutions in parallel. The behavioral archive keeps structurally different strategies alive, preventing premature convergence.
183
+ 3. **Paradigm shifts.** Periodically, a stronger model proposes entirely new algorithmic approaches based on the archive's best ideas.
184
+ 4. **Budget stops.** LEVI tracks spend in real time and stops when your dollar, evaluation, or time cap is hit.
185
+
186
+ ## Further Reading
187
+
188
+ - [LEVI: LLM-Guided Evolutionary Search Needs Better Harnesses, Not Bigger Models](https://ucbskyadrs.github.io/blog/levi/) — The full blog post on the ADRS site.
189
+
190
+ ## Citation
191
+
192
+ If you use LEVI in your research, please cite:
193
+
194
+ ```bibtex
195
+ @software{tanveer2026levi,
196
+ title = {LEVI: LLM-Guided Evolutionary Search Needs Better Harnesses, Not Bigger Models},
197
+ author = {Tanveer, Temoor},
198
+ url = {https://github.com/ttanv/levi},
199
+ year = {2026}
200
+ }
201
+ ```
202
+
203
+ Contact: ttanveer@alumni.cmu.edu
@@ -0,0 +1,131 @@
1
+ <p align="center">
2
+ <img src="assets/logos/levi_logo_dark.svg#gh-dark-mode-only" width="25%" alt="LEVI" />
3
+ <img src="assets/logos/levi_logo_light.svg#gh-light-mode-only" width="25%" alt="LEVI" />
4
+ </p>
5
+
6
+ <p align="center"><strong>AlphaEvolve Performance for a Fraction of the Cost</strong></p>
7
+
8
+ <p align="center">
9
+ <a href="https://github.com/ttanv/levi/actions/workflows/ci.yml?query=branch%3Amain"><img src="https://github.com/ttanv/levi/actions/workflows/ci.yml/badge.svg?branch=main" alt="CI"></a>
10
+ <a href="https://www.python.org/downloads/"><img src="https://img.shields.io/badge/python-3.11%20%7C%203.12-blue.svg" alt="Python 3.11 | 3.12"></a>
11
+ <a href="LICENSE"><img src="https://img.shields.io/badge/License-MIT-green.svg" alt="License: MIT"></a>
12
+ <a href="https://arxiv.org/abs/2605.09764"><img src="https://img.shields.io/badge/arXiv-2605.09764-b31b1b.svg" alt="arXiv"></a>
13
+ </p>
14
+
15
+ ---
16
+
17
+ LEVI is an LLM-guided evolutionary framework for **code** and **prompts**. Point it at a scoring function and a budget, and LEVI evolves the artifact for you using API models, a local server, or your Claude Code / Codex CLI subscription. Across a variety of problems, **$5 with a local Qwen 30B improves on what other frameworks need $30 and Claude Opus to achieve**.
18
+
19
+ ## Why LEVI
20
+
21
+ Existing frameworks couple performance tightly to model capability. Drop to a smaller model and results degrade sharply. LEVI decouples the two by making **diversity an architectural concern** rather than a model concern, and by matching model capacity to task demand.
22
+
23
+ Cheap models handle the bulk of mutation work. A behavioral archive keeps structurally different strategies alive, preventing premature convergence. Periodic paradigm shifts from a stronger model inject genuinely new ideas. The result: you spend less and get more.
24
+
25
+
26
+ <p align="center">
27
+ <img src="assets/plots/figure_front_page.png" width="100%" alt="LEVI vs baselines on code and prompt optimization" />
28
+ </p>
29
+ <p align="center"><em>LEVI on code optimization (exceeds every baseline's final score within 1/15th of the evaluations) and prompt optimization (outperforms GEPA at less than half the rollouts).</em></p>
30
+
31
+ ## Quickstart
32
+
33
+ ```bash
34
+ # Install uv first: https://docs.astral.sh/uv/getting-started/installation/
35
+ git clone https://github.com/ttanv/levi.git
36
+ cd levi
37
+ uv sync
38
+ ```
39
+
40
+ Pick whichever path matches what you have access to — each is a single self-contained file under [`examples/quickstart/`](examples/quickstart/) that runs in a couple of minutes:
41
+
42
+ | You have… | Run | Evolves | Costs you |
43
+ | ---------------------------------------- | --------------------------------------------------------- | ------- | ------------------------ |
44
+ | a Claude Code or Codex CLI subscription | `uv run python examples/quickstart/quickstart_claude.py` (or `quickstart_codex.py`) | code | $0 (subscription quota) |
45
+ | an API key (OpenAI / Anthropic / …) | `uv run python examples/quickstart/quickstart_api.py` | code | ~$0.05–0.10 |
46
+ | an API key, and you want to tune prompts | `uv run python examples/quickstart/quickstart_prompts.py` | prompts | ~$0.05–0.10 |
47
+
48
+ The CLI rows use your existing Claude or Codex subscription — no API key needed. For the API rows, set `OPENAI_API_KEY` (or change `MODEL` at the top of the file to another [litellm provider](https://docs.litellm.ai/docs/providers) and set the matching key) before running.
49
+
50
+ A minimal LEVI program looks like this:
51
+
52
+ ```python
53
+ import levi
54
+
55
+ result = levi.evolve_code(
56
+ "Place 16 non-overlapping circles in the unit square. Maximize sum of radii.",
57
+ function_signature="def run_packing() -> tuple[np.ndarray, np.ndarray, float]: ...",
58
+ score_fn=score_fn, # see levi.demos.circle_packing
59
+ model="openai/gpt-4o-mini",
60
+ budget_dollars=0.10,
61
+ )
62
+ print(result.best_score, result.best_program)
63
+ ```
64
+
65
+ See [`examples/quickstart/quickstart_api.py`](examples/quickstart/quickstart_api.py) for a runnable version. Output snapshots write to `./runs/<timestamp>/` relative to your CWD; override with `output_dir="path/to/dir"`.
66
+
67
+ ## Going further
68
+
69
+ - `examples/quickstart/` — the four single-file starters above (three code, one prompt). The three code starters evolve a mini n=16 circle-packing function; the prompt starter tunes a prompt for AIME math on a small Qwen.
70
+ - `examples/benchmarks/code/` — `evolve_code` at paper scale: circle packing (n=26, $15 budget, paradigm + mutation models) and the seven ADRS Leaderboard problems from the paper.
71
+ - `examples/benchmarks/prompts/` — `evolve_prompts` benchmarks comparing against GEPA: HotpotQA, HoVer, PUPA, IFBench.
72
+ - See [`examples/benchmarks/README.md`](examples/benchmarks/README.md) for datasets, keys, and per-problem setup.
73
+
74
+ ## Results
75
+
76
+ LEVI holds the **highest average score (76.5)** across all seven [ADRS Leaderboard](https://ucbskyadrs.github.io/) problems, ahead of GEPA (71.9), OpenEvolve (70.6), and ShinkaEvolve (67.4). Six of the seven problems were solved on a **$4.50 budget**.
77
+
78
+ | Problem | LEVI | Best Other Framework | Saving |
79
+ |---------|------|----------------------|--------|
80
+ | Spot Single-Reg | **51.7** | GEPA 51.4 | 6.7x cheaper |
81
+ | Spot Multi-Reg | **72.4** | OpenEvolve 66.7 | 5.6x cheaper |
82
+ | LLM-SQL | **78.3** | OpenEvolve 72.5 | 4.4x cheaper |
83
+ | Cloudcast | **100.0** | GEPA 96.6 | 3.3x cheaper |
84
+ | Prism | **87.4** | GEPA / OpenEvolve / ShinkaEvolve 87.4 | 3.3x cheaper |
85
+ | EPLB | **74.6** | GEPA 70.2 | 3.3x cheaper |
86
+ | Txn Scheduling | **71.1** | OpenEvolve 70.0 | 1.5x cheaper |
87
+
88
+ <p align="center">
89
+ <img src="assets/plots/circle_packing_best.png#gh-dark-mode-only" width="50%" alt="Circle Packing" />
90
+ <img src="assets/plots/circle_packing_best_light.png#gh-light-mode-only" width="50%" alt="Circle Packing" />
91
+ </p>
92
+
93
+ LEVI reached a **sum of radii of 2.6359+** on the n=26 circle packing benchmark, with a local model handling the majority of mutations. See [`examples/benchmarks/code/circle_packing`](examples/benchmarks/code/circle_packing) for the full setup.
94
+
95
+ For advanced routing, pass a `levi.LM(...)` directly:
96
+
97
+ ```python
98
+ local_qwen = levi.LM(
99
+ "Qwen/Qwen3-30B-A3B-Instruct-2507",
100
+ api_base="http://localhost:8000/v1",
101
+ api_key="unused",
102
+ input_cost_per_token=0.0000001,
103
+ output_cost_per_token=0.0000004,
104
+ )
105
+ ```
106
+
107
+ ## How It Works
108
+
109
+ 1. **Seed & score.** You provide a starting program and a scoring function. LEVI generates diverse variants to populate a behavioral archive.
110
+ 2. **Evolve.** Cheap models mutate and refine solutions in parallel. The behavioral archive keeps structurally different strategies alive, preventing premature convergence.
111
+ 3. **Paradigm shifts.** Periodically, a stronger model proposes entirely new algorithmic approaches based on the archive's best ideas.
112
+ 4. **Budget stops.** LEVI tracks spend in real time and stops when your dollar, evaluation, or time cap is hit.
113
+
114
+ ## Further Reading
115
+
116
+ - [LEVI: LLM-Guided Evolutionary Search Needs Better Harnesses, Not Bigger Models](https://ucbskyadrs.github.io/blog/levi/) — The full blog post on the ADRS site.
117
+
118
+ ## Citation
119
+
120
+ If you use LEVI in your research, please cite:
121
+
122
+ ```bibtex
123
+ @software{tanveer2026levi,
124
+ title = {LEVI: LLM-Guided Evolutionary Search Needs Better Harnesses, Not Bigger Models},
125
+ author = {Tanveer, Temoor},
126
+ url = {https://github.com/ttanv/levi},
127
+ year = {2026}
128
+ }
129
+ ```
130
+
131
+ Contact: ttanveer@alumni.cmu.edu