autoloop-ai 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autoloop_ai-0.1.0/LICENSE +21 -0
- autoloop_ai-0.1.0/PKG-INFO +294 -0
- autoloop_ai-0.1.0/README.md +266 -0
- autoloop_ai-0.1.0/autoloop/__init__.py +24 -0
- autoloop_ai-0.1.0/autoloop/backends.py +226 -0
- autoloop_ai-0.1.0/autoloop/cli.py +69 -0
- autoloop_ai-0.1.0/autoloop/core.py +211 -0
- autoloop_ai-0.1.0/autoloop/metrics.py +92 -0
- autoloop_ai-0.1.0/autoloop_ai.egg-info/PKG-INFO +294 -0
- autoloop_ai-0.1.0/autoloop_ai.egg-info/SOURCES.txt +15 -0
- autoloop_ai-0.1.0/autoloop_ai.egg-info/dependency_links.txt +1 -0
- autoloop_ai-0.1.0/autoloop_ai.egg-info/entry_points.txt +2 -0
- autoloop_ai-0.1.0/autoloop_ai.egg-info/requires.txt +7 -0
- autoloop_ai-0.1.0/autoloop_ai.egg-info/top_level.txt +1 -0
- autoloop_ai-0.1.0/pyproject.toml +44 -0
- autoloop_ai-0.1.0/setup.cfg +4 -0
- autoloop_ai-0.1.0/tests/test_core.py +137 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Prahlad Menon
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: autoloop-ai
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: autoresearch for everything — autonomous iterative improvement for any system
|
|
5
|
+
Author-email: Prahlad Menon <prahlad.menon@quant.md>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/menonpg/autoloop
|
|
8
|
+
Project-URL: Repository, https://github.com/menonpg/autoloop
|
|
9
|
+
Project-URL: Issues, https://github.com/menonpg/autoloop/issues
|
|
10
|
+
Keywords: ai,agents,optimization,autoresearch,prompt-optimization,autonomous
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: click>=8.0
|
|
23
|
+
Provides-Extra: ollama
|
|
24
|
+
Requires-Dist: requests>=2.31; extra == "ollama"
|
|
25
|
+
Provides-Extra: all
|
|
26
|
+
Requires-Dist: requests>=2.31; extra == "all"
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# autoloop π
|
|
30
|
+
|
|
31
|
+
**autoresearch for everything.**
|
|
32
|
+
|
|
33
|
+
Karpathy's [autoresearch](https://github.com/karpathy/autoresearch) showed us the loop: point an AI agent at a problem, give it a metric, let it run 100 experiments overnight. Wake up to a better system.
|
|
34
|
+
|
|
35
|
+
That loop was hardcoded to ML training. **autoloop generalizes it to any domain.**
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from autoloop import AutoLoop
|
|
39
|
+
|
|
40
|
+
loop = AutoLoop(
|
|
41
|
+
target="optimize.py", # what the agent edits
|
|
42
|
+
metric=my_eval_function, # returns a float
|
|
43
|
+
directives="program.md", # research goals in plain English
|
|
44
|
+
budget_seconds=300, # per experiment (default: 5 min)
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
loop.run(experiments=100) # go to sleep
|
|
48
|
+
# wake up to a git log of 100 experiments and a better system
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## It Works — Here's a Real Test Run
|
|
52
|
+
|
|
53
|
+
We ran autoloop on a naive recursive fibonacci function, giving it 4 experiments to find a faster implementation. No human involved after the initial setup:
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
π Baseline score: -0.1717s (naive recursion, fibonacci(30))
|
|
57
|
+
|
|
58
|
+
🔬 Experiment 1/4
|
|
59
|
+
✅ KEPT | Score: -0.0249 (+0.1467) | Add memoization with dict cache
|
|
60
|
+
|
|
61
|
+
🔬 Experiment 2/4
|
|
62
|
+
❌ DISCARDED | Score: -0.0280 (-0.0030) | Switch to iterative approach
|
|
63
|
+
|
|
64
|
+
🔬 Experiment 3/4
|
|
65
|
+
❌ DISCARDED | Score: -999.000 (-998.97) | Wrong shortcut — should be discarded
|
|
66
|
+
|
|
67
|
+
🔬 Experiment 4/4
|
|
68
|
+
✅ KEPT | Score: -0.0217 (+0.0032) | Use functools.lru_cache decorator
|
|
69
|
+
|
|
70
|
+
π Run complete: 4 experiments | 2 improvements | Best: -0.0217s
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
**7.9x speedup from baseline.** Broken code (exp 3) was automatically detected and discarded via the correctness check in the metric. The loop kept every genuine improvement and rejected everything else.
|
|
74
|
+
|
|
75
|
+
## Why This Exists
|
|
76
|
+
|
|
77
|
+
autoresearch works because of three design decisions:
|
|
78
|
+
1. **Single file to modify** — keeps scope manageable, diffs reviewable
|
|
79
|
+
2. **Fixed time/compute budget** — makes experiments directly comparable
|
|
80
|
+
3. **One unambiguous metric** — enables full autonomy, no human judgment needed
|
|
81
|
+
|
|
82
|
+
These decisions aren't specific to ML training. They apply to any system you want to improve autonomously. autoloop is just the abstraction.
|
|
83
|
+
|
|
84
|
+
## What You Can Optimize
|
|
85
|
+
|
|
86
|
+
| Domain | Target file | Metric |
|
|
87
|
+
|--------|-------------|--------|
|
|
88
|
+
| **Prompt optimization** | `prompt.md` | LLM-as-judge score / task accuracy |
|
|
89
|
+
| **SQL queries** | `query.sql` | Execution time / rows returned |
|
|
90
|
+
| **Trading strategies** | `strategy.py` | Sharpe ratio / win rate |
|
|
91
|
+
| **API pipelines** | `pipeline.py` | Latency / success rate |
|
|
92
|
+
| **Test suites** | `tests.py` | Coverage / mutation score |
|
|
93
|
+
| **Compiler flags** | `build.sh` | Binary size / compile time |
|
|
94
|
+
| **Agent system prompts** | `system_prompt.md` | Task completion rate |
|
|
95
|
+
| **RAG pipelines** | `retrieval.py` | RAGAS score / hit rate |
|
|
96
|
+
|
|
97
|
+
## Install
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
pip install autoloop-ai
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Requires Python 3.10+. Works with any LLM agent backend (Claude Code, Codex, local models via Ollama).
|
|
104
|
+
|
|
105
|
+
## Quickstart
|
|
106
|
+
|
|
107
|
+
### 1. Define your target
|
|
108
|
+
|
|
109
|
+
The file your agent will edit. Start small — one function, one prompt, one query.
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
# optimize.py — your agent edits this
|
|
113
|
+
SYSTEM_PROMPT = """You are a helpful assistant."""
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### 2. Define your metric
|
|
117
|
+
|
|
118
|
+
A Python function that returns a float. Lower or higher = better (you configure which).
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
def my_metric(target_path: str) -> float:
|
|
122
|
+
"""Run eval and return score. autoloop calls this after every experiment."""
|
|
123
|
+
result = run_eval(target_path)
|
|
124
|
+
return result.accuracy # higher is better
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### 3. Write your directives
|
|
128
|
+
|
|
129
|
+
Plain English research goals in `program.md`. This is what you iterate on over time.
|
|
130
|
+
|
|
131
|
+
```markdown
|
|
132
|
+
# Research Directives
|
|
133
|
+
|
|
134
|
+
## Goal
|
|
135
|
+
Improve the system prompt to increase task completion rate on customer support queries.
|
|
136
|
+
|
|
137
|
+
## Hypotheses to explore
|
|
138
|
+
- More specific role definition
|
|
139
|
+
- Explicit handling of edge cases
|
|
140
|
+
- Chain-of-thought instructions
|
|
141
|
+
- Tone adjustments for different query types
|
|
142
|
+
|
|
143
|
+
## Constraints
|
|
144
|
+
- Keep under 500 tokens
|
|
145
|
+
- Must pass safety checks
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### 4. Run
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
from autoloop import AutoLoop
|
|
152
|
+
|
|
153
|
+
loop = AutoLoop(
|
|
154
|
+
target="optimize.py",
|
|
155
|
+
metric=my_metric,
|
|
156
|
+
directives="program.md",
|
|
157
|
+
budget_seconds=300,
|
|
158
|
+
agent="claude", # "claude", "codex", "ollama"
|
|
159
|
+
higher_is_better=True,
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
loop.run(experiments=100)
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
### 5. Review
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
autoloop history # git log of all experiments
|
|
169
|
+
autoloop best # show the best-performing version
|
|
170
|
+
autoloop diff 12 best # compare experiment 12 to best
|
|
171
|
+
autoloop rollback 12 # restore experiment 12
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
## How It Works
|
|
175
|
+
|
|
176
|
+
```
|
|
177
|
+
┌─────────────────────────────────────────────────────────┐
|
|
178
|
+
│                        autoloop                         │
|
|
179
|
+
│                                                         │
|
|
180
|
+
│   Read directives.md                                    │
|
|
181
|
+
│         │                                               │
|
|
182
|
+
│         ▼                                               │
|
|
183
|
+
│   Agent proposes modification to target file           │
|
|
184
|
+
│         │                                               │
|
|
185
|
+
│         ▼                                               │
|
|
186
|
+
│   Apply modification                                    │
|
|
187
|
+
│         │                                               │
|
|
188
|
+
│         ▼                                               │
|
|
189
|
+
│   Run metric() with fixed budget                        │
|
|
190
|
+
│         │                                               │
|
|
191
|
+
│         ▼                                               │
|
|
192
|
+
│   Score improved? ──YES──▶ git commit + update best     │
|
|
193
|
+
│         │                                               │
|
|
194
|
+
│         NO                                              │
|
|
195
|
+
│         │                                               │
|
|
196
|
+
│         ▼                                               │
|
|
197
|
+
│   Discard + log                                         │
|
|
198
|
+
│         │                                               │
|
|
199
|
+
│         ▼                                               │
|
|
200
|
+
│   Repeat N times                                        │
|
|
201
|
+
└─────────────────────────────────────────────────────────┘
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
Each experiment is logged with: timestamp, modification description, score delta, and the full diff. The git history is your research log.
|
|
205
|
+
|
|
206
|
+
## Advanced Usage
|
|
207
|
+
|
|
208
|
+
### Parallel experiments
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
loop.run(experiments=100, parallel=4) # 4 agents running simultaneously
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
### Custom agent backends
|
|
215
|
+
|
|
216
|
+
```python
|
|
217
|
+
from autoloop.backends import OllamaBackend
|
|
218
|
+
|
|
219
|
+
loop = AutoLoop(
|
|
220
|
+
target="prompt.md",
|
|
221
|
+
metric=my_metric,
|
|
222
|
+
directives="program.md",
|
|
223
|
+
backend=OllamaBackend(model="llama3.1:70b"),
|
|
224
|
+
)
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
### Warm starts
|
|
228
|
+
|
|
229
|
+
```python
|
|
230
|
+
# Resume from a previous run's best result
|
|
231
|
+
loop.run(experiments=50, warm_start="./autoloop-results/best.py")
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
### Metric composition
|
|
235
|
+
|
|
236
|
+
```python
|
|
237
|
+
from autoloop import CompositeMetric
|
|
238
|
+
|
|
239
|
+
metric = CompositeMetric([
|
|
240
|
+
(accuracy_metric, 0.7), # 70% weight
|
|
241
|
+
(latency_metric, 0.3), # 30% weight
|
|
242
|
+
])
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
## Examples
|
|
246
|
+
|
|
247
|
+
- [`examples/prompt_optimization/`](examples/prompt_optimization/) — optimize a Claude system prompt for customer support
|
|
248
|
+
- [`examples/sql_optimization/`](examples/sql_optimization/) — optimize a slow SQL query
|
|
249
|
+
- [`examples/trading_strategy/`](examples/trading_strategy/) — evolve a trading strategy (inspired by AutoStrategy)
|
|
250
|
+
- [`examples/rag_pipeline/`](examples/rag_pipeline/) — optimize a RAG retrieval pipeline
|
|
251
|
+
|
|
252
|
+
## Comparison to autoresearch
|
|
253
|
+
|
|
254
|
+
| | autoresearch | autoloop |
|
|
255
|
+
|--|--|--|
|
|
256
|
+
| Domain | ML training only | Any |
|
|
257
|
+
| Target | `train.py` | Any file |
|
|
258
|
+
| Metric | `val_bpb` | Any Python function |
|
|
259
|
+
| Budget | 5-min wall clock | Configurable |
|
|
260
|
+
| Agent | Claude Code / Codex | Any |
|
|
261
|
+
| Parallel | No | Yes |
|
|
262
|
+
|
|
263
|
+
autoloop is autoresearch with the ML-specific parts removed and replaced with a general interface.
|
|
264
|
+
|
|
265
|
+
## Philosophy
|
|
266
|
+
|
|
267
|
+
The insight from autoresearch isn't about ML. It's about loop design:
|
|
268
|
+
|
|
269
|
+
1. **Unambiguous feedback** — the metric must be objective and quantitative
|
|
270
|
+
2. **Fixed budget** — experiments must be comparable
|
|
271
|
+
3. **Narrow scope** — one file, reviewable diffs
|
|
272
|
+
4. **Overnight scale** — 100 experiments while you sleep
|
|
273
|
+
|
|
274
|
+
Wherever you can satisfy these four conditions, you can run autonomous improvement. autoloop makes that loop accessible without writing the scaffolding yourself.
|
|
275
|
+
|
|
276
|
+
## Roadmap
|
|
277
|
+
|
|
278
|
+
- [ ] Web UI for experiment visualization
|
|
279
|
+
- [ ] Multi-file optimization with dependency tracking
|
|
280
|
+
- [ ] MCP server (use autoloop as a tool inside Claude Code)
|
|
281
|
+
- [ ] Hosted experiment tracking (autoloop cloud)
|
|
282
|
+
- [ ] Pre-built metric libraries (RAGAS, finance, code quality)
|
|
283
|
+
|
|
284
|
+
## Contributing
|
|
285
|
+
|
|
286
|
+
PRs welcome. See [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
287
|
+
|
|
288
|
+
## License
|
|
289
|
+
|
|
290
|
+
MIT
|
|
291
|
+
|
|
292
|
+
---
|
|
293
|
+
|
|
294
|
+
*Inspired by [karpathy/autoresearch](https://github.com/karpathy/autoresearch). autoloop generalizes the loop.*
|
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
# autoloop π
|
|
2
|
+
|
|
3
|
+
**autoresearch for everything.**
|
|
4
|
+
|
|
5
|
+
Karpathy's [autoresearch](https://github.com/karpathy/autoresearch) showed us the loop: point an AI agent at a problem, give it a metric, let it run 100 experiments overnight. Wake up to a better system.
|
|
6
|
+
|
|
7
|
+
That loop was hardcoded to ML training. **autoloop generalizes it to any domain.**
|
|
8
|
+
|
|
9
|
+
```python
|
|
10
|
+
from autoloop import AutoLoop
|
|
11
|
+
|
|
12
|
+
loop = AutoLoop(
|
|
13
|
+
target="optimize.py", # what the agent edits
|
|
14
|
+
metric=my_eval_function, # returns a float
|
|
15
|
+
directives="program.md", # research goals in plain English
|
|
16
|
+
budget_seconds=300, # per experiment (default: 5 min)
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
loop.run(experiments=100) # go to sleep
|
|
20
|
+
# wake up to a git log of 100 experiments and a better system
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## It Works — Here's a Real Test Run
|
|
24
|
+
|
|
25
|
+
We ran autoloop on a naive recursive fibonacci function, giving it 4 experiments to find a faster implementation. No human involved after the initial setup:
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
π Baseline score: -0.1717s (naive recursion, fibonacci(30))
|
|
29
|
+
|
|
30
|
+
🔬 Experiment 1/4
|
|
31
|
+
✅ KEPT | Score: -0.0249 (+0.1467) | Add memoization with dict cache
|
|
32
|
+
|
|
33
|
+
🔬 Experiment 2/4
|
|
34
|
+
❌ DISCARDED | Score: -0.0280 (-0.0030) | Switch to iterative approach
|
|
35
|
+
|
|
36
|
+
🔬 Experiment 3/4
|
|
37
|
+
❌ DISCARDED | Score: -999.000 (-998.97) | Wrong shortcut — should be discarded
|
|
38
|
+
|
|
39
|
+
🔬 Experiment 4/4
|
|
40
|
+
✅ KEPT | Score: -0.0217 (+0.0032) | Use functools.lru_cache decorator
|
|
41
|
+
|
|
42
|
+
π Run complete: 4 experiments | 2 improvements | Best: -0.0217s
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
**7.9x speedup from baseline.** Broken code (exp 3) was automatically detected and discarded via the correctness check in the metric. The loop kept every genuine improvement and rejected everything else.
|
|
46
|
+
|
|
47
|
+
## Why This Exists
|
|
48
|
+
|
|
49
|
+
autoresearch works because of three design decisions:
|
|
50
|
+
1. **Single file to modify** — keeps scope manageable, diffs reviewable
|
|
51
|
+
2. **Fixed time/compute budget** — makes experiments directly comparable
|
|
52
|
+
3. **One unambiguous metric** — enables full autonomy, no human judgment needed
|
|
53
|
+
|
|
54
|
+
These decisions aren't specific to ML training. They apply to any system you want to improve autonomously. autoloop is just the abstraction.
|
|
55
|
+
|
|
56
|
+
## What You Can Optimize
|
|
57
|
+
|
|
58
|
+
| Domain | Target file | Metric |
|
|
59
|
+
|--------|-------------|--------|
|
|
60
|
+
| **Prompt optimization** | `prompt.md` | LLM-as-judge score / task accuracy |
|
|
61
|
+
| **SQL queries** | `query.sql` | Execution time / rows returned |
|
|
62
|
+
| **Trading strategies** | `strategy.py` | Sharpe ratio / win rate |
|
|
63
|
+
| **API pipelines** | `pipeline.py` | Latency / success rate |
|
|
64
|
+
| **Test suites** | `tests.py` | Coverage / mutation score |
|
|
65
|
+
| **Compiler flags** | `build.sh` | Binary size / compile time |
|
|
66
|
+
| **Agent system prompts** | `system_prompt.md` | Task completion rate |
|
|
67
|
+
| **RAG pipelines** | `retrieval.py` | RAGAS score / hit rate |
|
|
68
|
+
|
|
69
|
+
## Install
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
pip install autoloop-ai
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Requires Python 3.10+. Works with any LLM agent backend (Claude Code, Codex, local models via Ollama).
|
|
76
|
+
|
|
77
|
+
## Quickstart
|
|
78
|
+
|
|
79
|
+
### 1. Define your target
|
|
80
|
+
|
|
81
|
+
The file your agent will edit. Start small — one function, one prompt, one query.
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
# optimize.py — your agent edits this
|
|
85
|
+
SYSTEM_PROMPT = """You are a helpful assistant."""
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### 2. Define your metric
|
|
89
|
+
|
|
90
|
+
A Python function that returns a float. Lower or higher = better (you configure which).
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
def my_metric(target_path: str) -> float:
|
|
94
|
+
"""Run eval and return score. autoloop calls this after every experiment."""
|
|
95
|
+
result = run_eval(target_path)
|
|
96
|
+
return result.accuracy # higher is better
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### 3. Write your directives
|
|
100
|
+
|
|
101
|
+
Plain English research goals in `program.md`. This is what you iterate on over time.
|
|
102
|
+
|
|
103
|
+
```markdown
|
|
104
|
+
# Research Directives
|
|
105
|
+
|
|
106
|
+
## Goal
|
|
107
|
+
Improve the system prompt to increase task completion rate on customer support queries.
|
|
108
|
+
|
|
109
|
+
## Hypotheses to explore
|
|
110
|
+
- More specific role definition
|
|
111
|
+
- Explicit handling of edge cases
|
|
112
|
+
- Chain-of-thought instructions
|
|
113
|
+
- Tone adjustments for different query types
|
|
114
|
+
|
|
115
|
+
## Constraints
|
|
116
|
+
- Keep under 500 tokens
|
|
117
|
+
- Must pass safety checks
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### 4. Run
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
from autoloop import AutoLoop
|
|
124
|
+
|
|
125
|
+
loop = AutoLoop(
|
|
126
|
+
target="optimize.py",
|
|
127
|
+
metric=my_metric,
|
|
128
|
+
directives="program.md",
|
|
129
|
+
budget_seconds=300,
|
|
130
|
+
agent="claude", # "claude", "codex", "ollama"
|
|
131
|
+
higher_is_better=True,
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
loop.run(experiments=100)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### 5. Review
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
autoloop history # git log of all experiments
|
|
141
|
+
autoloop best # show the best-performing version
|
|
142
|
+
autoloop diff 12 best # compare experiment 12 to best
|
|
143
|
+
autoloop rollback 12 # restore experiment 12
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
## How It Works
|
|
147
|
+
|
|
148
|
+
```
|
|
149
|
+
┌─────────────────────────────────────────────────────────┐
|
|
150
|
+
│                        autoloop                         │
|
|
151
|
+
│                                                         │
|
|
152
|
+
│   Read directives.md                                    │
|
|
153
|
+
│         │                                               │
|
|
154
|
+
│         ▼                                               │
|
|
155
|
+
│   Agent proposes modification to target file           │
|
|
156
|
+
│         │                                               │
|
|
157
|
+
│         ▼                                               │
|
|
158
|
+
│   Apply modification                                    │
|
|
159
|
+
│         │                                               │
|
|
160
|
+
│         ▼                                               │
|
|
161
|
+
│   Run metric() with fixed budget                        │
|
|
162
|
+
│         │                                               │
|
|
163
|
+
│         ▼                                               │
|
|
164
|
+
│   Score improved? ──YES──▶ git commit + update best     │
|
|
165
|
+
│         │                                               │
|
|
166
|
+
│         NO                                              │
|
|
167
|
+
│         │                                               │
|
|
168
|
+
│         ▼                                               │
|
|
169
|
+
│   Discard + log                                         │
|
|
170
|
+
│         │                                               │
|
|
171
|
+
│         ▼                                               │
|
|
172
|
+
│   Repeat N times                                        │
|
|
173
|
+
└─────────────────────────────────────────────────────────┘
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
Each experiment is logged with: timestamp, modification description, score delta, and the full diff. The git history is your research log.
|
|
177
|
+
|
|
178
|
+
## Advanced Usage
|
|
179
|
+
|
|
180
|
+
### Parallel experiments
|
|
181
|
+
|
|
182
|
+
```python
|
|
183
|
+
loop.run(experiments=100, parallel=4) # 4 agents running simultaneously
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
### Custom agent backends
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
from autoloop.backends import OllamaBackend
|
|
190
|
+
|
|
191
|
+
loop = AutoLoop(
|
|
192
|
+
target="prompt.md",
|
|
193
|
+
metric=my_metric,
|
|
194
|
+
directives="program.md",
|
|
195
|
+
backend=OllamaBackend(model="llama3.1:70b"),
|
|
196
|
+
)
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
### Warm starts
|
|
200
|
+
|
|
201
|
+
```python
|
|
202
|
+
# Resume from a previous run's best result
|
|
203
|
+
loop.run(experiments=50, warm_start="./autoloop-results/best.py")
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
### Metric composition
|
|
207
|
+
|
|
208
|
+
```python
|
|
209
|
+
from autoloop import CompositeMetric
|
|
210
|
+
|
|
211
|
+
metric = CompositeMetric([
|
|
212
|
+
(accuracy_metric, 0.7), # 70% weight
|
|
213
|
+
(latency_metric, 0.3), # 30% weight
|
|
214
|
+
])
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
## Examples
|
|
218
|
+
|
|
219
|
+
- [`examples/prompt_optimization/`](examples/prompt_optimization/) — optimize a Claude system prompt for customer support
|
|
220
|
+
- [`examples/sql_optimization/`](examples/sql_optimization/) — optimize a slow SQL query
|
|
221
|
+
- [`examples/trading_strategy/`](examples/trading_strategy/) — evolve a trading strategy (inspired by AutoStrategy)
|
|
222
|
+
- [`examples/rag_pipeline/`](examples/rag_pipeline/) — optimize a RAG retrieval pipeline
|
|
223
|
+
|
|
224
|
+
## Comparison to autoresearch
|
|
225
|
+
|
|
226
|
+
| | autoresearch | autoloop |
|
|
227
|
+
|--|--|--|
|
|
228
|
+
| Domain | ML training only | Any |
|
|
229
|
+
| Target | `train.py` | Any file |
|
|
230
|
+
| Metric | `val_bpb` | Any Python function |
|
|
231
|
+
| Budget | 5-min wall clock | Configurable |
|
|
232
|
+
| Agent | Claude Code / Codex | Any |
|
|
233
|
+
| Parallel | No | Yes |
|
|
234
|
+
|
|
235
|
+
autoloop is autoresearch with the ML-specific parts removed and replaced with a general interface.
|
|
236
|
+
|
|
237
|
+
## Philosophy
|
|
238
|
+
|
|
239
|
+
The insight from autoresearch isn't about ML. It's about loop design:
|
|
240
|
+
|
|
241
|
+
1. **Unambiguous feedback** — the metric must be objective and quantitative
|
|
242
|
+
2. **Fixed budget** — experiments must be comparable
|
|
243
|
+
3. **Narrow scope** — one file, reviewable diffs
|
|
244
|
+
4. **Overnight scale** — 100 experiments while you sleep
|
|
245
|
+
|
|
246
|
+
Wherever you can satisfy these four conditions, you can run autonomous improvement. autoloop makes that loop accessible without writing the scaffolding yourself.
|
|
247
|
+
|
|
248
|
+
## Roadmap
|
|
249
|
+
|
|
250
|
+
- [ ] Web UI for experiment visualization
|
|
251
|
+
- [ ] Multi-file optimization with dependency tracking
|
|
252
|
+
- [ ] MCP server (use autoloop as a tool inside Claude Code)
|
|
253
|
+
- [ ] Hosted experiment tracking (autoloop cloud)
|
|
254
|
+
- [ ] Pre-built metric libraries (RAGAS, finance, code quality)
|
|
255
|
+
|
|
256
|
+
## Contributing
|
|
257
|
+
|
|
258
|
+
PRs welcome. See [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
259
|
+
|
|
260
|
+
## License
|
|
261
|
+
|
|
262
|
+
MIT
|
|
263
|
+
|
|
264
|
+
---
|
|
265
|
+
|
|
266
|
+
*Inspired by [karpathy/autoresearch](https://github.com/karpathy/autoresearch). autoloop generalizes the loop.*
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""
|
|
2
|
+
autoloop — autoresearch for everything.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from autoloop.core import AutoLoop
|
|
6
|
+
from autoloop.metrics import CompositeMetric
|
|
7
|
+
from autoloop.backends import (
|
|
8
|
+
AnthropicBackend,
|
|
9
|
+
OpenAIBackend,
|
|
10
|
+
OllamaBackend,
|
|
11
|
+
ClaudeBackend,
|
|
12
|
+
CodexBackend,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
__version__ = "0.1.0"
|
|
16
|
+
__all__ = [
|
|
17
|
+
"AutoLoop",
|
|
18
|
+
"CompositeMetric",
|
|
19
|
+
"AnthropicBackend",
|
|
20
|
+
"OpenAIBackend",
|
|
21
|
+
"OllamaBackend",
|
|
22
|
+
"ClaudeBackend",
|
|
23
|
+
"CodexBackend",
|
|
24
|
+
]
|