hypara 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hypara-0.1.0/LICENSE +21 -0
- hypara-0.1.0/MANIFEST.in +9 -0
- hypara-0.1.0/PKG-INFO +264 -0
- hypara-0.1.0/README.md +235 -0
- hypara-0.1.0/configs/full.json +17 -0
- hypara-0.1.0/configs/smoke.json +11 -0
- hypara-0.1.0/optimizers/hill_climb/main.py +161 -0
- hypara-0.1.0/optimizers/hill_climb/manifest.json +4 -0
- hypara-0.1.0/optimizers/random_search/main.py +73 -0
- hypara-0.1.0/optimizers/random_search/manifest.json +4 -0
- hypara-0.1.0/pyproject.toml +51 -0
- hypara-0.1.0/setup.cfg +4 -0
- hypara-0.1.0/src/hypara/__init__.py +4 -0
- hypara-0.1.0/src/hypara/cli.py +192 -0
- hypara-0.1.0/src/hypara/event_log.py +44 -0
- hypara-0.1.0/src/hypara/metrics.py +153 -0
- hypara-0.1.0/src/hypara/problems/__init__.py +0 -0
- hypara-0.1.0/src/hypara/problems/_util.py +30 -0
- hypara-0.1.0/src/hypara/problems/base.py +62 -0
- hypara-0.1.0/src/hypara/problems/conditional_knobs.py +91 -0
- hypara-0.1.0/src/hypara/problems/cost_aware.py +58 -0
- hypara-0.1.0/src/hypara/problems/dispatch_policy.py +86 -0
- hypara-0.1.0/src/hypara/problems/image_pipeline.py +82 -0
- hypara-0.1.0/src/hypara/problems/multi_fidelity.py +70 -0
- hypara-0.1.0/src/hypara/problems/noisy_lab.py +52 -0
- hypara-0.1.0/src/hypara/problems/rag_pipeline.py +90 -0
- hypara-0.1.0/src/hypara/problems/rugged_trap.py +54 -0
- hypara-0.1.0/src/hypara/problems/smooth_hill.py +43 -0
- hypara-0.1.0/src/hypara/problems/sparse_needle.py +59 -0
- hypara-0.1.0/src/hypara/protocol.py +92 -0
- hypara-0.1.0/src/hypara/registry.py +42 -0
- hypara-0.1.0/src/hypara/runner.py +248 -0
- hypara-0.1.0/src/hypara/space.py +224 -0
- hypara-0.1.0/src/hypara/transport.py +116 -0
- hypara-0.1.0/src/hypara.egg-info/PKG-INFO +264 -0
- hypara-0.1.0/src/hypara.egg-info/SOURCES.txt +43 -0
- hypara-0.1.0/src/hypara.egg-info/dependency_links.txt +1 -0
- hypara-0.1.0/src/hypara.egg-info/entry_points.txt +2 -0
- hypara-0.1.0/src/hypara.egg-info/requires.txt +5 -0
- hypara-0.1.0/src/hypara.egg-info/top_level.txt +1 -0
- hypara-0.1.0/tests/test_problems.py +76 -0
- hypara-0.1.0/tests/test_runner.py +144 -0
- hypara-0.1.0/tests/test_space.py +100 -0
- hypara-0.1.0/tests/test_stage2.py +106 -0
- hypara-0.1.0/tests/test_transport.py +98 -0
hypara-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 jun76
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
hypara-0.1.0/MANIFEST.in
ADDED
hypara-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hypara
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Benchmark harness for black-box optimizers that speak an ask/tell JSON Lines protocol
|
|
5
|
+
Author-email: jun76 <jun76.main@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/jun76/hypara
|
|
8
|
+
Project-URL: Repository, https://github.com/jun76/hypara
|
|
9
|
+
Project-URL: Issues, https://github.com/jun76/hypara/issues
|
|
10
|
+
Keywords: benchmark,optimization,black-box,hyperparameter-tuning,ask-tell,llm
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering
|
|
20
|
+
Classifier: Topic :: Software Development :: Testing
|
|
21
|
+
Requires-Python: >=3.11
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
26
|
+
Requires-Dist: build>=1; extra == "dev"
|
|
27
|
+
Requires-Dist: twine>=5; extra == "dev"
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# hypara
|
|
31
|
+
|
|
32
|
+
A benchmark harness for measuring how well an optimizer searches an **unknown
|
|
33
|
+
black-box evaluation function**.
|
|
34
|
+
|
|
35
|
+
hypara is deliberately not about solving famous problems (TSP, knapsack, bin
|
|
36
|
+
packing) where a strong off-the-shelf solver wins. Each problem ships a
|
|
37
|
+
natural-language description, a mixed search space, and a *hidden* evaluator
|
|
38
|
+
whose shape changes with the instance seed. To score well an optimizer has to
|
|
39
|
+
read the description, reason about the space, and adapt its strategy from the
|
|
40
|
+
evaluation history within a limited budget.
|
|
41
|
+
|
|
42
|
+
Optimizers are **language-agnostic external processes**: they talk to the
|
|
43
|
+
runner over a stdin/stdout JSON Lines protocol, so an optimizer can be written
|
|
44
|
+
in Python, Rust, Go, TypeScript, or any executable.
|
|
45
|
+
|
|
46
|
+
## Install
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install hypara
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
For development (tests + build tooling):
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install -e .[dev]
|
|
56
|
+
python -m pytest
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Quickstart
|
|
60
|
+
|
|
61
|
+
List the built-in problems:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
hypara list
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Write a minimal optimizer. Create `my_opt/manifest.json`:
|
|
68
|
+
|
|
69
|
+
```json
|
|
70
|
+
{"name": "my_opt", "command": ["python", "main.py"]}
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
and `my_opt/main.py`:
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
import json, random, sys
|
|
77
|
+
|
|
78
|
+
space = []
|
|
79
|
+
rng = random.Random()
|
|
80
|
+
|
|
81
|
+
def send(msg):
|
|
82
|
+
sys.stdout.write(json.dumps(msg) + "\n")
|
|
83
|
+
sys.stdout.flush()
|
|
84
|
+
|
|
85
|
+
for line in sys.stdin:
|
|
86
|
+
msg = json.loads(line)
|
|
87
|
+
t = msg.get("type")
|
|
88
|
+
if t == "init":
|
|
89
|
+
space = msg["problem"]["space"]
|
|
90
|
+
rng = random.Random(msg.get("optimizer_seed"))
|
|
91
|
+
send({"type": "ready"})
|
|
92
|
+
elif t == "ask":
|
|
93
|
+
# propose a candidate: uniform random pick for every unconditional param (conditional params are skipped, so spaces where one becomes active need extra handling)
|
|
94
|
+
cand = {}
|
|
95
|
+
for p in space:
|
|
96
|
+
if p.get("condition") is not None:
|
|
97
|
+
continue
|
|
98
|
+
if p["type"] == "categorical":
|
|
99
|
+
cand[p["name"]] = rng.choice(p["choices"])
|
|
100
|
+
elif p["type"] == "bool":
|
|
101
|
+
cand[p["name"]] = rng.random() < 0.5
|
|
102
|
+
else:
|
|
103
|
+
lo, hi = p["low"], p["high"]
|
|
104
|
+
v = rng.uniform(lo, hi)
|
|
105
|
+
cand[p["name"]] = int(round(v)) if p["type"] == "int" else v
|
|
106
|
+
send({"type": "propose", "candidate": cand})
|
|
107
|
+
elif t == "tell":
|
|
108
|
+
pass # inspect msg["score"], msg["valid"], msg["remaining"] to adapt
|
|
109
|
+
elif t == "finish":
|
|
110
|
+
break
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
Run it against one problem (aggregating saved results with `hypara report` is shown further below):
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
hypara run --problem smooth_hill --optimizer ./my_opt --seed 1
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
The source repository also includes two reference optimizers
|
|
120
|
+
(`optimizers/random_search`, `optimizers/hill_climb`) and ready-made suite
|
|
121
|
+
configs (`configs/smoke.json`, `configs/full.json`):
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
hypara suite --config configs/smoke.json
|
|
125
|
+
hypara report --dir results/smoke-YYYYmmdd-HHMMSS
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## Built-in problems
|
|
129
|
+
|
|
130
|
+
All problems are single-objective, maximize, with an achievable maximum near
|
|
131
|
+
1.0. The hidden landscape is reseeded per run, so memorizing an instance does
|
|
132
|
+
not help.
|
|
133
|
+
|
|
134
|
+
| Problem | What it tests |
|
|
135
|
+
|---|---|
|
|
136
|
+
| `smooth_hill` | Smooth unimodal surface; local search should win. |
|
|
137
|
+
| `rugged_trap` | Multimodal with a decoy hill; needs restarts / exploration. |
|
|
138
|
+
| `conditional_knobs` | A categorical choice switches which knobs exist. |
|
|
139
|
+
| `noisy_lab` | Additive gaussian noise; beware chasing lucky readings. |
|
|
140
|
+
| `multi_fidelity` | Cheap biased low-fidelity vs. expensive true high-fidelity. |
|
|
141
|
+
| `sparse_needle` | One hidden combination scores high; weak partial-match signal. |
|
|
142
|
+
| `cost_aware` | The candidate's own `samples` knob drives its evaluation cost. |
|
|
143
|
+
| `rag_pipeline` | Surrogate RAG tuning (chunking, top_k, reranker interactions). |
|
|
144
|
+
| `image_pipeline` | Surrogate diffusion tuning; steps drive quality and cost. |
|
|
145
|
+
| `dispatch_policy` | Surrogate delivery policy; balance, batching, mild noise. |
|
|
146
|
+
|
|
147
|
+
## Protocol
|
|
148
|
+
|
|
149
|
+
The runner launches the optimizer as a child process (working directory = the
|
|
150
|
+
optimizer's directory; if `command[0]` is `"python"` it is replaced with the
|
|
151
|
+
runner's own interpreter). Messages are one JSON object per line: runner →
|
|
152
|
+
optimizer on stdin, optimizer → runner on stdout. **Optimizer stdout is
|
|
153
|
+
protocol-only; write debug output to stderr** (the runner saves it to
|
|
154
|
+
`optimizer.stderr.log`). Receivers ignore unknown message keys (a `candidate` object is still validated strictly; see Search space). `NaN`/`Infinity` must
|
|
155
|
+
not be sent. Current `protocol_version` is `1`.
|
|
156
|
+
|
|
157
|
+
### Messages and turn-taking
|
|
158
|
+
|
|
159
|
+
| Direction | `type` | Reply |
|
|
160
|
+
|---|---|---|
|
|
161
|
+
| runner → optimizer | `init` | `ready` (once) |
|
|
162
|
+
| runner → optimizer | `ask` | `propose` (once) |
|
|
163
|
+
| runner → optimizer | `tell` | none |
|
|
164
|
+
| runner → optimizer | `finish` | none; exit promptly |
|
|
165
|
+
|
|
166
|
+
Only one `ask` is outstanding at a time. The `init` reply may take up to 30s,
|
|
167
|
+
each `ask` reply up to 60s by default; overruns end the run as
|
|
168
|
+
`optimizer_timeout`. A crash, an unparseable line, or an out-of-order message
|
|
169
|
+
ends the run as `failed`. The best-so-far is recorded in every case.
|
|
170
|
+
|
|
171
|
+
**init** (runner → optimizer):
|
|
172
|
+
|
|
173
|
+
```json
|
|
174
|
+
{"type": "init", "protocol_version": 1, "run_id": "smooth_hill--my_opt--s1",
|
|
175
|
+
"problem": {
|
|
176
|
+
"description": "natural-language prompt",
|
|
177
|
+
"space": [ ...param specs (below)... ],
|
|
178
|
+
"objective": "maximize",
|
|
179
|
+
"budget": {"evaluations": 100, "cost_limit": null, "time_limit_sec": 300.0},
|
|
180
|
+
"fidelities": null
|
|
181
|
+
},
|
|
182
|
+
"optimizer_seed": 12345}
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
`budget` always has at least one of `evaluations` or `cost_limit` non-null.
|
|
186
|
+
`fidelities`, when non-null, is ordered low→high (last entry = top fidelity).
|
|
187
|
+
|
|
188
|
+
**ready / propose** (optimizer → runner):
|
|
189
|
+
|
|
190
|
+
```json
|
|
191
|
+
{"type": "ready"}
|
|
192
|
+
{"type": "propose", "candidate": {"x0": 0.5, "algo": "alpha"}, "fidelity": "low"}
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
`fidelity` is optional; omitted/null means top fidelity. Sending a non-null
|
|
196
|
+
`fidelity` to a problem with no fidelities is invalid.
|
|
197
|
+
|
|
198
|
+
**tell** (runner → optimizer):
|
|
199
|
+
|
|
200
|
+
```json
|
|
201
|
+
{"type": "tell", "candidate_id": "c-0007", "candidate": {"x0": 0.5},
|
|
202
|
+
"valid": true, "score": 0.73, "cost": 1.0, "fidelity": null, "error": null,
|
|
203
|
+
"remaining": {"evaluations": 92, "cost": null, "time_sec": 291.3}}
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
When invalid: `valid: false`, `score: null`, and `error` gives the reason.
|
|
207
|
+
|
|
208
|
+
**finish** (runner → optimizer): `{"type": "finish", "reason": "budget_exhausted"}`
|
|
209
|
+
(`reason` is `budget_exhausted` or `time_limit`).
|
|
210
|
+
|
|
211
|
+
### Search space
|
|
212
|
+
|
|
213
|
+
```json
|
|
214
|
+
[
|
|
215
|
+
{"name": "lr", "type": "float", "low": 1e-4, "high": 1.0, "log": true},
|
|
216
|
+
{"name": "layers", "type": "int", "low": 1, "high": 12},
|
|
217
|
+
{"name": "opt", "type": "categorical", "choices": ["sgd", "adam"]},
|
|
218
|
+
{"name": "warmup", "type": "bool"},
|
|
219
|
+
{"name": "warmup_steps", "type": "int", "low": 10, "high": 1000,
|
|
220
|
+
"condition": {"param": "warmup", "equals": [true]}}
|
|
221
|
+
]
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
- Types: `float`, `int`, `categorical`, `bool`. Bounds `low`/`high` are
|
|
225
|
+
inclusive; `log: true` hints a log scale.
|
|
226
|
+
- A param with `condition` is **active** only when
|
|
227
|
+
`candidate[condition.param]` is in `equals`. Conditioning is one level deep
|
|
228
|
+
(the parent must be unconditional).
|
|
229
|
+
|
|
230
|
+
A candidate is validated by the runner: it must be a JSON object containing
|
|
231
|
+
**exactly** the active params (no unknown keys, no inactive params, none
|
|
232
|
+
missing), each of the right type and within range.
|
|
233
|
+
|
|
234
|
+
### Budget rules
|
|
235
|
+
|
|
236
|
+
- A valid evaluation consumes the evaluator's `cost` (may depend on the
|
|
237
|
+
candidate/fidelity); the `evaluations` axis always consumes 1.
|
|
238
|
+
- **An invalid proposal still consumes budget** (1 evaluation, cost 1.0), so
|
|
239
|
+
spamming invalid candidates cannot mine the space for free.
|
|
240
|
+
- The stop check runs before each `ask`, so the final evaluation may slightly
|
|
241
|
+
overshoot `cost_limit`.
|
|
242
|
+
- For problems with `fidelities`, **only top-fidelity evaluations count toward
|
|
243
|
+
`best_score`**; lower fidelities are available as history but not scored.
|
|
244
|
+
|
|
245
|
+
## Metrics
|
|
246
|
+
|
|
247
|
+
`hypara report` recomputes everything from the saved logs. Per run: best
|
|
248
|
+
score, best candidate, best-so-far curve (over evaluations or cumulative
|
|
249
|
+
cost), valid rate, status, wall time. Aggregated per (problem, optimizer):
|
|
250
|
+
mean best, a baseline-relative normalized best and normalized anytime AUC
|
|
251
|
+
(0 = baseline median, 1 = best observed for that problem), and an overall
|
|
252
|
+
mean across problems.
|
|
253
|
+
|
|
254
|
+
## Adding a problem
|
|
255
|
+
|
|
256
|
+
Implement `Problem` under `src/hypara/problems/` and register it in
|
|
257
|
+
`src/hypara/registry.py`. Keep the description and the evaluator's actual
|
|
258
|
+
behavior in sync — the point of the benchmark is that reading the description
|
|
259
|
+
helps. The shared invariants in `tests/test_problems.py` (finite scores,
|
|
260
|
+
determinism given a seed, instance-seed sensitivity) apply automatically.
|
|
261
|
+
|
|
262
|
+
## License
|
|
263
|
+
|
|
264
|
+
MIT. See [LICENSE](LICENSE).
|
hypara-0.1.0/README.md
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
# hypara
|
|
2
|
+
|
|
3
|
+
A benchmark harness for measuring how well an optimizer searches an **unknown
|
|
4
|
+
black-box evaluation function**.
|
|
5
|
+
|
|
6
|
+
hypara is deliberately not about solving famous problems (TSP, knapsack, bin
|
|
7
|
+
packing) where a strong off-the-shelf solver wins. Each problem ships a
|
|
8
|
+
natural-language description, a mixed search space, and a *hidden* evaluator
|
|
9
|
+
whose shape changes with the instance seed. To score well an optimizer has to
|
|
10
|
+
read the description, reason about the space, and adapt its strategy from the
|
|
11
|
+
evaluation history within a limited budget.
|
|
12
|
+
|
|
13
|
+
Optimizers are **language-agnostic external processes**: they talk to the
|
|
14
|
+
runner over a stdin/stdout JSON Lines protocol, so an optimizer can be written
|
|
15
|
+
in Python, Rust, Go, TypeScript, or any executable.
|
|
16
|
+
|
|
17
|
+
## Install
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install hypara
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
For development (tests + build tooling):
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install -e .[dev]
|
|
27
|
+
python -m pytest
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Quickstart
|
|
31
|
+
|
|
32
|
+
List the built-in problems:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
hypara list
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Write a minimal optimizer. Create `my_opt/manifest.json`:
|
|
39
|
+
|
|
40
|
+
```json
|
|
41
|
+
{"name": "my_opt", "command": ["python", "main.py"]}
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
and `my_opt/main.py`:
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
import json, random, sys
|
|
48
|
+
|
|
49
|
+
space = []
|
|
50
|
+
rng = random.Random()
|
|
51
|
+
|
|
52
|
+
def send(msg):
|
|
53
|
+
sys.stdout.write(json.dumps(msg) + "\n")
|
|
54
|
+
sys.stdout.flush()
|
|
55
|
+
|
|
56
|
+
for line in sys.stdin:
|
|
57
|
+
msg = json.loads(line)
|
|
58
|
+
t = msg.get("type")
|
|
59
|
+
if t == "init":
|
|
60
|
+
space = msg["problem"]["space"]
|
|
61
|
+
rng = random.Random(msg.get("optimizer_seed"))
|
|
62
|
+
send({"type": "ready"})
|
|
63
|
+
elif t == "ask":
|
|
64
|
+
# propose a candidate: uniform random pick for every unconditional param (conditional params are skipped, so spaces where one becomes active need extra handling)
|
|
65
|
+
cand = {}
|
|
66
|
+
for p in space:
|
|
67
|
+
if p.get("condition") is not None:
|
|
68
|
+
continue
|
|
69
|
+
if p["type"] == "categorical":
|
|
70
|
+
cand[p["name"]] = rng.choice(p["choices"])
|
|
71
|
+
elif p["type"] == "bool":
|
|
72
|
+
cand[p["name"]] = rng.random() < 0.5
|
|
73
|
+
else:
|
|
74
|
+
lo, hi = p["low"], p["high"]
|
|
75
|
+
v = rng.uniform(lo, hi)
|
|
76
|
+
cand[p["name"]] = int(round(v)) if p["type"] == "int" else v
|
|
77
|
+
send({"type": "propose", "candidate": cand})
|
|
78
|
+
elif t == "tell":
|
|
79
|
+
pass # inspect msg["score"], msg["valid"], msg["remaining"] to adapt
|
|
80
|
+
elif t == "finish":
|
|
81
|
+
break
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Run it against one problem (aggregating saved results with `hypara report` is shown further below):
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
hypara run --problem smooth_hill --optimizer ./my_opt --seed 1
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
The source repository also includes two reference optimizers
|
|
91
|
+
(`optimizers/random_search`, `optimizers/hill_climb`) and ready-made suite
|
|
92
|
+
configs (`configs/smoke.json`, `configs/full.json`):
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
hypara suite --config configs/smoke.json
|
|
96
|
+
hypara report --dir results/smoke-YYYYmmdd-HHMMSS
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Built-in problems
|
|
100
|
+
|
|
101
|
+
All problems are single-objective, maximize, with an achievable maximum near
|
|
102
|
+
1.0. The hidden landscape is reseeded per run, so memorizing an instance does
|
|
103
|
+
not help.
|
|
104
|
+
|
|
105
|
+
| Problem | What it tests |
|
|
106
|
+
|---|---|
|
|
107
|
+
| `smooth_hill` | Smooth unimodal surface; local search should win. |
|
|
108
|
+
| `rugged_trap` | Multimodal with a decoy hill; needs restarts / exploration. |
|
|
109
|
+
| `conditional_knobs` | A categorical choice switches which knobs exist. |
|
|
110
|
+
| `noisy_lab` | Additive gaussian noise; beware chasing lucky readings. |
|
|
111
|
+
| `multi_fidelity` | Cheap biased low-fidelity vs. expensive true high-fidelity. |
|
|
112
|
+
| `sparse_needle` | One hidden combination scores high; weak partial-match signal. |
|
|
113
|
+
| `cost_aware` | The candidate's own `samples` knob drives its evaluation cost. |
|
|
114
|
+
| `rag_pipeline` | Surrogate RAG tuning (chunking, top_k, reranker interactions). |
|
|
115
|
+
| `image_pipeline` | Surrogate diffusion tuning; steps drive quality and cost. |
|
|
116
|
+
| `dispatch_policy` | Surrogate delivery policy; balance, batching, mild noise. |
|
|
117
|
+
|
|
118
|
+
## Protocol
|
|
119
|
+
|
|
120
|
+
The runner launches the optimizer as a child process (working directory = the
|
|
121
|
+
optimizer's directory; if `command[0]` is `"python"` it is replaced with the
|
|
122
|
+
runner's own interpreter). Messages are one JSON object per line: runner →
|
|
123
|
+
optimizer on stdin, optimizer → runner on stdout. **Optimizer stdout is
|
|
124
|
+
protocol-only; write debug output to stderr** (the runner saves it to
|
|
125
|
+
`optimizer.stderr.log`). Receivers ignore unknown message keys (a `candidate` object is still validated strictly; see Search space). `NaN`/`Infinity` must
|
|
126
|
+
not be sent. Current `protocol_version` is `1`.
|
|
127
|
+
|
|
128
|
+
### Messages and turn-taking
|
|
129
|
+
|
|
130
|
+
| Direction | `type` | Reply |
|
|
131
|
+
|---|---|---|
|
|
132
|
+
| runner → optimizer | `init` | `ready` (once) |
|
|
133
|
+
| runner → optimizer | `ask` | `propose` (once) |
|
|
134
|
+
| runner → optimizer | `tell` | none |
|
|
135
|
+
| runner → optimizer | `finish` | none; exit promptly |
|
|
136
|
+
|
|
137
|
+
Only one `ask` is outstanding at a time. The `init` reply may take up to 30s,
|
|
138
|
+
each `ask` reply up to 60s by default; overruns end the run as
|
|
139
|
+
`optimizer_timeout`. A crash, an unparseable line, or an out-of-order message
|
|
140
|
+
ends the run as `failed`. The best-so-far is recorded in every case.
|
|
141
|
+
|
|
142
|
+
**init** (runner → optimizer):
|
|
143
|
+
|
|
144
|
+
```json
|
|
145
|
+
{"type": "init", "protocol_version": 1, "run_id": "smooth_hill--my_opt--s1",
|
|
146
|
+
"problem": {
|
|
147
|
+
"description": "natural-language prompt",
|
|
148
|
+
"space": [ ...param specs (below)... ],
|
|
149
|
+
"objective": "maximize",
|
|
150
|
+
"budget": {"evaluations": 100, "cost_limit": null, "time_limit_sec": 300.0},
|
|
151
|
+
"fidelities": null
|
|
152
|
+
},
|
|
153
|
+
"optimizer_seed": 12345}
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
`budget` always has at least one of `evaluations` or `cost_limit` non-null.
|
|
157
|
+
`fidelities`, when non-null, is ordered low→high (last entry = top fidelity).
|
|
158
|
+
|
|
159
|
+
**ready / propose** (optimizer → runner):
|
|
160
|
+
|
|
161
|
+
```json
|
|
162
|
+
{"type": "ready"}
|
|
163
|
+
{"type": "propose", "candidate": {"x0": 0.5, "algo": "alpha"}, "fidelity": "low"}
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
`fidelity` is optional; omitted/null means top fidelity. Sending a non-null
|
|
167
|
+
`fidelity` to a problem with no fidelities is invalid.
|
|
168
|
+
|
|
169
|
+
**tell** (runner → optimizer):
|
|
170
|
+
|
|
171
|
+
```json
|
|
172
|
+
{"type": "tell", "candidate_id": "c-0007", "candidate": {"x0": 0.5},
|
|
173
|
+
"valid": true, "score": 0.73, "cost": 1.0, "fidelity": null, "error": null,
|
|
174
|
+
"remaining": {"evaluations": 92, "cost": null, "time_sec": 291.3}}
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
When invalid: `valid: false`, `score: null`, and `error` gives the reason.
|
|
178
|
+
|
|
179
|
+
**finish** (runner → optimizer): `{"type": "finish", "reason": "budget_exhausted"}`
|
|
180
|
+
(`reason` is `budget_exhausted` or `time_limit`).
|
|
181
|
+
|
|
182
|
+
### Search space
|
|
183
|
+
|
|
184
|
+
```json
|
|
185
|
+
[
|
|
186
|
+
{"name": "lr", "type": "float", "low": 1e-4, "high": 1.0, "log": true},
|
|
187
|
+
{"name": "layers", "type": "int", "low": 1, "high": 12},
|
|
188
|
+
{"name": "opt", "type": "categorical", "choices": ["sgd", "adam"]},
|
|
189
|
+
{"name": "warmup", "type": "bool"},
|
|
190
|
+
{"name": "warmup_steps", "type": "int", "low": 10, "high": 1000,
|
|
191
|
+
"condition": {"param": "warmup", "equals": [true]}}
|
|
192
|
+
]
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
- Types: `float`, `int`, `categorical`, `bool`. Bounds `low`/`high` are
|
|
196
|
+
inclusive; `log: true` hints a log scale.
|
|
197
|
+
- A param with `condition` is **active** only when
|
|
198
|
+
`candidate[condition.param]` is in `equals`. Conditioning is one level deep
|
|
199
|
+
(the parent must be unconditional).
|
|
200
|
+
|
|
201
|
+
A candidate is validated by the runner: it must be a JSON object containing
|
|
202
|
+
**exactly** the active params (no unknown keys, no inactive params, none
|
|
203
|
+
missing), each of the right type and within range.
|
|
204
|
+
|
|
205
|
+
### Budget rules
|
|
206
|
+
|
|
207
|
+
- A valid evaluation consumes the evaluator's `cost` (may depend on the
|
|
208
|
+
candidate/fidelity); the `evaluations` axis always consumes 1.
|
|
209
|
+
- **An invalid proposal still consumes budget** (1 evaluation, cost 1.0), so
|
|
210
|
+
spamming invalid candidates cannot mine the space for free.
|
|
211
|
+
- The stop check runs before each `ask`, so the final evaluation may slightly
|
|
212
|
+
overshoot `cost_limit`.
|
|
213
|
+
- For problems with `fidelities`, **only top-fidelity evaluations count toward
|
|
214
|
+
`best_score`**; lower fidelities are available as history but not scored.
|
|
215
|
+
|
|
216
|
+
## Metrics
|
|
217
|
+
|
|
218
|
+
`hypara report` recomputes everything from the saved logs. Per run: best
|
|
219
|
+
score, best candidate, best-so-far curve (over evaluations or cumulative
|
|
220
|
+
cost), valid rate, status, wall time. Aggregated per (problem, optimizer):
|
|
221
|
+
mean best, a baseline-relative normalized best and normalized anytime AUC
|
|
222
|
+
(0 = baseline median, 1 = best observed for that problem), and an overall
|
|
223
|
+
mean across problems.
|
|
224
|
+
|
|
225
|
+
## Adding a problem
|
|
226
|
+
|
|
227
|
+
Implement `Problem` under `src/hypara/problems/` and register it in
|
|
228
|
+
`src/hypara/registry.py`. Keep the description and the evaluator's actual
|
|
229
|
+
behavior in sync — the point of the benchmark is that reading the description
|
|
230
|
+
helps. The shared invariants in `tests/test_problems.py` (finite scores,
|
|
231
|
+
determinism given a seed, instance-seed sensitivity) apply automatically.
|
|
232
|
+
|
|
233
|
+
## License
|
|
234
|
+
|
|
235
|
+
MIT. See [LICENSE](LICENSE).
|
hypara-0.1.0/configs/full.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "full",
|
|
3
|
+
"problems": [
|
|
4
|
+
"smooth_hill",
|
|
5
|
+
"rugged_trap",
|
|
6
|
+
"conditional_knobs",
|
|
7
|
+
"noisy_lab",
|
|
8
|
+
"multi_fidelity",
|
|
9
|
+
"sparse_needle",
|
|
10
|
+
"cost_aware",
|
|
11
|
+
"rag_pipeline",
|
|
12
|
+
"image_pipeline",
|
|
13
|
+
"dispatch_policy"
|
|
14
|
+
],
|
|
15
|
+
"optimizers": ["optimizers/random_search", "optimizers/hill_climb"],
|
|
16
|
+
"seeds": [1, 2, 3, 4, 5, 6, 7, 8]
|
|
17
|
+
}
|
hypara-0.1.0/configs/smoke.json
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "smoke",
|
|
3
|
+
"problems": ["smooth_hill", "rugged_trap", "conditional_knobs"],
|
|
4
|
+
"optimizers": ["optimizers/random_search", "optimizers/hill_climb"],
|
|
5
|
+
"seeds": [1, 2, 3],
|
|
6
|
+
"budget_overrides": {
|
|
7
|
+
"smooth_hill": {"evaluations": 40},
|
|
8
|
+
"rugged_trap": {"evaluations": 40},
|
|
9
|
+
"conditional_knobs": {"evaluations": 40}
|
|
10
|
+
}
|
|
11
|
+
}
|