evolvers-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evolvers/__init__.py +8 -0
- evolvers/criterion.py +183 -0
- evolvers/evolvable.py +497 -0
- evolvers/llm.py +168 -0
- evolvers-0.1.0.dist-info/METADATA +289 -0
- evolvers-0.1.0.dist-info/RECORD +9 -0
- evolvers-0.1.0.dist-info/WHEEL +4 -0
- evolvers-0.1.0.dist-info/licenses/LICENSE +202 -0
- evolvers-0.1.0.dist-info/licenses/NOTICE +5 -0
evolvers/__init__.py
ADDED
evolvers/criterion.py
ADDED
@@ -0,0 +1,183 @@
"""Criterion: judge (LLM-as-judge) and code (Python function) rubrics."""

from __future__ import annotations

import ast
import inspect
import re
from collections.abc import Callable
from dataclasses import dataclass
from typing import Any, Literal

from pydantic import BaseModel, Field

Kind = Literal["judge", "code"]


class _JudgeResponse(BaseModel):
    score: float = Field(ge=-1.0, le=1.0)
    reasoning: str


@dataclass
class Criterion:
    """One rubric. Either a natural-language judge or a Python function."""

    name: str
    kind: Kind
    weight: float = 1.0
    question: str | None = None  # for kind="judge"
    fn: Callable[..., float] | None = None  # for kind="code"
    source_code: str | None = None  # captured for kind="code", to enable save/load

    def __post_init__(self) -> None:
        if self.kind == "judge" and not self.question:
            raise ValueError(f"judge criterion {self.name!r} needs a `question`")
        if self.kind == "code" and self.fn is None:
            raise ValueError(f"code criterion {self.name!r} needs a `fn`")


def judge(question: str, *, name: str | None = None, weight: float = 1.0) -> Criterion:
    """Define an LLM-as-judge criterion from a natural-language question.

    The judge scores in [-1, 1]. -1 = fails entirely; +1 = perfectly satisfies.
    """
    return Criterion(
        name=name or _slugify(question, max_len=40),
        kind="judge",
        weight=weight,
        question=question,
    )


def code(
    fn: Callable[..., float] | Callable[..., int] | Callable[..., bool],
    *,
    name: str | None = None,
    weight: float = 1.0,
) -> Criterion:
    """Define a code criterion from a callable.

    The callable's signature is introspected:
    - 1 arg → called with `output` only
    - 2 args → called with `(input, output)`
    Returns a float in [-1, 1]; ints/bools are auto-cast.
    """
    resolved_name = name or _resolve_callable_name(fn)
    src = _capture_source_as_def(fn, resolved_name)
    return Criterion(
        name=resolved_name,
        kind="code",
        weight=weight,
        fn=fn,
        source_code=src,
    )


def evaluate_criterion(
    c: Criterion,
    program_input: Any,
    program_output: Any,
    llm: Any,
) -> tuple[float, str]:
    """Run one criterion. Returns (score, reasoning). Score is clamped to [-1, 1]."""
    if c.kind == "code":
        return _evaluate_code(c, program_input, program_output)
    return _evaluate_judge(c, program_input, program_output, llm)


def _evaluate_code(c: Criterion, program_input: Any, program_output: Any) -> tuple[float, str]:
    assert c.fn is not None
    try:
        params = list(inspect.signature(c.fn).parameters.values())
    except (TypeError, ValueError):
        params = []
    try:
        if len(params) == 1:
            raw = c.fn(program_output)
        elif len(params) >= 2:
            raw = c.fn(program_input, program_output)
        else:
            raw = c.fn()
    except Exception as e:
        return -1.0, f"code criterion raised {type(e).__name__}: {e}"
    value = float(raw)
    return _clamp(value), f"code returned {value:.4f}"


def _evaluate_judge(
    c: Criterion,
    program_input: Any,
    program_output: Any,
    llm: Any,
) -> tuple[float, str]:
    prompt = (
        f"You are a strict but fair judge. Score the OUTPUT against the RUBRIC.\n\n"
        f"RUBRIC: {c.question}\n\n"
        f"INPUT:\n{program_input}\n\n"
        f"OUTPUT:\n{program_output}\n\n"
        f"Reply with score in [-1, 1] (-1 = fails entirely; 0 = neutral; +1 = perfectly satisfies) "
        f"and concise reasoning."
    )
    try:
        resp = llm(prompt, schema=_JudgeResponse)
    except Exception as e:
        return 0.0, f"judge LLM failed ({type(e).__name__}: {e}); neutral score"
    return _clamp(resp.score), resp.reasoning


def _clamp(x: float) -> float:
    if x != x:
        return 0.0
    return max(-1.0, min(1.0, x))


def _slugify(text: str, *, max_len: int = 40) -> str:
    s = re.sub(r"[^a-zA-Z0-9]+", "_", text.lower()).strip("_")
    return s[:max_len] or "criterion"


def _resolve_callable_name(fn: Callable) -> str:
    name = getattr(fn, "__name__", "code_criterion")
    if name == "<lambda>":
        return "code_criterion"
    return name


def _capture_source_as_def(fn: Callable, name: str) -> str | None:
    """Return source as a `def {name}(args): ...` string, converting lambdas.

    Falls back to None if no source is available.
    """
    try:
        raw = inspect.getsource(fn)
    except (OSError, TypeError):
        return None
    raw = raw.strip()

    if raw.lstrip().startswith("def "):
        return raw

    try:
        tree = ast.parse(raw)
    except SyntaxError:
        # source line may include surrounding context — try to find the lambda
        for chunk in (raw, raw.split("=", 1)[-1].strip() if "=" in raw else raw):
            try:
                tree = ast.parse(chunk, mode="eval")
                break
            except SyntaxError:
                continue
        else:
            return None

    lambda_node = None
    for node in ast.walk(tree):
        if isinstance(node, ast.Lambda):
            lambda_node = node
            break
    if lambda_node is None:
        return None
    args_src = ast.unparse(lambda_node.args)
    body_src = ast.unparse(lambda_node.body)
    return f"def {name}({args_src}):\n    return {body_src}\n"
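Usage sketch for criterion.py (illustration only, not part of the wheel contents above; imports go through the submodule because the `__init__.py` re-exports are not shown in this diff):

from evolvers.criterion import code, evaluate_criterion, judge

# LLM-as-judge rubric: the name is slugified from the question unless given explicitly.
concise = judge("Is the answer concise and free of filler?", weight=2.0)

# Code rubric: a one-argument callable is called with the output only; ints/bools/floats
# are cast to float and clamped to [-1, 1].
short_enough = code(lambda output: len(output) <= 200, name="short_enough")

# Code criteria never call the LLM, so `llm=None` is fine here.
score, reasoning = evaluate_criterion(short_enough, "ignored input", "a short answer", llm=None)
print(score, reasoning)  # 1.0 code returned 1.0000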
evolvers/evolvable.py
ADDED
@@ -0,0 +1,497 @@
"""Evolvable: a function + criteria + LLM, with train/evaluate/save/load."""

from __future__ import annotations

import copy
import inspect
import json
import os
import re
import sys
import textwrap
import time
import traceback
from collections.abc import Callable, Iterable
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Any

from tqdm.auto import tqdm

from .criterion import Criterion, evaluate_criterion
from .llm import LLM


def _log(msg: str) -> None:
    """Timestamped, flushed log line on stderr."""
    print(f"[{time.strftime('%H:%M:%S')}] evolvers: {msg}", file=sys.stderr, flush=True)


class Evolvable:
    """A function whose body the optimizer rewrites to maximize criteria scores.

    The function may take an `llm` parameter; if present, the bound LLM is auto-injected
    on each call. Persistence is to a directory under EVOLVERS_CACHE (default ~/.cache/evolvers/).
    """

    def __init__(
        self,
        fn: Callable[..., Any],
        criteria: list[Criterion],
        llm: LLM,
        *,
        _source: str | None = None,
        _signature: inspect.Signature | None = None,
    ):
        self.llm = llm
        self.criteria = list(criteria)
        self._signature = _signature or inspect.signature(fn)
        self._source = _source if _source is not None else _get_source(fn)
        self._compiled = fn
        self._best_source = self._source
        self._best_score: float | None = None
        self.history: list[dict[str, Any]] = []

    @property
    def source(self) -> str:
        return self._best_source

    def __call__(self, *args: Any, **kwargs: Any) -> Any:
        if "llm" in self._signature.parameters and "llm" not in kwargs:
            kwargs["llm"] = self.llm
        return self._compiled(*args, **kwargs)

    def set_llm(self, llm: LLM) -> Evolvable:
        self.llm = llm
        return self

    def clone(self) -> Evolvable:
        new = Evolvable.__new__(Evolvable)
        new.llm = self.llm
        new.criteria = copy.deepcopy(self.criteria)
        new._signature = self._signature
        new._source = self._source
        new._best_source = self._best_source
        new._best_score = self._best_score
        new._compiled = _compile_fn(self._best_source)
        new.history = []
        return new

    def evaluate(
        self,
        dataset: Iterable[Any],
        *,
        show_progress: bool = True,
        max_workers: int = 8,
    ) -> dict[str, Any]:
        return self._run_eval(list(dataset), label="eval", show_progress=show_progress, max_workers=max_workers)

    def train(
        self,
        dataset: Iterable[Any],
        *,
        budget: int = 20,
        show_progress: bool = True,
        max_workers: int = 8,
    ) -> dict[str, Any]:
        data = list(dataset)
        _log(f"train: budget={budget}, dataset_size={len(data)}, criteria={[c.name for c in self.criteria]}")

        _log("train: running baseline eval...")
        baseline = self._run_eval(data, label="baseline", show_progress=show_progress, max_workers=max_workers)
        self._best_score = baseline["aggregate"]
        self._best_source = self._source
        self.history.append(
            {
                "attempt": 0,
                "source": self._source,
                "score": baseline["aggregate"],
                "per_criterion": baseline["per_criterion"],
                "result": baseline,
                "accepted": True,
                "kind": "baseline",
            }
        )

        iterator: Iterable[int] = range(1, budget + 1)
        if show_progress:
            iterator = tqdm(iterator, desc="evolve", total=budget)

        for attempt in iterator:
            entry: dict[str, Any] = {"attempt": attempt, "accepted": False}
            _log(f"train: attempt {attempt}/{budget} — proposing mutation...")
            t_propose = time.perf_counter()
            try:
                new_source = self._propose_mutation()
            except Exception as e:
                entry["error"] = f"propose failed: {type(e).__name__}: {e}"
                _log(f"train: attempt {attempt} — {entry['error']}")
                self.history.append(entry)
                continue
            propose_elapsed = time.perf_counter() - t_propose
            _log(f"train: attempt {attempt} — mutation proposed ({propose_elapsed:.1f}s, {len(new_source)} chars)")

            try:
                new_fn = _compile_fn(new_source)
            except Exception as e:
                entry["source"] = new_source
                entry["error"] = f"compile failed: {type(e).__name__}: {e}"
                _log(f"train: attempt {attempt} — compile failed: {e}")
                self.history.append(entry)
                continue

            prev_compiled = self._compiled
            self._compiled = new_fn
            _log(f"train: attempt {attempt} — evaluating new candidate...")
            try:
                result = self._run_eval(data, label=f"attempt {attempt}", show_progress=False, max_workers=max_workers)
                entry["score"] = result["aggregate"]
                entry["per_criterion"] = result["per_criterion"]
                entry["source"] = new_source
                entry["result"] = result
                best = self._best_score if self._best_score is not None else float("-inf")
                if result["aggregate"] > best:
                    self._best_score = result["aggregate"]
                    self._best_source = new_source
                    entry["accepted"] = True
                    _log(f"train: attempt {attempt} — ACCEPTED, new best aggregate={result['aggregate']:.3f}")
                else:
                    self._compiled = prev_compiled
                    _log(
                        f"train: attempt {attempt} — REVERTED, "
                        f"aggregate={result['aggregate']:.3f} <= best={self._best_score:.3f}"
                    )
            except Exception as e:
                self._compiled = prev_compiled
                entry["source"] = new_source
                entry["error"] = f"eval failed: {type(e).__name__}: {e}"
                entry["traceback"] = traceback.format_exc(limit=3)
                _log(f"train: attempt {attempt} — eval crashed: {e}")

            self.history.append(entry)

        self._compiled = _compile_fn(self._best_source)
        self._source = self._best_source
        _log(f"train: done. best_score={self._best_score:.3f}")
        return {
            "best_score": self._best_score,
            "best_source": self._best_source,
            "history": self.history,
        }

    def save(self, uri: str) -> Path:
        path = _cache_dir(uri)
        path.mkdir(parents=True, exist_ok=True)

        (path / "program.py").write_text(self._best_source)

        crit_dir = path / "criteria"
        crit_dir.mkdir(exist_ok=True)
        criteria_meta: list[dict[str, Any]] = []
        for c in self.criteria:
            entry = {"name": c.name, "kind": c.kind, "weight": c.weight}
            if c.kind == "judge":
                (crit_dir / f"{c.name}.judge.txt").write_text(c.question or "")
            else:
                src = c.source_code or _fallback_code_source(c)
                (crit_dir / f"{c.name}.code.py").write_text(src)
                entry["source_available"] = c.source_code is not None
            criteria_meta.append(entry)

        traces_dir = path / "traces"
        traces_dir.mkdir(exist_ok=True)
        if self.history:
            with (traces_dir / "train.jsonl").open("w") as f:
                for h in self.history:
                    f.write(json.dumps(_jsonable(h)) + "\n")

        manifest = {
            "uri": uri,
            "signature": str(self._signature),
            "function_name": _extract_def_name(self._best_source),
            "llm": {"model": self.llm.model, "provider": self.llm.provider, "base_url": self.llm.base_url},
            "variant": uri.split(":", 1)[1] if ":" in uri else None,
            "criteria": criteria_meta,
            "best_score": self._best_score,
            "saved_at": time.time(),
        }
        (path / "manifest.json").write_text(json.dumps(manifest, indent=2))
        _log(f"saved to {path}")
        return path

    @classmethod
    def load(cls, uri: str, *, llm: LLM | None = None) -> Evolvable:
        path = _cache_dir(uri)
        if not path.exists():
            raise FileNotFoundError(f"No artifact at {path}")
        manifest = json.loads((path / "manifest.json").read_text())

        source = (path / "program.py").read_text()
        fn = _compile_fn(source)

        criteria: list[Criterion] = []
        for cmeta in manifest["criteria"]:
            name, kind, weight = cmeta["name"], cmeta["kind"], cmeta["weight"]
            if kind == "judge":
                q = (path / "criteria" / f"{name}.judge.txt").read_text()
                criteria.append(Criterion(name=name, kind="judge", weight=weight, question=q))
            else:
                src = (path / "criteria" / f"{name}.code.py").read_text()
                code_fn = _compile_fn(src)
                criteria.append(Criterion(name=name, kind="code", weight=weight, fn=code_fn, source_code=src))

        if llm is None:
            llm_meta = manifest.get("llm", {})
            llm = LLM(model=llm_meta.get("model", ""), base_url=llm_meta.get("base_url"))

        instance = cls.__new__(cls)
        instance.llm = llm
        instance.criteria = criteria
        instance._signature = inspect.signature(fn)
        instance._source = source
        instance._best_source = source
        instance._best_score = manifest.get("best_score")
        instance._compiled = fn
        instance.history = []
        _log(f"loaded {uri} (best_score={manifest.get('best_score')})")
        return instance

    def _run_one_trial(self, row: Any, idx: int, total: int, max_workers: int) -> dict[str, Any]:
        program_input, call_args, call_kwargs = _row_to_call(row, self._signature)
        _log(f"  trial {idx + 1}/{total} starting (input={_truncate(repr(program_input), 80)})")
        t0 = time.perf_counter()
        try:
            output = self(*call_args, **call_kwargs)
            err = None
        except Exception as e:
            output = None
            err = f"{type(e).__name__}: {e}"
        program_latency_ms = (time.perf_counter() - t0) * 1000

        per_criterion: dict[str, dict[str, Any]] = {}
        if output is None:
            for c in self.criteria:
                per_criterion[c.name] = {"score": -1.0, "reasoning": f"program failed: {err}"}
        else:
            with ThreadPoolExecutor(max_workers=min(max_workers, max(1, len(self.criteria)))) as ex:
                results = list(
                    ex.map(
                        lambda c: (c.name, evaluate_criterion(c, program_input, output, self.llm)),
                        self.criteria,
                    )
                )
            for name, (score, reasoning) in results:
                per_criterion[name] = {"score": score, "reasoning": reasoning}

        scores_summary = {k: round(v["score"], 2) for k, v in per_criterion.items()}
        elapsed = time.perf_counter() - t0
        _log(
            f"  trial {idx + 1}/{total} done ({elapsed:.1f}s, "
            f"output={_truncate(repr(output), 80)}, scores={scores_summary})"
        )
        return {
            "input": program_input,
            "output": output,
            "error": err,
            "latency_ms": program_latency_ms,
            "per_criterion": per_criterion,
        }

    def _run_eval(
        self,
        data: list[Any],
        *,
        label: str,
        show_progress: bool,
        max_workers: int,
    ) -> dict[str, Any]:
        n = len(data)
        _log(f"eval [{label}]: starting on {n} rows (max_workers={max_workers})")
        t0 = time.perf_counter()

        trials_indexed: list[tuple[int, dict[str, Any]]] = []
        if n == 1 or max_workers <= 1:
            for idx, row in enumerate(data):
                trials_indexed.append((idx, self._run_one_trial(row, idx, n, max_workers)))
        else:
            with ThreadPoolExecutor(max_workers=min(max_workers, n)) as ex:
                futures = {
                    ex.submit(self._run_one_trial, row, idx, n, max_workers): idx for idx, row in enumerate(data)
                }
                for fut in futures:
                    idx = futures[fut]
                    trials_indexed.append((idx, fut.result()))

        trials_indexed.sort(key=lambda x: x[0])
        trials = [t for _, t in trials_indexed]

        per_criterion_mean: dict[str, float] = {}
        total_weight = sum(c.weight for c in self.criteria) or 1.0
        for c in self.criteria:
            scores = [t["per_criterion"][c.name]["score"] for t in trials]
            per_criterion_mean[c.name] = sum(scores) / max(1, len(scores))
        aggregate = sum(per_criterion_mean[c.name] * c.weight for c in self.criteria) / total_weight

        elapsed = time.perf_counter() - t0
        _log(
            f"eval [{label}]: done in {elapsed:.1f}s — aggregate={aggregate:.3f} "
            f"per_criterion={ {k: round(v, 3) for k, v in per_criterion_mean.items()} }"
        )

        return {
            "aggregate": aggregate,
            "per_criterion": per_criterion_mean,
            "trials": trials,
            "label": label,
        }

    def _propose_mutation(self) -> str:
        recent = [h for h in self.history if "score" in h][-3:]
        if not recent:
            recent = [{"attempt": 0, "source": self._best_source, "score": self._best_score or 0.0}]

        criteria_desc = "\n".join(
            f"- {c.name} (weight={c.weight:.2f}, kind={c.kind}): "
            + (c.question if c.kind == "judge" else (c.source_code or "<code>"))
            for c in self.criteria
        )

        history_lines = []
        for h in recent:
            score = h.get("score") if h.get("score") is not None else 0.0
            block = f"Attempt {h['attempt']} (score={score:.3f}):\n```python\n{h.get('source', '')}\n```"
            history_lines.append(block)
        history_block = "\n\n".join(history_lines)

        last_trials_block = ""
        for h in reversed(recent):
            result = h.get("result") or {}
            trials = result.get("trials", [])
            if trials:
                lines = []
                for t in trials[:3]:
                    score_summary = {k: round(v["score"], 2) for k, v in t["per_criterion"].items()}
                    reasoning_summary = {
                        k: _truncate(v.get("reasoning", "") or "", 200) for k, v in t["per_criterion"].items()
                    }
                    lines.append(
                        f"- input: {_truncate(repr(t['input']), 200)}\n"
                        f"  output: {_truncate(repr(t['output']), 400)}\n"
                        f"  scores: {score_summary}\n"
                        f"  judge_reasoning: {reasoning_summary}"
                    )
                last_trials_block = "Sample trials from last evaluation:\n" + "\n".join(lines)
                break

        best_score = self._best_score if self._best_score is not None else 0.0
        prompt = textwrap.dedent(f"""\
You are optimizing a Python function. Goal: maximize the weighted-mean criterion score (range [-1, 1]).

Current best implementation (score={best_score:.3f}):
```python
{self._best_source}
```

Criteria:
{criteria_desc}

Recent attempts:
{history_block}

{last_trials_block}

You may use the injected `llm` callable inside the function. Its signature:
    llm(prompt: str, *, schema: type[BaseModel] | None = None,
        system: str | None = None) -> str | BaseModel
    llm.batch(prompts: list[str], **kwargs) -> list

IMPORTANT — token budgets:
- The injected `llm` is a 2026-era reasoning model. It uses substantial tokens for internal
  thinking BEFORE producing visible content. Defaults are already tuned for this.
- DO NOT pass `max_tokens` to `llm()`. The default is set to a high value (32k+).
  Passing a small `max_tokens` (e.g. 100, 500, 1024) will cause the reasoning preamble
  to consume the entire budget, leaving content empty and the function returning ''.

Rules:
- Preserve the function signature: {self._signature}.
- Reply with ONLY the function source as a single Python code block (```python ... ```).
- Do not include explanations outside the code block.
""")

        response = self.llm(
            prompt,
            system=(
                "You are an expert Python programmer iteratively refining a function under criteria. "
                "You target 2026-era reasoning LLMs and never artificially cap token budgets."
            ),
        )
        if not isinstance(response, str):
            response = str(response)
        return _extract_python(response)

    def __repr__(self) -> str:
        name = _extract_def_name(self._best_source) or "<evolvable>"
        return f"Evolvable({name}, criteria={[c.name for c in self.criteria]}, llm={self.llm.model})"


def _compile_fn(source: str) -> Callable[..., Any]:
    ns: dict[str, Any] = {}
    exec(compile(source, "<evolvable>", "exec"), ns)
    fns = [v for v in ns.values() if inspect.isfunction(v)]
    if not fns:
        raise ValueError("no function found in source")
    return fns[-1]


def _get_source(fn: Callable) -> str:
    try:
        return textwrap.dedent(inspect.getsource(fn))
    except (OSError, TypeError) as e:
        raise ValueError(f"cannot get source for {fn}: {e}") from e


def _extract_python(text: str) -> str:
    m = re.search(r"```(?:python)?\s*\n(.*?)```", text, re.DOTALL)
    if m:
        return m.group(1).strip()
    return text.strip()


def _extract_def_name(source: str) -> str | None:
    m = re.search(r"^def\s+(\w+)\s*\(", source, re.MULTILINE)
    return m.group(1) if m else None


def _row_to_call(row: Any, sig: inspect.Signature) -> tuple[Any, tuple, dict]:
    params = [p for p in sig.parameters.values() if p.name != "llm"]
    if isinstance(row, dict):
        kwargs = {k: v for k, v in row.items() if k in {p.name for p in sig.parameters}}
        first_param = params[0].name if params else None
        program_input = kwargs.get(first_param) if first_param else row
        return program_input, (), kwargs
    return row, (row,), {}


def _cache_dir(uri: str) -> Path:
    root = Path(os.environ.get("EVOLVERS_CACHE", "~/.cache/evolvers")).expanduser()
    return root / uri


def _fallback_code_source(c: Criterion) -> str:
    return f"# source not captured for {c.name!r}\ndef {c.name}(output):\n    return 0.0\n"


def _jsonable(obj: Any) -> Any:
    try:
        json.dumps(obj)
        return obj
    except (TypeError, ValueError):
        if isinstance(obj, dict):
            return {k: _jsonable(v) for k, v in obj.items()}
        if isinstance(obj, (list, tuple)):
            return [_jsonable(v) for v in obj]
        return repr(obj)


def _truncate(s: str, n: int) -> str:
    return s if len(s) <= n else s[: n - 3] + "..."
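End-to-end sketch for evolvable.py (illustration only, not part of the package; the LLM constructor arguments are inferred from Evolvable.load() above because llm.py is not expanded in this diff, and the model name is a placeholder):

from evolvers.criterion import code, judge
from evolvers.evolvable import Evolvable
from evolvers.llm import LLM

def summarize(text: str, llm) -> str:
    # The optimizer rewrites this body; `llm` is injected automatically because
    # the signature has a parameter named "llm".
    return llm(f"Summarize in one sentence:\n{text}")

criteria = [
    judge("Is the summary faithful to the input text?", weight=2.0),
    code(lambda output: 1.0 if len(output) < 300 else -1.0, name="short_enough"),
]

ev = Evolvable(summarize, criteria, LLM(model="some-model"))
rows = [{"text": "First long document ..."}, {"text": "Second long document ..."}]

report = ev.train(rows, budget=5)        # greedy hill-climb: a mutation is kept only if it beats the best aggregate
print(report["best_score"])              # weighted mean over criteria, in [-1, 1]
print(ev.source)                         # winning function source

ev.save("summarizer:v1")                 # writes program.py, criteria/, traces/, manifest.json under EVOLVERS_CACHE
restored = Evolvable.load("summarizer:v1")

Note that train() accepts a candidate only when its aggregate strictly exceeds the current best and recompiles the best source at the end, so a rejected or crashed attempt never leaves the instance running a mutated function.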