evolvers-0.1.0-py3-none-any.whl

This diff shows the contents of a publicly released package version as it appears in its public registry, and is provided for informational purposes only.
evolvers/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ """evolvers: evolvable AI programs."""
2
+
3
+ from .criterion import Criterion, code, judge
4
+ from .evolvable import Evolvable
5
+ from .llm import LLM
6
+
7
+ __all__ = ["LLM", "Criterion", "Evolvable", "code", "judge"]
8
+ __version__ = "0.1.0"
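
Taken together, the names re-exported above (`LLM`, `Criterion`, `Evolvable`, `code`, `judge`) are the wheel's whole public surface. The following is a minimal usage sketch, not part of the package: the model string, the `summarize` function, and the dataset rows are illustrative assumptions; the constructor and method signatures come from the modules shown further down in this diff.

from evolvers import LLM, Evolvable, code, judge

llm = LLM(model="gpt-4o-mini")  # assumed model string; LLM(model=..., base_url=...) mirrors Evolvable.load()

def summarize(text: str, llm) -> str:
    # Because `llm` appears in the signature, Evolvable.__call__ injects the bound LLM automatically.
    return llm(f"Summarize in one sentence:\n{text}")

criteria = [
    judge("Is the summary faithful to the input text?", weight=2.0),
    code(lambda output: len(output) < 280, name="is_short"),
]

program = Evolvable(summarize, criteria, llm)

dataset = [{"text": "Wheels are the standard built-package format for Python."}]
program.evaluate(dataset)             # score the current implementation
program.train(dataset, budget=5)      # LLM-driven rewrite loop; keeps the best-scoring source
program.save("summarizer:v1")         # persists under EVOLVERS_CACHE (default ~/.cache/evolvers/)
restored = Evolvable.load("summarizer:v1", llm=llm)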
evolvers/criterion.py ADDED
@@ -0,0 +1,183 @@
1
+ """Criterion: judge (LLM-as-judge) and code (Python function) rubrics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import ast
6
+ import inspect
7
+ import re
8
+ from collections.abc import Callable
9
+ from dataclasses import dataclass
10
+ from typing import Any, Literal
11
+
12
+ from pydantic import BaseModel, Field
13
+
14
+ Kind = Literal["judge", "code"]
15
+
16
+
17
+ class _JudgeResponse(BaseModel):
18
+ score: float = Field(ge=-1.0, le=1.0)
19
+ reasoning: str
20
+
21
+
22
+ @dataclass
23
+ class Criterion:
24
+ """One rubric. Either a natural-language judge or a Python function."""
25
+
26
+ name: str
27
+ kind: Kind
28
+ weight: float = 1.0
29
+ question: str | None = None # for kind="judge"
30
+ fn: Callable[..., float] | None = None # for kind="code"
31
+ source_code: str | None = None # captured for kind="code", to enable save/load
32
+
33
+ def __post_init__(self) -> None:
34
+ if self.kind == "judge" and not self.question:
35
+ raise ValueError(f"judge criterion {self.name!r} needs a `question`")
36
+ if self.kind == "code" and self.fn is None:
37
+ raise ValueError(f"code criterion {self.name!r} needs a `fn`")
38
+
39
+
40
+ def judge(question: str, *, name: str | None = None, weight: float = 1.0) -> Criterion:
41
+ """Define an LLM-as-judge criterion from a natural-language question.
42
+
43
+ The judge scores in [-1, 1]. -1 = fails entirely; +1 = perfectly satisfies.
44
+ """
45
+ return Criterion(
46
+ name=name or _slugify(question, max_len=40),
47
+ kind="judge",
48
+ weight=weight,
49
+ question=question,
50
+ )
51
+
52
+
53
+ def code(
54
+ fn: Callable[..., float] | Callable[..., int] | Callable[..., bool],
55
+ *,
56
+ name: str | None = None,
57
+ weight: float = 1.0,
58
+ ) -> Criterion:
59
+ """Define a code criterion from a callable.
60
+
61
+ The callable's signature is introspected:
62
+ - 1 arg → called with `output` only
63
+ - 2 args → called with `(input, output)`
64
+ Returns a float in [-1, 1]; ints/bools are auto-cast.
65
+ """
66
+ resolved_name = name or _resolve_callable_name(fn)
67
+ src = _capture_source_as_def(fn, resolved_name)
68
+ return Criterion(
69
+ name=resolved_name,
70
+ kind="code",
71
+ weight=weight,
72
+ fn=fn,
73
+ source_code=src,
74
+ )
75
+
76
+
77
+ def evaluate_criterion(
78
+ c: Criterion,
79
+ program_input: Any,
80
+ program_output: Any,
81
+ llm: Any,
82
+ ) -> tuple[float, str]:
83
+ """Run one criterion. Returns (score, reasoning). Score is clamped to [-1, 1]."""
84
+ if c.kind == "code":
85
+ return _evaluate_code(c, program_input, program_output)
86
+ return _evaluate_judge(c, program_input, program_output, llm)
87
+
88
+
89
+ def _evaluate_code(c: Criterion, program_input: Any, program_output: Any) -> tuple[float, str]:
90
+ assert c.fn is not None
91
+ try:
92
+ params = list(inspect.signature(c.fn).parameters.values())
93
+ except (TypeError, ValueError):
94
+ params = []
95
+ try:
96
+ if len(params) == 1:
97
+ raw = c.fn(program_output)
98
+ elif len(params) >= 2:
99
+ raw = c.fn(program_input, program_output)
100
+ else:
101
+ raw = c.fn()
102
+ except Exception as e:
103
+ return -1.0, f"code criterion raised {type(e).__name__}: {e}"
104
+ value = float(raw)
105
+ return _clamp(value), f"code returned {value:.4f}"
106
+
107
+
108
+ def _evaluate_judge(
109
+ c: Criterion,
110
+ program_input: Any,
111
+ program_output: Any,
112
+ llm: Any,
113
+ ) -> tuple[float, str]:
114
+ prompt = (
115
+ f"You are a strict but fair judge. Score the OUTPUT against the RUBRIC.\n\n"
116
+ f"RUBRIC: {c.question}\n\n"
117
+ f"INPUT:\n{program_input}\n\n"
118
+ f"OUTPUT:\n{program_output}\n\n"
119
+ f"Reply with score in [-1, 1] (-1 = fails entirely; 0 = neutral; +1 = perfectly satisfies) "
120
+ f"and concise reasoning."
121
+ )
122
+ try:
123
+ resp = llm(prompt, schema=_JudgeResponse)
124
+ except Exception as e:
125
+ return 0.0, f"judge LLM failed ({type(e).__name__}: {e}); neutral score"
126
+ return _clamp(resp.score), resp.reasoning
127
+
128
+
129
+ def _clamp(x: float) -> float:
130
+ if x != x:
131
+ return 0.0
132
+ return max(-1.0, min(1.0, x))
133
+
134
+
135
+ def _slugify(text: str, *, max_len: int = 40) -> str:
136
+ s = re.sub(r"[^a-zA-Z0-9]+", "_", text.lower()).strip("_")
137
+ return s[:max_len] or "criterion"
138
+
139
+
140
+ def _resolve_callable_name(fn: Callable) -> str:
141
+ name = getattr(fn, "__name__", "code_criterion")
142
+ if name == "<lambda>":
143
+ return "code_criterion"
144
+ return name
145
+
146
+
147
+ def _capture_source_as_def(fn: Callable, name: str) -> str | None:
148
+ """Return source as a `def {name}(args): ...` string, converting lambdas.
149
+
150
+ Falls back to None if no source is available.
151
+ """
152
+ try:
153
+ raw = inspect.getsource(fn)
154
+ except (OSError, TypeError):
155
+ return None
156
+ raw = raw.strip()
157
+
158
+ if raw.lstrip().startswith("def "):
159
+ return raw
160
+
161
+ try:
162
+ tree = ast.parse(raw)
163
+ except SyntaxError:
164
+ # source line may include surrounding context — try to find the lambda
165
+ for chunk in (raw, raw.split("=", 1)[-1].strip() if "=" in raw else raw):
166
+ try:
167
+ tree = ast.parse(chunk, mode="eval")
168
+ break
169
+ except SyntaxError:
170
+ continue
171
+ else:
172
+ return None
173
+
174
+ lambda_node = None
175
+ for node in ast.walk(tree):
176
+ if isinstance(node, ast.Lambda):
177
+ lambda_node = node
178
+ break
179
+ if lambda_node is None:
180
+ return None
181
+ args_src = ast.unparse(lambda_node.args)
182
+ body_src = ast.unparse(lambda_node.body)
183
+ return f"def {name}({args_src}):\n return {body_src}\n"
evolvers/evolvable.py ADDED
@@ -0,0 +1,497 @@
1
+ """Evolvable: a function + criteria + LLM, with train/evaluate/save/load."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import copy
6
+ import inspect
7
+ import json
8
+ import os
9
+ import re
10
+ import sys
11
+ import textwrap
12
+ import time
13
+ import traceback
14
+ from collections.abc import Callable, Iterable
15
+ from concurrent.futures import ThreadPoolExecutor
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ from tqdm.auto import tqdm
20
+
21
+ from .criterion import Criterion, evaluate_criterion
22
+ from .llm import LLM
23
+
24
+
25
+ def _log(msg: str) -> None:
26
+ """Timestamped, flushed log line on stderr."""
27
+ print(f"[{time.strftime('%H:%M:%S')}] evolvers: {msg}", file=sys.stderr, flush=True)
28
+
29
+
30
+ class Evolvable:
31
+ """A function whose body the optimizer rewrites to maximize criteria scores.
32
+
33
+ The function may take an `llm` parameter; if present, the bound LLM is auto-injected
34
+ on each call. Persistence is to a directory under EVOLVERS_CACHE (default ~/.cache/evolvers/).
35
+ """
36
+
37
+ def __init__(
38
+ self,
39
+ fn: Callable[..., Any],
40
+ criteria: list[Criterion],
41
+ llm: LLM,
42
+ *,
43
+ _source: str | None = None,
44
+ _signature: inspect.Signature | None = None,
45
+ ):
46
+ self.llm = llm
47
+ self.criteria = list(criteria)
48
+ self._signature = _signature or inspect.signature(fn)
49
+ self._source = _source if _source is not None else _get_source(fn)
50
+ self._compiled = fn
51
+ self._best_source = self._source
52
+ self._best_score: float | None = None
53
+ self.history: list[dict[str, Any]] = []
54
+
55
+ @property
56
+ def source(self) -> str:
57
+ return self._best_source
58
+
59
+ def __call__(self, *args: Any, **kwargs: Any) -> Any:
60
+ if "llm" in self._signature.parameters and "llm" not in kwargs:
61
+ kwargs["llm"] = self.llm
62
+ return self._compiled(*args, **kwargs)
63
+
64
+ def set_llm(self, llm: LLM) -> Evolvable:
65
+ self.llm = llm
66
+ return self
67
+
68
+ def clone(self) -> Evolvable:
69
+ new = Evolvable.__new__(Evolvable)
70
+ new.llm = self.llm
71
+ new.criteria = copy.deepcopy(self.criteria)
72
+ new._signature = self._signature
73
+ new._source = self._source
74
+ new._best_source = self._best_source
75
+ new._best_score = self._best_score
76
+ new._compiled = _compile_fn(self._best_source)
77
+ new.history = []
78
+ return new
79
+
80
+ def evaluate(
81
+ self,
82
+ dataset: Iterable[Any],
83
+ *,
84
+ show_progress: bool = True,
85
+ max_workers: int = 8,
86
+ ) -> dict[str, Any]:
87
+ return self._run_eval(list(dataset), label="eval", show_progress=show_progress, max_workers=max_workers)
88
+
89
+ def train(
90
+ self,
91
+ dataset: Iterable[Any],
92
+ *,
93
+ budget: int = 20,
94
+ show_progress: bool = True,
95
+ max_workers: int = 8,
96
+ ) -> dict[str, Any]:
97
+ data = list(dataset)
98
+ _log(f"train: budget={budget}, dataset_size={len(data)}, criteria={[c.name for c in self.criteria]}")
99
+
100
+ _log("train: running baseline eval...")
101
+ baseline = self._run_eval(data, label="baseline", show_progress=show_progress, max_workers=max_workers)
102
+ self._best_score = baseline["aggregate"]
103
+ self._best_source = self._source
104
+ self.history.append(
105
+ {
106
+ "attempt": 0,
107
+ "source": self._source,
108
+ "score": baseline["aggregate"],
109
+ "per_criterion": baseline["per_criterion"],
110
+ "result": baseline,
111
+ "accepted": True,
112
+ "kind": "baseline",
113
+ }
114
+ )
115
+
116
+ iterator: Iterable[int] = range(1, budget + 1)
117
+ if show_progress:
118
+ iterator = tqdm(iterator, desc="evolve", total=budget)
119
+
120
+ for attempt in iterator:
121
+ entry: dict[str, Any] = {"attempt": attempt, "accepted": False}
122
+ _log(f"train: attempt {attempt}/{budget} — proposing mutation...")
123
+ t_propose = time.perf_counter()
124
+ try:
125
+ new_source = self._propose_mutation()
126
+ except Exception as e:
127
+ entry["error"] = f"propose failed: {type(e).__name__}: {e}"
128
+ _log(f"train: attempt {attempt} — {entry['error']}")
129
+ self.history.append(entry)
130
+ continue
131
+ propose_elapsed = time.perf_counter() - t_propose
132
+ _log(f"train: attempt {attempt} — mutation proposed ({propose_elapsed:.1f}s, {len(new_source)} chars)")
133
+
134
+ try:
135
+ new_fn = _compile_fn(new_source)
136
+ except Exception as e:
137
+ entry["source"] = new_source
138
+ entry["error"] = f"compile failed: {type(e).__name__}: {e}"
139
+ _log(f"train: attempt {attempt} — compile failed: {e}")
140
+ self.history.append(entry)
141
+ continue
142
+
143
+ prev_compiled = self._compiled
144
+ self._compiled = new_fn
145
+ _log(f"train: attempt {attempt} — evaluating new candidate...")
146
+ try:
147
+ result = self._run_eval(data, label=f"attempt {attempt}", show_progress=False, max_workers=max_workers)
148
+ entry["score"] = result["aggregate"]
149
+ entry["per_criterion"] = result["per_criterion"]
150
+ entry["source"] = new_source
151
+ entry["result"] = result
152
+ best = self._best_score if self._best_score is not None else float("-inf")
153
+ if result["aggregate"] > best:
154
+ self._best_score = result["aggregate"]
155
+ self._best_source = new_source
156
+ entry["accepted"] = True
157
+ _log(f"train: attempt {attempt} — ACCEPTED, new best aggregate={result['aggregate']:.3f}")
158
+ else:
159
+ self._compiled = prev_compiled
160
+ _log(
161
+ f"train: attempt {attempt} — REVERTED, "
162
+ f"aggregate={result['aggregate']:.3f} <= best={self._best_score:.3f}"
163
+ )
164
+ except Exception as e:
165
+ self._compiled = prev_compiled
166
+ entry["source"] = new_source
167
+ entry["error"] = f"eval failed: {type(e).__name__}: {e}"
168
+ entry["traceback"] = traceback.format_exc(limit=3)
169
+ _log(f"train: attempt {attempt} — eval crashed: {e}")
170
+
171
+ self.history.append(entry)
172
+
173
+ self._compiled = _compile_fn(self._best_source)
174
+ self._source = self._best_source
175
+ _log(f"train: done. best_score={self._best_score:.3f}")
176
+ return {
177
+ "best_score": self._best_score,
178
+ "best_source": self._best_source,
179
+ "history": self.history,
180
+ }
181
+
182
+ def save(self, uri: str) -> Path:
183
+ path = _cache_dir(uri)
184
+ path.mkdir(parents=True, exist_ok=True)
185
+
186
+ (path / "program.py").write_text(self._best_source)
187
+
188
+ crit_dir = path / "criteria"
189
+ crit_dir.mkdir(exist_ok=True)
190
+ criteria_meta: list[dict[str, Any]] = []
191
+ for c in self.criteria:
192
+ entry = {"name": c.name, "kind": c.kind, "weight": c.weight}
193
+ if c.kind == "judge":
194
+ (crit_dir / f"{c.name}.judge.txt").write_text(c.question or "")
195
+ else:
196
+ src = c.source_code or _fallback_code_source(c)
197
+ (crit_dir / f"{c.name}.code.py").write_text(src)
198
+ entry["source_available"] = c.source_code is not None
199
+ criteria_meta.append(entry)
200
+
201
+ traces_dir = path / "traces"
202
+ traces_dir.mkdir(exist_ok=True)
203
+ if self.history:
204
+ with (traces_dir / "train.jsonl").open("w") as f:
205
+ for h in self.history:
206
+ f.write(json.dumps(_jsonable(h)) + "\n")
207
+
208
+ manifest = {
209
+ "uri": uri,
210
+ "signature": str(self._signature),
211
+ "function_name": _extract_def_name(self._best_source),
212
+ "llm": {"model": self.llm.model, "provider": self.llm.provider, "base_url": self.llm.base_url},
213
+ "variant": uri.split(":", 1)[1] if ":" in uri else None,
214
+ "criteria": criteria_meta,
215
+ "best_score": self._best_score,
216
+ "saved_at": time.time(),
217
+ }
218
+ (path / "manifest.json").write_text(json.dumps(manifest, indent=2))
219
+ _log(f"saved to {path}")
220
+ return path
221
+
222
+ @classmethod
223
+ def load(cls, uri: str, *, llm: LLM | None = None) -> Evolvable:
224
+ path = _cache_dir(uri)
225
+ if not path.exists():
226
+ raise FileNotFoundError(f"No artifact at {path}")
227
+ manifest = json.loads((path / "manifest.json").read_text())
228
+
229
+ source = (path / "program.py").read_text()
230
+ fn = _compile_fn(source)
231
+
232
+ criteria: list[Criterion] = []
233
+ for cmeta in manifest["criteria"]:
234
+ name, kind, weight = cmeta["name"], cmeta["kind"], cmeta["weight"]
235
+ if kind == "judge":
236
+ q = (path / "criteria" / f"{name}.judge.txt").read_text()
237
+ criteria.append(Criterion(name=name, kind="judge", weight=weight, question=q))
238
+ else:
239
+ src = (path / "criteria" / f"{name}.code.py").read_text()
240
+ code_fn = _compile_fn(src)
241
+ criteria.append(Criterion(name=name, kind="code", weight=weight, fn=code_fn, source_code=src))
242
+
243
+ if llm is None:
244
+ llm_meta = manifest.get("llm", {})
245
+ llm = LLM(model=llm_meta.get("model", ""), base_url=llm_meta.get("base_url"))
246
+
247
+ instance = cls.__new__(cls)
248
+ instance.llm = llm
249
+ instance.criteria = criteria
250
+ instance._signature = inspect.signature(fn)
251
+ instance._source = source
252
+ instance._best_source = source
253
+ instance._best_score = manifest.get("best_score")
254
+ instance._compiled = fn
255
+ instance.history = []
256
+ _log(f"loaded {uri} (best_score={manifest.get('best_score')})")
257
+ return instance
258
+
259
+ def _run_one_trial(self, row: Any, idx: int, total: int, max_workers: int) -> dict[str, Any]:
260
+ program_input, call_args, call_kwargs = _row_to_call(row, self._signature)
261
+ _log(f" trial {idx + 1}/{total} starting (input={_truncate(repr(program_input), 80)})")
262
+ t0 = time.perf_counter()
263
+ try:
264
+ output = self(*call_args, **call_kwargs)
265
+ err = None
266
+ except Exception as e:
267
+ output = None
268
+ err = f"{type(e).__name__}: {e}"
269
+ program_latency_ms = (time.perf_counter() - t0) * 1000
270
+
271
+ per_criterion: dict[str, dict[str, Any]] = {}
272
+ if output is None:
273
+ for c in self.criteria:
274
+ per_criterion[c.name] = {"score": -1.0, "reasoning": f"program failed: {err}"}
275
+ else:
276
+ with ThreadPoolExecutor(max_workers=min(max_workers, max(1, len(self.criteria)))) as ex:
277
+ results = list(
278
+ ex.map(
279
+ lambda c: (c.name, evaluate_criterion(c, program_input, output, self.llm)),
280
+ self.criteria,
281
+ )
282
+ )
283
+ for name, (score, reasoning) in results:
284
+ per_criterion[name] = {"score": score, "reasoning": reasoning}
285
+
286
+ scores_summary = {k: round(v["score"], 2) for k, v in per_criterion.items()}
287
+ elapsed = time.perf_counter() - t0
288
+ _log(
289
+ f" trial {idx + 1}/{total} done ({elapsed:.1f}s, "
290
+ f"output={_truncate(repr(output), 80)}, scores={scores_summary})"
291
+ )
292
+ return {
293
+ "input": program_input,
294
+ "output": output,
295
+ "error": err,
296
+ "latency_ms": program_latency_ms,
297
+ "per_criterion": per_criterion,
298
+ }
299
+
300
+ def _run_eval(
301
+ self,
302
+ data: list[Any],
303
+ *,
304
+ label: str,
305
+ show_progress: bool,
306
+ max_workers: int,
307
+ ) -> dict[str, Any]:
308
+ n = len(data)
309
+ _log(f"eval [{label}]: starting on {n} rows (max_workers={max_workers})")
310
+ t0 = time.perf_counter()
311
+
312
+ trials_indexed: list[tuple[int, dict[str, Any]]] = []
313
+ if n == 1 or max_workers <= 1:
314
+ for idx, row in enumerate(data):
315
+ trials_indexed.append((idx, self._run_one_trial(row, idx, n, max_workers)))
316
+ else:
317
+ with ThreadPoolExecutor(max_workers=min(max_workers, n)) as ex:
318
+ futures = {
319
+ ex.submit(self._run_one_trial, row, idx, n, max_workers): idx for idx, row in enumerate(data)
320
+ }
321
+ for fut in futures:
322
+ idx = futures[fut]
323
+ trials_indexed.append((idx, fut.result()))
324
+
325
+ trials_indexed.sort(key=lambda x: x[0])
326
+ trials = [t for _, t in trials_indexed]
327
+
328
+ per_criterion_mean: dict[str, float] = {}
329
+ total_weight = sum(c.weight for c in self.criteria) or 1.0
330
+ for c in self.criteria:
331
+ scores = [t["per_criterion"][c.name]["score"] for t in trials]
332
+ per_criterion_mean[c.name] = sum(scores) / max(1, len(scores))
333
+ aggregate = sum(per_criterion_mean[c.name] * c.weight for c in self.criteria) / total_weight
334
+
335
+ elapsed = time.perf_counter() - t0
336
+ _log(
337
+ f"eval [{label}]: done in {elapsed:.1f}s — aggregate={aggregate:.3f} "
338
+ f"per_criterion={ {k: round(v, 3) for k, v in per_criterion_mean.items()} }"
339
+ )
340
+
341
+ return {
342
+ "aggregate": aggregate,
343
+ "per_criterion": per_criterion_mean,
344
+ "trials": trials,
345
+ "label": label,
346
+ }
347
+
348
+ def _propose_mutation(self) -> str:
349
+ recent = [h for h in self.history if "score" in h][-3:]
350
+ if not recent:
351
+ recent = [{"attempt": 0, "source": self._best_source, "score": self._best_score or 0.0}]
352
+
353
+ criteria_desc = "\n".join(
354
+ f"- {c.name} (weight={c.weight:.2f}, kind={c.kind}): "
355
+ + (c.question if c.kind == "judge" else (c.source_code or "<code>"))
356
+ for c in self.criteria
357
+ )
358
+
359
+ history_lines = []
360
+ for h in recent:
361
+ score = h.get("score") if h.get("score") is not None else 0.0
362
+ block = f"Attempt {h['attempt']} (score={score:.3f}):\n```python\n{h.get('source', '')}\n```"
363
+ history_lines.append(block)
364
+ history_block = "\n\n".join(history_lines)
365
+
366
+ last_trials_block = ""
367
+ for h in reversed(recent):
368
+ result = h.get("result") or {}
369
+ trials = result.get("trials", [])
370
+ if trials:
371
+ lines = []
372
+ for t in trials[:3]:
373
+ score_summary = {k: round(v["score"], 2) for k, v in t["per_criterion"].items()}
374
+ reasoning_summary = {
375
+ k: _truncate(v.get("reasoning", "") or "", 200) for k, v in t["per_criterion"].items()
376
+ }
377
+ lines.append(
378
+ f"- input: {_truncate(repr(t['input']), 200)}\n"
379
+ f" output: {_truncate(repr(t['output']), 400)}\n"
380
+ f" scores: {score_summary}\n"
381
+ f" judge_reasoning: {reasoning_summary}"
382
+ )
383
+ last_trials_block = "Sample trials from last evaluation:\n" + "\n".join(lines)
384
+ break
385
+
386
+ best_score = self._best_score if self._best_score is not None else 0.0
387
+ prompt = textwrap.dedent(f"""\
388
+ You are optimizing a Python function. Goal: maximize the weighted-mean criterion score (range [-1, 1]).
389
+
390
+ Current best implementation (score={best_score:.3f}):
391
+ ```python
392
+ {self._best_source}
393
+ ```
394
+
395
+ Criteria:
396
+ {criteria_desc}
397
+
398
+ Recent attempts:
399
+ {history_block}
400
+
401
+ {last_trials_block}
402
+
403
+ You may use the injected `llm` callable inside the function. Its signature:
404
+ llm(prompt: str, *, schema: type[BaseModel] | None = None,
405
+ system: str | None = None) -> str | BaseModel
406
+ llm.batch(prompts: list[str], **kwargs) -> list
407
+
408
+ IMPORTANT — token budgets:
409
+ - The injected `llm` is a 2026-era reasoning model. It uses substantial tokens for internal
410
+ thinking BEFORE producing visible content. Defaults are already tuned for this.
411
+ - DO NOT pass `max_tokens` to `llm()`. The default is set to a high value (32k+).
412
+ Passing a small `max_tokens` (e.g. 100, 500, 1024) will cause the reasoning preamble
413
+ to consume the entire budget, leaving content empty and the function returning ''.
414
+
415
+ Rules:
416
+ - Preserve the function signature: {self._signature}.
417
+ - Reply with ONLY the function source as a single Python code block (```python ... ```).
418
+ - Do not include explanations outside the code block.
419
+ """)
420
+
421
+ response = self.llm(
422
+ prompt,
423
+ system=(
424
+ "You are an expert Python programmer iteratively refining a function under criteria. "
425
+ "You target 2026-era reasoning LLMs and never artificially cap token budgets."
426
+ ),
427
+ )
428
+ if not isinstance(response, str):
429
+ response = str(response)
430
+ return _extract_python(response)
431
+
432
+ def __repr__(self) -> str:
433
+ name = _extract_def_name(self._best_source) or "<evolvable>"
434
+ return f"Evolvable({name}, criteria={[c.name for c in self.criteria]}, llm={self.llm.model})"
435
+
436
+
437
+ def _compile_fn(source: str) -> Callable[..., Any]:
438
+ ns: dict[str, Any] = {}
439
+ exec(compile(source, "<evolvable>", "exec"), ns)
440
+ fns = [v for v in ns.values() if inspect.isfunction(v)]
441
+ if not fns:
442
+ raise ValueError("no function found in source")
443
+ return fns[-1]
444
+
445
+
446
+ def _get_source(fn: Callable) -> str:
447
+ try:
448
+ return textwrap.dedent(inspect.getsource(fn))
449
+ except (OSError, TypeError) as e:
450
+ raise ValueError(f"cannot get source for {fn}: {e}") from e
451
+
452
+
453
+ def _extract_python(text: str) -> str:
454
+ m = re.search(r"```(?:python)?\s*\n(.*?)```", text, re.DOTALL)
455
+ if m:
456
+ return m.group(1).strip()
457
+ return text.strip()
458
+
459
+
460
+ def _extract_def_name(source: str) -> str | None:
461
+ m = re.search(r"^def\s+(\w+)\s*\(", source, re.MULTILINE)
462
+ return m.group(1) if m else None
463
+
464
+
465
+ def _row_to_call(row: Any, sig: inspect.Signature) -> tuple[Any, tuple, dict]:
466
+ params = [p for p in sig.parameters.values() if p.name != "llm"]
467
+ if isinstance(row, dict):
468
+ kwargs = {k: v for k, v in row.items() if k in {p.name for p in sig.parameters}}
469
+ first_param = params[0].name if params else None
470
+ program_input = kwargs.get(first_param) if first_param else row
471
+ return program_input, (), kwargs
472
+ return row, (row,), {}
473
+
474
+
475
+ def _cache_dir(uri: str) -> Path:
476
+ root = Path(os.environ.get("EVOLVERS_CACHE", "~/.cache/evolvers")).expanduser()
477
+ return root / uri
478
+
479
+
480
+ def _fallback_code_source(c: Criterion) -> str:
481
+ return f"# source not captured for {c.name!r}\ndef {c.name}(output):\n return 0.0\n"
482
+
483
+
484
+ def _jsonable(obj: Any) -> Any:
485
+ try:
486
+ json.dumps(obj)
487
+ return obj
488
+ except (TypeError, ValueError):
489
+ if isinstance(obj, dict):
490
+ return {k: _jsonable(v) for k, v in obj.items()}
491
+ if isinstance(obj, (list, tuple)):
492
+ return [_jsonable(v) for v in obj]
493
+ return repr(obj)
494
+
495
+
496
+ def _truncate(s: str, n: int) -> str:
497
+ return s if len(s) <= n else s[: n - 3] + "..."
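
How rows reach the evolved function (see `_row_to_call` and `Evolvable.__call__` above), sketched with an assumed `answer` function and example rows:

def answer(question: str, llm) -> str:
    return llm(f"Answer briefly: {question}")

# dict rows: keys that match the signature become keyword arguments, and the first
# non-llm parameter ("question") is reported as the program input to the criteria.
dataset = [
    {"question": "What is a wheel file?"},
    {"question": "What does __all__ do?"},
]

# non-dict rows: the row itself is passed as the single positional argument.
dataset_plain = ["What is a wheel file?", "What does __all__ do?"]

# In neither case is `llm` taken from the row; Evolvable.__call__ injects the bound LLM.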
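
`train()` is an accept-if-better loop: a baseline eval, then up to `budget` LLM-proposed rewrites, each compiled and re-evaluated, with a revert whenever the aggregate does not improve. A sketch of inspecting the result, assuming the `program` and `dataset` from the earlier sketches:

result = program.train(dataset, budget=10, max_workers=4)

print(result["best_score"])       # weighted-mean aggregate in [-1, 1] for the accepted source
print(result["best_source"])      # same string as program.source after training

for h in result["history"]:
    status = "accepted" if h.get("accepted") else ("error" if "error" in h else "rejected")
    print(h["attempt"], status, h.get("score"), h.get("error", ""))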
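
Persistence round trip, assuming the `program` and `llm` from the earlier sketches; the cache root below is an illustrative override of the `~/.cache/evolvers` default via `EVOLVERS_CACHE`:

import os
os.environ["EVOLVERS_CACHE"] = "/tmp/evolvers-demo"   # illustrative override; read at save/load time

from evolvers import Evolvable

path = program.save("summarizer:v1")                   # -> /tmp/evolvers-demo/summarizer:v1
print(sorted(p.name for p in path.iterdir()))
# ['criteria', 'manifest.json', 'program.py', 'traces']
#   program.py     best evolved source          criteria/*.judge.txt, *.code.py  rubric definitions
#   manifest.json  signature, llm, best_score   traces/train.jsonl               training history

restored = Evolvable.load("summarizer:v1", llm=llm)
restored("Wheels are the standard built-package format for Python.")  # runs the saved source with the bound LLM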