claude-evolve 1.11.17 → 1.11.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/claude-evolve-check +20 -8
- package/lib/ai-cli.sh +26 -14
- package/lib/ai_cli.py +3 -2
- package/lib/config.sh +4 -4
- package/lib/evolve_run.py +5 -1
- package/lib/evolve_worker.py +7 -1
- package/lib/llm_bandit.py +21 -5
- package/package.json +1 -1
- package/templates/config.yaml +9 -7
package/bin/claude-evolve-check
CHANGED
|
@@ -90,7 +90,7 @@ SCRIPT
|
|
|
90
90
|
gemini-pro)
|
|
91
91
|
cat > "$test_script" << 'SCRIPT'
|
|
92
92
|
#!/usr/bin/env bash
|
|
93
|
-
exec gemini -y -m gemini-3
|
|
93
|
+
exec gemini -y -m auto-gemini-3 -p "$1"
|
|
94
94
|
SCRIPT
|
|
95
95
|
;;
|
|
96
96
|
gemini-flash|gemini-3-flash)
|
|
@@ -141,10 +141,22 @@ SCRIPT
|
|
|
141
141
|
exec codex exec --dangerously-bypass-approvals-and-sandbox "$1"
|
|
142
142
|
SCRIPT
|
|
143
143
|
;;
|
|
144
|
-
|
|
144
|
+
codex-think)
|
|
145
145
|
cat > "$test_script" << 'SCRIPT'
|
|
146
146
|
#!/usr/bin/env bash
|
|
147
|
-
exec codex exec -m gpt-5-
|
|
147
|
+
exec codex exec -m gpt-5.4 -c model_reasoning_effort="high" --dangerously-bypass-approvals-and-sandbox "$1"
|
|
148
|
+
SCRIPT
|
|
149
|
+
;;
|
|
150
|
+
codex-coding)
|
|
151
|
+
cat > "$test_script" << 'SCRIPT'
|
|
152
|
+
#!/usr/bin/env bash
|
|
153
|
+
exec codex exec -m gpt-5.4 -c model_reasoning_effort="medium" --dangerously-bypass-approvals-and-sandbox "$1"
|
|
154
|
+
SCRIPT
|
|
155
|
+
;;
|
|
156
|
+
gpt-5.4)
|
|
157
|
+
cat > "$test_script" << 'SCRIPT'
|
|
158
|
+
#!/usr/bin/env bash
|
|
159
|
+
exec codex exec -m gpt-5.4 --dangerously-bypass-approvals-and-sandbox "$1"
|
|
148
160
|
SCRIPT
|
|
149
161
|
;;
|
|
150
162
|
gpt-5.2)
|
|
@@ -159,22 +171,22 @@ SCRIPT
|
|
|
159
171
|
exec codex exec -m gpt-5.3-codex --dangerously-bypass-approvals-and-sandbox "$1"
|
|
160
172
|
SCRIPT
|
|
161
173
|
;;
|
|
162
|
-
gpt-5.
|
|
174
|
+
codex-spark|gpt-5.1-codex-mini)
|
|
163
175
|
cat > "$test_script" << 'SCRIPT'
|
|
164
176
|
#!/usr/bin/env bash
|
|
165
|
-
exec codex exec -m gpt-5.
|
|
177
|
+
exec codex exec -m gpt-5.1-codex-mini --dangerously-bypass-approvals-and-sandbox "$1"
|
|
166
178
|
SCRIPT
|
|
167
179
|
;;
|
|
168
180
|
gemini-5-flash)
|
|
169
181
|
cat > "$test_script" << 'SCRIPT'
|
|
170
182
|
#!/usr/bin/env bash
|
|
171
|
-
exec gemini -y -m gemini-
|
|
183
|
+
exec gemini -y -m gemini-3-flash-preview -p "$1"
|
|
172
184
|
SCRIPT
|
|
173
185
|
;;
|
|
174
|
-
qwen)
|
|
186
|
+
qwen-openrouter)
|
|
175
187
|
cat > "$test_script" << 'SCRIPT'
|
|
176
188
|
#!/usr/bin/env bash
|
|
177
|
-
exec opencode -m openrouter/qwen/qwen3.
|
|
189
|
+
exec opencode -m openrouter/qwen/qwen3.6-plus:free run "$1"
|
|
178
190
|
SCRIPT
|
|
179
191
|
;;
|
|
180
192
|
*)
|
package/lib/ai-cli.sh
CHANGED
|
@@ -105,10 +105,22 @@ $prompt"
|
|
|
105
105
|
ai_output=$(codex exec -m "$codex_gpt5_model" --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
|
|
106
106
|
local ai_exit_code=$?
|
|
107
107
|
;;
|
|
108
|
-
|
|
108
|
+
codex-think)
|
|
109
109
|
local ai_output
|
|
110
|
-
# GPT-5
|
|
111
|
-
ai_output=$(codex exec -m gpt-5-
|
|
110
|
+
# GPT-5.4 high reasoning - for ideation tasks requiring deep thinking
|
|
111
|
+
ai_output=$(codex exec -m gpt-5.4 -c model_reasoning_effort="high" --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
|
|
112
|
+
local ai_exit_code=$?
|
|
113
|
+
;;
|
|
114
|
+
codex-coding)
|
|
115
|
+
local ai_output
|
|
116
|
+
# GPT-5.4 medium reasoning - for coding/implementation tasks
|
|
117
|
+
ai_output=$(codex exec -m gpt-5.4 -c model_reasoning_effort="medium" --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
|
|
118
|
+
local ai_exit_code=$?
|
|
119
|
+
;;
|
|
120
|
+
gpt-5.4)
|
|
121
|
+
local ai_output
|
|
122
|
+
# GPT-5.4 - latest frontier agentic coding model via Codex CLI
|
|
123
|
+
ai_output=$(codex exec -m gpt-5.4 --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
|
|
112
124
|
local ai_exit_code=$?
|
|
113
125
|
;;
|
|
114
126
|
gpt-5.2)
|
|
@@ -123,10 +135,10 @@ $prompt"
|
|
|
123
135
|
ai_output=$(codex exec -m gpt-5.3-codex --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
|
|
124
136
|
local ai_exit_code=$?
|
|
125
137
|
;;
|
|
126
|
-
gpt-5.
|
|
138
|
+
codex-spark|gpt-5.1-codex-mini)
|
|
127
139
|
local ai_output
|
|
128
|
-
# GPT-5.
|
|
129
|
-
ai_output=$(codex exec -m gpt-5.
|
|
140
|
+
# GPT-5.1 Codex Mini - cheap/fast lightweight fallback via Codex CLI
|
|
141
|
+
ai_output=$(codex exec -m gpt-5.1-codex-mini --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
|
|
130
142
|
local ai_exit_code=$?
|
|
131
143
|
;;
|
|
132
144
|
o3high)
|
|
@@ -136,8 +148,8 @@ $prompt"
|
|
|
136
148
|
;;
|
|
137
149
|
gemini-pro)
|
|
138
150
|
local ai_output
|
|
139
|
-
# Gemini streams output while working
|
|
140
|
-
ai_output=$(gemini -y -m gemini-3
|
|
151
|
+
# Gemini 3 auto-routing (gemini-3.1-pro / gemini-3-flash) - streams output while working
|
|
152
|
+
ai_output=$(gemini -y -m auto-gemini-3 -p "$prompt" 2>&1)
|
|
141
153
|
local ai_exit_code=$?
|
|
142
154
|
;;
|
|
143
155
|
gemini-flash)
|
|
@@ -149,7 +161,7 @@ $prompt"
|
|
|
149
161
|
gemini-5-flash)
|
|
150
162
|
local ai_output
|
|
151
163
|
# Gemini 5 Flash - cheap fallback model
|
|
152
|
-
ai_output=$(gemini -y -m gemini-
|
|
164
|
+
ai_output=$(gemini -y -m gemini-3-flash-preview -p "$prompt" 2>&1)
|
|
153
165
|
local ai_exit_code=$?
|
|
154
166
|
;;
|
|
155
167
|
gemini-3-pro-preview)
|
|
@@ -252,11 +264,11 @@ $prompt"
|
|
|
252
264
|
ai_output=$(opencode -m openrouter/moonshotai/kimi-k2.5 run "$prompt" 2>&1)
|
|
253
265
|
local ai_exit_code=$?
|
|
254
266
|
;;
|
|
255
|
-
qwen)
|
|
267
|
+
qwen-openrouter)
|
|
256
268
|
local ai_output
|
|
257
|
-
# Qwen latest - Alibaba's flagship model (currently qwen3.
|
|
269
|
+
# Qwen latest - Alibaba's flagship model (currently qwen3.6-plus, free promotional tier)
|
|
258
270
|
# Linear attention + sparse MoE, strong multimodal capabilities
|
|
259
|
-
ai_output=$(opencode -m openrouter/qwen/qwen3.
|
|
271
|
+
ai_output=$(opencode -m openrouter/qwen/qwen3.6-plus:free run "$prompt" 2>&1)
|
|
260
272
|
local ai_exit_code=$?
|
|
261
273
|
;;
|
|
262
274
|
codex-oss-local)
|
|
@@ -288,8 +300,8 @@ $prompt"
|
|
|
288
300
|
echo "[AI] Raw output from $model_name:" >&2
|
|
289
301
|
echo "----------------------------------------" >&2
|
|
290
302
|
if [[ ${#ai_output} -gt 2000 ]]; then
|
|
291
|
-
echo "$ai_output
|
|
292
|
-
echo "
|
|
303
|
+
echo "... (truncated from ${#ai_output} characters to last 50 lines) ..." >&2
|
|
304
|
+
echo "$ai_output" | tail -50 >&2
|
|
293
305
|
else
|
|
294
306
|
echo "$ai_output" >&2
|
|
295
307
|
fi
|
package/lib/ai_cli.py
CHANGED
|
@@ -248,8 +248,9 @@ MODEL_TIMEOUTS = {
|
|
|
248
248
|
'gemini-pro': 1800, 'gemini-flash': 1200, 'gemini-3-flash': 600,
|
|
249
249
|
'gemini-3-pro-preview': 1800, 'gemini-5-flash': 600,
|
|
250
250
|
# Codex/OpenAI models - 10 min standard
|
|
251
|
-
'
|
|
252
|
-
'gpt-5.3-codex
|
|
251
|
+
'codex-think': 900, 'codex-coding': 600, 'codex-spark': 300,
|
|
252
|
+
'gpt-5.4': 600, 'gpt-5.2': 600, 'gpt-5.3-codex': 600,
|
|
253
|
+
'gpt-5.1-codex-mini': 300,
|
|
253
254
|
# Z.AI agentic modes - 30 min (can be slow)
|
|
254
255
|
'glm-zai': 1800, 'glm-5-zai': 1800,
|
|
255
256
|
# Codex local - 40 min (local inference can be slow)
|
package/lib/config.sh
CHANGED
|
@@ -63,13 +63,13 @@ DEFAULT_WORKER_MAX_CANDIDATES=3
|
|
|
63
63
|
#
|
|
64
64
|
# Run: Subscription-based agentic models for code generation
|
|
65
65
|
# All CLI tools (opencode, claude, gemini, kimi) are agentic and can edit files
|
|
66
|
-
DEFAULT_LLM_RUN="gemini-pro gemini-pro glm-5-zai glm-5-zai kimi-coder kimi-coder
|
|
67
|
-
DEFAULT_LLM_RUN_FALLBACK="haiku glm-5-zai gemini-5-flash
|
|
66
|
+
DEFAULT_LLM_RUN="gemini-pro gemini-pro glm-5-zai glm-5-zai kimi-coder kimi-coder codex-coding codex-coding sonnet"
|
|
67
|
+
DEFAULT_LLM_RUN_FALLBACK="haiku glm-5-zai gemini-5-flash codex-spark"
|
|
68
68
|
#
|
|
69
69
|
# Ideate: Agentic models that can edit files for ideation
|
|
70
70
|
# All CLI tools (opencode, claude, gemini, kimi) are agentic and can edit files
|
|
71
|
-
DEFAULT_LLM_IDEATE="opus-think glm-5-zai gemini-pro kimi-coder gpt-5.2
|
|
72
|
-
DEFAULT_LLM_IDEATE_FALLBACK="haiku glm-5-zai gemini-5-flash
|
|
71
|
+
DEFAULT_LLM_IDEATE="opus-think glm-5-zai gemini-pro kimi-coder gpt-5.2 codex-think qwen-openrouter"
|
|
72
|
+
DEFAULT_LLM_IDEATE_FALLBACK="haiku glm-5-zai gemini-5-flash codex-spark"
|
|
73
73
|
|
|
74
74
|
# Load configuration from a YAML file and update variables
|
|
75
75
|
_load_yaml_config() {
|
package/lib/evolve_run.py
CHANGED
|
@@ -71,7 +71,11 @@ class WorkerPool:
|
|
|
71
71
|
try:
|
|
72
72
|
# Don't capture output - let it stream directly to terminal
|
|
73
73
|
# This provides real-time visibility into which models are being used
|
|
74
|
-
|
|
74
|
+
# AIDEV-NOTE: Explicitly pass stdin=DEVNULL so workers don't inherit
|
|
75
|
+
# a closed/bad stdin FD from parent (e.g. when run via nohup or after
|
|
76
|
+
# terminal disconnect). Without this, Python workers crash at startup
|
|
77
|
+
# with "OSError: [Errno 9] Bad file descriptor" on sys stream init.
|
|
78
|
+
proc = subprocess.Popen(cmd, stdin=subprocess.DEVNULL)
|
|
75
79
|
self.workers[proc.pid] = proc
|
|
76
80
|
log(f"Spawned worker {proc.pid}")
|
|
77
81
|
return proc.pid
|
package/lib/evolve_worker.py
CHANGED
|
@@ -205,9 +205,15 @@ CRITICAL: If you do not know how to implement what was asked for, or if the requ
|
|
|
205
205
|
preview = output[-300:] if output else "(empty)"
|
|
206
206
|
log(f"Bandit model {selected_model} completed but didn't modify file ({len(output)} chars), trying fallback...")
|
|
207
207
|
log(f"AI output preview: {preview}")
|
|
208
|
+
# AIDEV-NOTE: Report no-modification as failure to bandit
|
|
209
|
+
self.bandit.update(selected_model, child_score=None, parent_score=self._parent_score)
|
|
210
|
+
log(f"Bandit update: {selected_model} no file modification")
|
|
208
211
|
|
|
209
212
|
except AIError as e:
|
|
210
213
|
log(f"Bandit model {selected_model} failed: {e}, trying fallback...")
|
|
214
|
+
# AIDEV-NOTE: Report AI-level failure to bandit so it learns to avoid broken models
|
|
215
|
+
self.bandit.update(selected_model, child_score=None, parent_score=self._parent_score)
|
|
216
|
+
log(f"Bandit update: {selected_model} AI call failed")
|
|
211
217
|
|
|
212
218
|
# Fallback to round-based retry with all models
|
|
213
219
|
try:
|
|
@@ -229,7 +235,7 @@ CRITICAL: If you do not know how to implement what was asked for, or if the requ
|
|
|
229
235
|
return True, model
|
|
230
236
|
else:
|
|
231
237
|
# AIDEV-NOTE: Log output so we can diagnose why file wasn't modified
|
|
232
|
-
preview = output[:
|
|
238
|
+
preview = output[-300:] if output else "(empty)"
|
|
233
239
|
log(f"AI completed but did not modify file ({len(output)} chars)")
|
|
234
240
|
log(f"AI output preview: {preview}")
|
|
235
241
|
return False, model
|
package/lib/llm_bandit.py
CHANGED
|
@@ -86,6 +86,12 @@ class LLMBandit:
|
|
|
86
86
|
# Baseline score for normalizing improvements
|
|
87
87
|
self._baseline_score: float = 0.0
|
|
88
88
|
|
|
89
|
+
# AIDEV-NOTE: Decay counter - only apply decay every N updates to prevent
|
|
90
|
+
# aggressive memory loss. With decay_factor=0.95 applied every update,
|
|
91
|
+
# n_completed floors at 1 after ~50 updates and the bandit can't learn.
|
|
92
|
+
self._updates_since_decay: int = 0
|
|
93
|
+
self._decay_interval: int = 50 # Apply decay every 50 updates
|
|
94
|
+
|
|
89
95
|
# Load existing state if available
|
|
90
96
|
if state_file and Path(state_file).exists():
|
|
91
97
|
self.load()
|
|
@@ -215,13 +221,23 @@ class LLMBandit:
|
|
|
215
221
|
return improvement
|
|
216
222
|
|
|
217
223
|
def _apply_decay(self) -> None:
|
|
218
|
-
"""Apply decay to reduce influence of old observations.
|
|
224
|
+
"""Apply decay to reduce influence of old observations.
|
|
225
|
+
|
|
226
|
+
AIDEV-NOTE: Only applies every _decay_interval updates to prevent
|
|
227
|
+
aggressive memory loss. The int() truncation on n_completed was
|
|
228
|
+
destroying the bandit's ability to learn from failures.
|
|
229
|
+
"""
|
|
230
|
+
self._updates_since_decay += 1
|
|
231
|
+
if self._updates_since_decay < self._decay_interval:
|
|
232
|
+
return
|
|
233
|
+
|
|
234
|
+
self._updates_since_decay = 0
|
|
219
235
|
for stats in self.models.values():
|
|
220
|
-
# Decay
|
|
236
|
+
# Decay totals to reduce influence of old observations
|
|
221
237
|
stats.total_improvement *= self.decay_factor
|
|
222
|
-
#
|
|
223
|
-
if stats.n_completed >
|
|
224
|
-
stats.n_completed = max(
|
|
238
|
+
# Decay counts but preserve enough memory to differentiate models
|
|
239
|
+
if stats.n_completed > 2:
|
|
240
|
+
stats.n_completed = max(2, int(stats.n_completed * self.decay_factor))
|
|
225
241
|
|
|
226
242
|
def save(self) -> None:
|
|
227
243
|
"""Persist state to file."""
|
package/package.json
CHANGED
package/templates/config.yaml
CHANGED
|
@@ -94,10 +94,10 @@ llm_cli:
|
|
|
94
94
|
|
|
95
95
|
# Default configuration: sonnet at ~11%, rest doubled for cost savings
|
|
96
96
|
# Commented out because these change over time; uncomment to override
|
|
97
|
-
#run: gemini-pro gemini-pro glm-5-zai glm-5-zai kimi-coder kimi-coder
|
|
98
|
-
#ideate: opus-think glm-5-zai gemini-pro kimi-coder gpt-5.2
|
|
99
|
-
#run_fallback: haiku glm-5-zai gemini-5-flash
|
|
100
|
-
#ideate_fallback: haiku glm-5-zai gemini-5-flash
|
|
97
|
+
#run: gemini-pro gemini-pro glm-5-zai glm-5-zai kimi-coder kimi-coder codex-coding codex-coding sonnet
|
|
98
|
+
#ideate: opus-think glm-5-zai gemini-pro kimi-coder gpt-5.2 codex-think qwen-openrouter
|
|
99
|
+
#run_fallback: haiku glm-5-zai gemini-5-flash codex-spark
|
|
100
|
+
#ideate_fallback: haiku glm-5-zai gemini-5-flash codex-spark
|
|
101
101
|
|
|
102
102
|
# Available models:
|
|
103
103
|
# Claude (subscription-based, watch usage limits):
|
|
@@ -108,10 +108,12 @@ llm_cli:
|
|
|
108
108
|
# - haiku: Claude Haiku via Claude CLI (cheap fallback)
|
|
109
109
|
#
|
|
110
110
|
# Codex/OpenAI (subscription-based):
|
|
111
|
-
# -
|
|
111
|
+
# - codex-think: GPT-5.4 high reasoning effort (ideation)
|
|
112
|
+
# - codex-coding: GPT-5.4 medium reasoning effort (coding/run)
|
|
113
|
+
# - codex-spark: GPT-5.1 Codex Mini (lightweight fallback)
|
|
114
|
+
# - gpt-5.4: GPT-5.4 no reasoning effort override via Codex CLI
|
|
112
115
|
# - gpt-5.2: GPT-5.2 via Codex CLI
|
|
113
|
-
# - gpt-5.3-codex: GPT-5.3 Codex via Codex CLI
|
|
114
|
-
# - gpt-5.3-codex-spark: GPT-5.3 Codex Spark (lightweight fallback) via Codex CLI
|
|
116
|
+
# - gpt-5.3-codex: GPT-5.3 Codex (code-specialized) via Codex CLI
|
|
115
117
|
# - gpt5: GPT-5 via Codex CLI (legacy alias)
|
|
116
118
|
# - gpt5high: GPT-5 via Codex CLI (high reasoning)
|
|
117
119
|
# - o3high: O3 via Codex CLI (high reasoning)
|