claude-evolve 1.11.16 → 1.11.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -141,7 +141,37 @@ SCRIPT
141
141
  exec codex exec --dangerously-bypass-approvals-and-sandbox "$1"
142
142
  SCRIPT
143
143
  ;;
144
- qwen)
144
+ gpt-5-codex)
145
+ cat > "$test_script" << 'SCRIPT'
146
+ #!/usr/bin/env bash
147
+ exec codex exec -m gpt-5-codex --dangerously-bypass-approvals-and-sandbox "$1"
148
+ SCRIPT
149
+ ;;
150
+ gpt-5.2)
151
+ cat > "$test_script" << 'SCRIPT'
152
+ #!/usr/bin/env bash
153
+ exec codex exec -m gpt-5.2 --dangerously-bypass-approvals-and-sandbox "$1"
154
+ SCRIPT
155
+ ;;
156
+ gpt-5.3-codex)
157
+ cat > "$test_script" << 'SCRIPT'
158
+ #!/usr/bin/env bash
159
+ exec codex exec -m gpt-5.3-codex --dangerously-bypass-approvals-and-sandbox "$1"
160
+ SCRIPT
161
+ ;;
162
+ gpt-5.3-codex-spark)
163
+ cat > "$test_script" << 'SCRIPT'
164
+ #!/usr/bin/env bash
165
+ exec codex exec -m gpt-5.3-codex-spark --dangerously-bypass-approvals-and-sandbox "$1"
166
+ SCRIPT
167
+ ;;
168
+ gemini-5-flash)
169
+ cat > "$test_script" << 'SCRIPT'
170
+ #!/usr/bin/env bash
171
+ exec gemini -y -m gemini-5-flash -p "$1"
172
+ SCRIPT
173
+ ;;
174
+ qwen-openrouter)
145
175
  cat > "$test_script" << 'SCRIPT'
146
176
  #!/usr/bin/env bash
147
177
  exec opencode -m openrouter/qwen/qwen3.5-plus-02-15 run "$1"
package/lib/ai-cli.sh CHANGED
@@ -105,6 +105,30 @@ $prompt"
105
105
  ai_output=$(codex exec -m "$codex_gpt5_model" --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
106
106
  local ai_exit_code=$?
107
107
  ;;
108
+ gpt-5-codex)
109
+ local ai_output
110
+ # GPT-5 Codex - code-specialized variant via Codex CLI
111
+ ai_output=$(codex exec -m gpt-5-codex --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
112
+ local ai_exit_code=$?
113
+ ;;
114
+ gpt-5.2)
115
+ local ai_output
116
+ # GPT-5.2 via Codex CLI
117
+ ai_output=$(codex exec -m gpt-5.2 --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
118
+ local ai_exit_code=$?
119
+ ;;
120
+ gpt-5.3-codex)
121
+ local ai_output
122
+ # GPT-5.3 Codex via Codex CLI
123
+ ai_output=$(codex exec -m gpt-5.3-codex --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
124
+ local ai_exit_code=$?
125
+ ;;
126
+ gpt-5.3-codex-spark)
127
+ local ai_output
128
+ # GPT-5.3 Codex Spark - lightweight fallback via Codex CLI
129
+ ai_output=$(codex exec -m gpt-5.3-codex-spark --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
130
+ local ai_exit_code=$?
131
+ ;;
108
132
  o3high)
109
133
  local ai_output
110
134
  ai_output=$(codex exec -m o3-mini -c model_reasoning_effort="high" --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
@@ -122,6 +146,12 @@ $prompt"
122
146
  ai_output=$(gemini -y -m gemini-2.5-flash -p "$prompt" 2>&1)
123
147
  local ai_exit_code=$?
124
148
  ;;
149
+ gemini-5-flash)
150
+ local ai_output
151
+ # Gemini 5 Flash - cheap fallback model
152
+ ai_output=$(gemini -y -m gemini-5-flash -p "$prompt" 2>&1)
153
+ local ai_exit_code=$?
154
+ ;;
125
155
  gemini-3-pro-preview)
126
156
  local ai_output
127
157
  # Gemini v3 Pro Preview via OpenRouter - EXPENSIVE
@@ -222,7 +252,7 @@ $prompt"
222
252
  ai_output=$(opencode -m openrouter/moonshotai/kimi-k2.5 run "$prompt" 2>&1)
223
253
  local ai_exit_code=$?
224
254
  ;;
225
- qwen)
255
+ qwen-openrouter)
226
256
  local ai_output
227
257
  # Qwen latest - Alibaba's flagship model (currently qwen3.5-plus)
228
258
  # Linear attention + sparse MoE, strong multimodal capabilities
@@ -258,8 +288,8 @@ $prompt"
258
288
  echo "[AI] Raw output from $model_name:" >&2
259
289
  echo "----------------------------------------" >&2
260
290
  if [[ ${#ai_output} -gt 2000 ]]; then
261
- echo "$ai_output" | head -50 >&2
262
- echo "... (truncated from ${#ai_output} characters to first 50 lines) ..." >&2
291
+ echo "... (truncated from ${#ai_output} characters to last 50 lines) ..." >&2
292
+ echo "$ai_output" | tail -50 >&2
263
293
  else
264
294
  echo "$ai_output" >&2
265
295
  fi
package/lib/ai_cli.py CHANGED
@@ -246,7 +246,10 @@ MODEL_TIMEOUTS = {
246
246
  'opus-think': 1800, 'sonnet-think': 1800,
247
247
  # Gemini - 30 min for pro (streams while working), 20 min for flash, 10 min for gemini-3-flash/gemini-5-flash
248
248
  'gemini-pro': 1800, 'gemini-flash': 1200, 'gemini-3-flash': 600,
249
- 'gemini-3-pro-preview': 1800,
249
+ 'gemini-3-pro-preview': 1800, 'gemini-5-flash': 600,
250
+ # Codex/OpenAI models - 10 min standard, 5 min for the lightweight spark variant
251
+ 'gpt-5-codex': 600, 'gpt-5.2': 600, 'gpt-5.3-codex': 600,
252
+ 'gpt-5.3-codex-spark': 300,
250
253
  # Z.AI agentic modes - 30 min (can be slow)
251
254
  'glm-zai': 1800, 'glm-5-zai': 1800,
252
255
  # Codex local - 40 min (local inference can be slow)
package/lib/config.sh CHANGED
@@ -63,13 +63,13 @@ DEFAULT_WORKER_MAX_CANDIDATES=3
63
63
  #
64
64
  # Run: Subscription-based agentic models for code generation
65
65
  # All CLI tools (opencode, claude, gemini, kimi, codex) are agentic and can edit files
66
- DEFAULT_LLM_RUN="sonnet glm-5-zai kimi-coder gemini-pro"
67
- DEFAULT_LLM_RUN_FALLBACK="codex-oss-local"
66
+ DEFAULT_LLM_RUN="gemini-pro gemini-pro glm-5-zai glm-5-zai kimi-coder kimi-coder gpt-5-codex gpt-5-codex sonnet"
67
+ DEFAULT_LLM_RUN_FALLBACK="haiku glm-5-zai gemini-5-flash gpt-5.3-codex-spark"
68
68
  #
69
69
  # Ideate: Agentic models that can edit files for ideation
70
70
  # All CLI tools (opencode, claude, gemini, kimi, codex) are agentic and can edit files
71
- DEFAULT_LLM_IDEATE="opus-think sonnet-think glm-5-zai gemini-pro kimi-coder"
72
- DEFAULT_LLM_IDEATE_FALLBACK="sonnet glm-5-zai"
71
+ DEFAULT_LLM_IDEATE="opus-think glm-5-zai gemini-pro kimi-coder gpt-5.2 gpt-5.3-codex qwen-openrouter"
72
+ DEFAULT_LLM_IDEATE_FALLBACK="haiku glm-5-zai gemini-5-flash gpt-5.3-codex-spark"
73
73
 
74
74
  # Load configuration from a YAML file and update variables
75
75
  _load_yaml_config() {
@@ -318,7 +318,7 @@ show_config() {
318
318
  echo " Worker max candidates: $WORKER_MAX_CANDIDATES"
319
319
  echo " LLM configuration:"
320
320
  # Show LLM configurations using dynamic variable names
321
- for model in gpt5high o3high codex gemini opus opus_think sonnet sonnet_think cursor_sonnet cursor_opus glm deepseek; do
321
+ for model in gpt5high o3high gpt_5_codex gpt_5_2 gpt_5_3_codex gpt_5_3_codex_spark codex gemini gemini_5_flash opus opus_think sonnet sonnet_think cursor_sonnet cursor_opus glm deepseek; do
322
322
  var_name="LLM_CLI_${model}"
323
323
  var_value=$(eval echo "\$$var_name")
324
324
  if [[ -n "$var_value" ]]; then
package/lib/evolve_run.py CHANGED
@@ -71,7 +71,11 @@ class WorkerPool:
71
71
  try:
72
72
  # Don't capture output - let it stream directly to terminal
73
73
  # This provides real-time visibility into which models are being used
74
- proc = subprocess.Popen(cmd)
74
+ # AIDEV-NOTE: Explicitly pass stdin=DEVNULL so workers don't inherit
75
+ # a closed/bad stdin FD from parent (e.g. when run via nohup or after
76
+ # terminal disconnect). Without this, Python workers crash at startup
77
+ # with "OSError: [Errno 9] Bad file descriptor" on sys stream init.
78
+ proc = subprocess.Popen(cmd, stdin=subprocess.DEVNULL)
75
79
  self.workers[proc.pid] = proc
76
80
  log(f"Spawned worker {proc.pid}")
77
81
  return proc.pid
@@ -205,9 +205,15 @@ CRITICAL: If you do not know how to implement what was asked for, or if the requ
205
205
  preview = output[-300:] if output else "(empty)"
206
206
  log(f"Bandit model {selected_model} completed but didn't modify file ({len(output)} chars), trying fallback...")
207
207
  log(f"AI output preview: {preview}")
208
+ # AIDEV-NOTE: Report no-modification as failure to bandit
209
+ self.bandit.update(selected_model, child_score=None, parent_score=self._parent_score)
210
+ log(f"Bandit update: {selected_model} no file modification")
208
211
 
209
212
  except AIError as e:
210
213
  log(f"Bandit model {selected_model} failed: {e}, trying fallback...")
214
+ # AIDEV-NOTE: Report AI-level failure to bandit so it learns to avoid broken models
215
+ self.bandit.update(selected_model, child_score=None, parent_score=self._parent_score)
216
+ log(f"Bandit update: {selected_model} AI call failed")
211
217
 
212
218
  # Fallback to round-based retry with all models
213
219
  try:
@@ -229,7 +235,7 @@ CRITICAL: If you do not know how to implement what was asked for, or if the requ
229
235
  return True, model
230
236
  else:
231
237
  # AIDEV-NOTE: Log output so we can diagnose why file wasn't modified
232
- preview = output[:300] if output else "(empty)"
238
+ preview = output[-300:] if output else "(empty)"
233
239
  log(f"AI completed but did not modify file ({len(output)} chars)")
234
240
  log(f"AI output preview: {preview}")
235
241
  return False, model
package/lib/llm_bandit.py CHANGED
@@ -86,6 +86,12 @@ class LLMBandit:
86
86
  # Baseline score for normalizing improvements
87
87
  self._baseline_score: float = 0.0
88
88
 
89
+ # AIDEV-NOTE: Decay counter - only apply decay every N updates to prevent
90
+ # aggressive memory loss. With decay_factor=0.95 applied every update,
91
+ # n_completed floors at 1 after ~50 updates and the bandit can't learn.
92
+ self._updates_since_decay: int = 0
93
+ self._decay_interval: int = 50 # Apply decay every 50 updates
94
+
89
95
  # Load existing state if available
90
96
  if state_file and Path(state_file).exists():
91
97
  self.load()
@@ -215,13 +221,23 @@ class LLMBandit:
215
221
  return improvement
216
222
 
217
223
  def _apply_decay(self) -> None:
218
- """Apply decay to reduce influence of old observations."""
224
+ """Apply decay to reduce influence of old observations.
225
+
226
+ AIDEV-NOTE: Only applies every _decay_interval updates to prevent
227
+ aggressive memory loss. The int() truncation on n_completed was
228
+ destroying the bandit's ability to learn from failures.
229
+ """
230
+ self._updates_since_decay += 1
231
+ if self._updates_since_decay < self._decay_interval:
232
+ return
233
+
234
+ self._updates_since_decay = 0
219
235
  for stats in self.models.values():
220
- # Decay both counts and totals proportionally
236
+ # Decay totals to reduce influence of old observations
221
237
  stats.total_improvement *= self.decay_factor
222
- # Don't decay counts below a small floor to preserve some memory
223
- if stats.n_completed > 1:
224
- stats.n_completed = max(1, int(stats.n_completed * self.decay_factor))
238
+ # Decay counts but preserve enough memory to differentiate models
239
+ if stats.n_completed > 2:
240
+ stats.n_completed = max(2, int(stats.n_completed * self.decay_factor))
225
241
 
226
242
  def save(self) -> None:
227
243
  """Persist state to file."""
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "claude-evolve",
3
- "version": "1.11.16",
3
+ "version": "1.11.18",
4
4
  "bin": {
5
5
  "claude-evolve": "bin/claude-evolve",
6
6
  "claude-evolve-main": "bin/claude-evolve-main",
@@ -90,30 +90,41 @@ parallel:
90
90
  llm_cli:
91
91
  # What to run for each sub-command
92
92
  # Models are tried in order, with round-robin distribution across candidates
93
- # You can repeat models for weighted selection (e.g., "sonnet sonnet gemini" for 2:1 ratio)
93
+ # You can repeat models for weighted selection (e.g., "gemini-pro gemini-pro sonnet" for 2:1 ratio)
94
94
 
95
- # Default configuration: 50/50 split between glm-zai and kimi-coder, commercial ideation
95
+ # Default configuration: sonnet at ~11%, rest doubled for cost savings
96
96
  # Commented out because these change over time; uncomment to override
97
- #run: glm-zai kimi-coder glm-zai kimi-coder glm-zai kimi-coder codex-oss-local
98
- #ideate: opus-openrouter kimi-k2-openrouter gemini-pro sonnet-think gpt5high grok-4-openrouter deepseek-openrouter glm-zai
97
+ #run: gemini-pro gemini-pro glm-5-zai glm-5-zai kimi-coder kimi-coder gpt-5-codex gpt-5-codex sonnet
98
+ #ideate: opus-think glm-5-zai gemini-pro kimi-coder gpt-5.2 gpt-5.3-codex qwen-openrouter
99
+ #run_fallback: haiku glm-5-zai gemini-5-flash gpt-5.3-codex-spark
100
+ #ideate_fallback: haiku glm-5-zai gemini-5-flash gpt-5.3-codex-spark
99
101
 
100
102
  # Available models:
101
- # - sonnet: Claude 3.5 Sonnet via Claude CLI
102
- # - sonnet-think: Claude 3.5 Sonnet with extended thinking (ultrathink prompt)
103
- # - opus: Claude 3 Opus via Claude CLI
104
- # - opus-think: Claude 3 Opus with extended thinking (ultrathink prompt)
105
- # - gemini: Gemini via Gemini CLI
106
- # - gpt5: GPT-5 via Codex CLI (standard)
103
+ # Claude (subscription-based, watch usage limits):
104
+ # - sonnet: Claude Sonnet via Claude CLI
105
+ # - sonnet-think: Claude Sonnet with extended thinking (ultrathink prompt)
106
+ # - opus: Claude Opus via Claude CLI
107
+ # - opus-think: Claude Opus with extended thinking (ultrathink prompt)
108
+ # - haiku: Claude Haiku via Claude CLI (cheap fallback)
109
+ #
110
+ # Codex/OpenAI (subscription-based):
111
+ # - gpt-5-codex: GPT-5 Codex (code-specialized) via Codex CLI
112
+ # - gpt-5.2: GPT-5.2 via Codex CLI
113
+ # - gpt-5.3-codex: GPT-5.3 Codex via Codex CLI
114
+ # - gpt-5.3-codex-spark: GPT-5.3 Codex Spark (lightweight fallback) via Codex CLI
115
+ # - gpt5: GPT-5 via Codex CLI (legacy alias)
107
116
  # - gpt5high: GPT-5 via Codex CLI (high reasoning)
108
117
  # - o3high: O3 via Codex CLI (high reasoning)
109
- # - cursor-sonnet: Claude 3.5 Sonnet via Cursor Agent CLI
110
- # - cursor-opus: Claude 3 Opus via Cursor Agent CLI
111
- # - glm: GLM-4.6 via OpenCode CLI
112
- # - grok-code-fast: Grok Code Fast 1 via OpenRouter
113
- # - grok-4: Grok 4 via OpenRouter
114
- # - opus-openrouter: Claude Opus 4.1 via OpenRouter
115
- # - kimi-k2-openrouter: Kimi K2 Thinking via OpenRouter (RECOMMENDED - no separate auth)
116
- # - kimi-k2-think-moonshot: Kimi K2 Thinking via Moonshot CLI (requires separate kimi CLI setup)
117
- # - kimi-coder: Kimi for Coding model via kimi CLI (fast, good for code generation)
118
- # - codex-qwen3: Qwen3-Coder via Codex + Ollama (local, free, RECOMMENDED)
119
- # - aider-qwen3: Qwen3-Coder via Aider + Ollama (local, free, experimental)
118
+ #
119
+ # Gemini (free tier available):
120
+ # - gemini-pro: Gemini 3 Pro Preview via Gemini CLI
121
+ # - gemini-5-flash: Gemini 5 Flash via Gemini CLI (cheap fallback)
122
+ # - gemini-flash: Gemini 2.5 Flash via Gemini CLI (legacy)
123
+ #
124
+ # Other free/cheap models:
125
+ # - glm-5-zai: GLM-5 via Z.AI agentic mode
126
+ # - kimi-coder: Kimi for Coding via kimi CLI (fast, good for code gen)
127
+ # - kimi-k2-openrouter: Kimi K2 Thinking via OpenRouter
128
+ # - codex-oss-local: Local model via Codex + Ollama
129
+ # - cursor-sonnet: Claude Sonnet via Cursor Agent CLI
130
+ # - cursor-opus: Claude Opus via Cursor Agent CLI
Binary file