claude-evolve 1.12.0 → 1.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/README.md +0 -0
  2. package/lib/__pycache__/ai_cli.cpython-311.pyc +0 -0
  3. package/lib/__pycache__/ai_cli.cpython-314.pyc +0 -0
  4. package/lib/__pycache__/embedding.cpython-314.pyc +0 -0
  5. package/lib/__pycache__/evolution_csv.cpython-311.pyc +0 -0
  6. package/lib/__pycache__/evolution_csv.cpython-313.pyc +0 -0
  7. package/lib/__pycache__/evolution_csv.cpython-314.pyc +0 -0
  8. package/lib/__pycache__/evolve_ideate.cpython-314.pyc +0 -0
  9. package/lib/__pycache__/evolve_run.cpython-311.pyc +0 -0
  10. package/lib/__pycache__/evolve_run.cpython-314.pyc +0 -0
  11. package/lib/__pycache__/evolve_worker.cpython-314.pyc +0 -0
  12. package/lib/__pycache__/llm_bandit.cpython-314.pyc +0 -0
  13. package/lib/__pycache__/log.cpython-311.pyc +0 -0
  14. package/lib/__pycache__/log.cpython-314.pyc +0 -0
  15. package/lib/__pycache__/meta_learning.cpython-314.pyc +0 -0
  16. package/lib/__pycache__/sandbox_wrapper.cpython-314.pyc +0 -0
  17. package/lib/ai-cli.sh +14 -12
  18. package/lib/ai_cli.py +64 -34
  19. package/lib/config.py +0 -0
  20. package/lib/config.sh +24 -18
  21. package/lib/csv-lock.sh +0 -0
  22. package/lib/editor.sh +0 -0
  23. package/lib/evolution_csv.py +0 -0
  24. package/lib/evolution_processor.py +0 -0
  25. package/lib/evolve_ideate.py +0 -0
  26. package/lib/evolve_worker.py +114 -43
  27. package/lib/llm_bandit.py +0 -0
  28. package/lib/log.py +0 -0
  29. package/lib/meta_learning.py +0 -0
  30. package/lib/sandbox.sb +0 -0
  31. package/lib/sandbox_wrapper.py +0 -0
  32. package/package.json +1 -1
  33. package/templates/BRIEF.md +0 -0
  34. package/templates/algorithm.py +0 -0
  35. package/templates/config.yaml +39 -37
  36. package/templates/evaluator.py +0 -0
  37. package/lib/__pycache__/ai_cli.cpython-310.pyc +0 -0
  38. package/lib/__pycache__/embedding.cpython-310.pyc +0 -0
  39. package/lib/__pycache__/evolution_csv.cpython-310.pyc +0 -0
  40. package/lib/__pycache__/evolve_ideate.cpython-310.pyc +0 -0
  41. package/lib/__pycache__/log.cpython-310.pyc +0 -0
package/README.md CHANGED
File without changes
package/lib/ai-cli.sh CHANGED
@@ -53,7 +53,7 @@ EOF
53
53
  call_ai_model_configured() {
54
54
  local model_name="$1"
55
55
  local prompt="$2"
56
- local codex_gpt_model="${CODEX_GPT_MODEL:-${CODEX_GPT5_MODEL:-gpt-5.2}}"
56
+ local codex_gpt_model="${CODEX_GPT_MODEL:-${CODEX_GPT5_MODEL:-gpt-5.5}}"
57
57
 
58
58
  # Record start time
59
59
  local start_time=$(date +%s)
@@ -96,12 +96,12 @@ $prompt"
96
96
  ;;
97
97
  opus-openrouter)
98
98
  local ai_output
99
- ai_output=$(opencode -m openrouter/anthropic/claude-opus-4.1 run "$prompt" 2>&1)
99
+ ai_output=$(opencode -m openrouter/anthropic/claude-opus-4.7 run "$prompt" 2>&1)
100
100
  local ai_exit_code=$?
101
101
  ;;
102
102
  cursor-sonnet)
103
103
  local ai_output
104
- ai_output=$(cursor-agent sonnet-4.5 -p "$prompt" 2>&1)
104
+ ai_output=$(cursor-agent sonnet-4.6 -p "$prompt" 2>&1)
105
105
  local ai_exit_code=$?
106
106
  ;;
107
107
  cursor-opus)
@@ -123,7 +123,7 @@ $prompt"
123
123
  codex-think)
124
124
  local ai_output
125
125
  # High reasoning - for ideation tasks requiring deep thinking
126
- ai_output=$(codex exec -m gpt-5.4 -c model_reasoning_effort="high" --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
126
+ ai_output=$(codex exec -m gpt-5.5 -c model_reasoning_effort="high" --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
127
127
  local ai_exit_code=$?
128
128
  ;;
129
129
  codex-coding)
@@ -135,7 +135,7 @@ $prompt"
135
135
  codex-spark)
136
136
  local ai_output
137
137
  # Cheap/fast lightweight fallback
138
- ai_output=$(codex exec -m gpt-5.1-codex-mini --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
138
+ ai_output=$(codex exec -m gpt-5.4-mini --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
139
139
  local ai_exit_code=$?
140
140
  ;;
141
141
  # --- Gemini (subscription) ---
@@ -258,7 +258,7 @@ $prompt"
258
258
  ;;
259
259
  ollama-qwen)
260
260
  local ai_output
261
- ai_output=$(codex exec --dangerously-bypass-approvals-and-sandbox --skip-git-repo-check --oss --local-provider=ollama -m qwen3.5:cloud "$prompt" 2>&1)
261
+ ai_output=$(codex exec --dangerously-bypass-approvals-and-sandbox --skip-git-repo-check --oss --local-provider=ollama -m qwen3.6:cloud "$prompt" 2>&1)
262
262
  local ai_exit_code=$?
263
263
  ;;
264
264
  # --- Local inference ---
@@ -365,19 +365,21 @@ get_models_for_command() {
365
365
  echo "$model_list"
366
366
  }
367
367
 
368
- # Get fallback models for a specific command (run or ideate)
369
- # Usage: get_fallback_models_for_command <command>
370
- # Returns: Space-separated list of fallback model names
371
- get_fallback_models_for_command() {
368
+ # Get escalation models for a specific command (run or ideate)
369
+ # AIDEV-NOTE: Escalation models are big/commercial models used only when
370
+ # cheap primary models produce code with syntax or validation errors.
371
+ # Usage: get_escalation_models_for_command <command>
372
+ # Returns: Space-separated list of escalation model names
373
+ get_escalation_models_for_command() {
372
374
  local command="$1"
373
375
  local model_list=""
374
376
 
375
377
  case "$command" in
376
378
  run)
377
- model_list="$LLM_RUN_FALLBACK"
379
+ model_list="$LLM_RUN_ESCALATION"
378
380
  ;;
379
381
  ideate)
380
- model_list="$LLM_IDEATE_FALLBACK"
382
+ model_list="$LLM_IDEATE_ESCALATION"
381
383
  ;;
382
384
  *)
383
385
  echo "[ERROR] Unknown command: $command" >&2
package/lib/ai_cli.py CHANGED
@@ -202,22 +202,26 @@ def get_models_for_command(command: str) -> List[str]:
202
202
  return model_list.split()
203
203
 
204
204
 
205
- def get_fallback_models_for_command(command: str) -> List[str]:
205
+ def get_escalation_models_for_command(command: str) -> List[str]:
206
206
  """
207
- Get the list of fallback models for a command.
207
+ Get the list of escalation models for a command.
208
+
209
+ AIDEV-NOTE: Escalation models are big/commercial models (sonnet, codex, gemini)
210
+ used only when cheap primary models produce code with syntax or validation errors.
211
+ They are NOT for API-down situations — if primary is down, fail fast.
208
212
 
209
213
  Args:
210
214
  command: Either "run" or "ideate"
211
215
 
212
216
  Returns:
213
- List of fallback model names
217
+ List of escalation model names
214
218
  """
215
219
  bash_script = f'''
216
220
  source "{SCRIPT_DIR}/config.sh"
217
221
  load_config
218
222
  case "$1" in
219
- run) echo "$LLM_RUN_FALLBACK" ;;
220
- ideate) echo "$LLM_IDEATE_FALLBACK" ;;
223
+ run) echo "$LLM_RUN_ESCALATION" ;;
224
+ ideate) echo "$LLM_IDEATE_ESCALATION" ;;
221
225
  esac
222
226
  '''
223
227
 
@@ -423,34 +427,30 @@ def call_ai_with_backoff(
423
427
  env_vars: Optional[dict] = None,
424
428
  max_rounds: int = 10,
425
429
  initial_wait: int = 60,
426
- max_wait: int = 600,
427
- use_fallback: bool = True
430
+ max_wait: int = 600
428
431
  ) -> Tuple[str, str]:
429
432
  """
430
- Call AI with tiered fallback and round-based retries with exponential backoff.
433
+ Call AI with round-based retries and exponential backoff.
431
434
 
432
- AIDEV-NOTE: Tiered fallback system:
433
- 1. First tries all primary models with backoff
434
- 2. If primary exhausted and use_fallback=True, tries fallback models
435
- 3. Fallback models are cheaper/simpler backups (haiku, flash, etc.)
435
+ AIDEV-NOTE: No fallback tier — if all primary models are down, fail fast
436
+ so monitoring catches it. Escalation to big models is handled separately
437
+ by evolve_worker when code quality issues are detected.
436
438
 
437
439
  Args:
438
440
  prompt: The prompt to send
439
441
  command: "run" or "ideate" - determines model pool
440
442
  working_dir: Directory for file operations
441
443
  env_vars: Additional environment variables
442
- max_rounds: Maximum number of full rounds per tier
444
+ max_rounds: Maximum number of full rounds
443
445
  initial_wait: Initial wait time in seconds after first failed round
444
446
  max_wait: Maximum wait time in seconds between rounds
445
- use_fallback: Whether to try fallback tier if primary fails
446
447
 
447
448
  Returns:
448
449
  Tuple of (output, model_name)
449
450
 
450
451
  Raises:
451
- AIError: If all tiers exhausted without success
452
+ AIError: If all models exhausted without success
452
453
  """
453
- # Try primary tier first
454
454
  primary_models = get_models_for_command(command)
455
455
  if not primary_models:
456
456
  raise AIError(f"No primary models configured for command: {command}")
@@ -463,28 +463,58 @@ def call_ai_with_backoff(
463
463
  if output is not None:
464
464
  return output, model_name
465
465
 
466
- # Primary exhausted - try fallback if enabled
467
- if use_fallback:
468
- fallback_models = get_fallback_models_for_command(command)
469
- if fallback_models:
470
- _log(f"Primary tier exhausted, trying {len(fallback_models)} fallback models...")
466
+ # All models exhausted — fail fast, let monitoring catch it
467
+ error_summary = "; ".join(f"{m}: {e[:50]}" for m, e in list(primary_errors.items())[:3])
468
+ raise AIError(f"All primary models exhausted ({max_rounds} rounds). Last errors: {error_summary}")
471
469
 
472
- output, model_name, fallback_errors = _try_models_with_backoff(
473
- prompt, fallback_models, "Fallback",
474
- working_dir, env_vars, max_rounds, initial_wait, max_wait
475
- )
476
470
 
477
- if output is not None:
478
- return output, model_name
471
+ def call_ai_escalation(
472
+ prompt: str,
473
+ command: str = "run",
474
+ working_dir: Optional[str] = None,
475
+ env_vars: Optional[dict] = None
476
+ ) -> Tuple[str, str]:
477
+ """
478
+ Try each escalation model once (no backoff). Used when cheap models produced
479
+ bad code (syntax/validation errors) and we need a smarter model to fix it.
479
480
 
480
- # Both tiers exhausted
481
- all_errors = {**primary_errors, **fallback_errors}
482
- error_summary = "; ".join(f"{m}: {e[:50]}" for m, e in list(all_errors.items())[:3])
483
- raise AIError(f"All tiers exhausted ({max_rounds} rounds each). Last errors: {error_summary}")
481
+ AIDEV-NOTE: Quality-triggered escalation. Each model gets one shot.
482
+ If all escalation models fail (rate limits etc), raise AIError — the candidate
483
+ will be marked failed-ai-retry so it can be retried when limits reset.
484
484
 
485
- # Primary exhausted, no fallback
486
- error_summary = "; ".join(f"{m}: {e[:50]}" for m, e in list(primary_errors.items())[:3])
487
- raise AIError(f"Primary tier exhausted ({max_rounds} rounds). Last errors: {error_summary}")
485
+ Args:
486
+ prompt: The prompt to send (should include error context)
487
+ command: "run" or "ideate" - determines escalation model pool
488
+ working_dir: Directory for file operations
489
+ env_vars: Additional environment variables
490
+
491
+ Returns:
492
+ Tuple of (output, model_name)
493
+
494
+ Raises:
495
+ AIError: If no escalation models available or all failed
496
+ """
497
+ escalation_models = get_escalation_models_for_command(command)
498
+ if not escalation_models:
499
+ raise AIError(f"No escalation models configured for command: {command}")
500
+
501
+ # Shuffle so we don't always burn the same model first
502
+ models = escalation_models.copy()
503
+ random.shuffle(models)
504
+
505
+ last_errors = {}
506
+ for model in models:
507
+ try:
508
+ _log(f"Escalation: trying {model}...")
509
+ output, model_name = call_ai_model(prompt, model, working_dir, env_vars)
510
+ _log(f"Escalation succeeded with {model}")
511
+ return output, model_name
512
+ except AIError as e:
513
+ _log(f"Escalation {model} failed: {str(e)[:60]}...")
514
+ last_errors[model] = str(e)
515
+
516
+ error_summary = "; ".join(f"{m}: {e[:50]}" for m, e in last_errors.items())
517
+ raise AIError(f"All escalation models failed. Errors: {error_summary}")
488
518
 
489
519
 
490
520
  def call_ai_for_file_edit(
package/lib/config.py CHANGED
File without changes
package/lib/config.sh CHANGED
@@ -57,20 +57,22 @@ DEFAULT_MEMORY_LIMIT_MB=12288
57
57
  # Workers will exit after processing this many candidates to pick up library updates
58
58
  DEFAULT_WORKER_MAX_CANDIDATES=3
59
59
 
60
- # Default LLM CLI configuration - tiered fallback system
61
- # Primary: Strong models used in normal operation
62
- # Fallback: Cheap/backup models used only when primary tier exhausted
60
+ # Default LLM CLI configuration - quality-triggered escalation system
61
+ # AIDEV-NOTE: Two-tier design for coding (run):
62
+ # Primary: Cheap/open models handle normal code generation
63
+ # Escalation: Big commercial models only activated on syntax/validation failure
64
+ # Ideation keeps its own primary list (thinking models for creative work)
65
+ # No fallback tier — if all models are down, fail fast so monitoring catches it.
63
66
  #
64
- # Run: Subscription-based agentic models for code generation
65
- # All CLI tools (opencode, claude, gemini, kimi) are agentic and can edit files
66
- # Ollama cloud models are flat-rate (subscription), so prefer them over per-token OpenRouter
67
- DEFAULT_LLM_RUN="gemini-pro gemini-pro ollama-glm ollama-glm ollama-qwen ollama-qwen ollama-minimax ollama-minimax ollama-gemma ollama-gemma kimi-coder kimi-coder codex-coding codex-coding glm-zai qwen-coder minimax sonnet"
68
- DEFAULT_LLM_RUN_FALLBACK="haiku ollama-glm ollama-gemma ollama-minimax ollama-qwen glm-zai gemini-cheap codex-spark qwen"
67
+ # Run: Cheap/open models for code generation (flat-rate or low-cost)
68
+ DEFAULT_LLM_RUN="ollama-glm ollama-glm ollama-qwen ollama-qwen ollama-minimax ollama-minimax ollama-gemma ollama-gemma kimi-coder kimi-coder glm-zai qwen-coder minimax"
69
+ # Run escalation: Big models activated only when cheap models produce bad code
70
+ DEFAULT_LLM_RUN_ESCALATION="sonnet codex-coding gemini-pro"
69
71
  #
70
- # Ideate: Agentic models that can edit files for ideation
71
- # All CLI tools (opencode, claude, gemini, kimi) are agentic and can edit files
72
+ # Ideate: Strong models for creative ideation
72
73
  DEFAULT_LLM_IDEATE="opus-think ollama-glm ollama-glm gemini-pro ollama-qwen ollama-minimax ollama-gemma kimi-coder gpt codex-think glm-zai qwen-coder minimax qwen"
73
- DEFAULT_LLM_IDEATE_FALLBACK="haiku ollama-glm ollama-gemma ollama-minimax ollama-qwen glm-zai gemini-cheap codex-spark qwen"
74
+ # Ideate escalation: not currently used but available for future use
75
+ DEFAULT_LLM_IDEATE_ESCALATION=""
74
76
 
75
77
  # Load configuration from a YAML file and update variables
76
78
  _load_yaml_config() {
@@ -147,12 +149,16 @@ _load_yaml_config() {
147
149
  lock_timeout) LOCK_TIMEOUT="$value" ;;
148
150
  esac
149
151
  elif [[ $in_llm_cli_section == true ]]; then
150
- if [[ $key == "run" || $key == "ideate" || $key == "run_fallback" || $key == "ideate_fallback" ]]; then
152
+ if [[ $key == "run" || $key == "ideate" || $key == "run_escalation" || $key == "ideate_escalation" || $key == "run_fallback" || $key == "ideate_fallback" ]]; then
151
153
  case $key in
152
154
  run) LLM_RUN="$value" ;;
153
- run_fallback) LLM_RUN_FALLBACK="$value" ;;
155
+ run_escalation) LLM_RUN_ESCALATION="$value" ;;
156
+ # Legacy fallback keys map to escalation for backward compatibility
157
+ run_fallback) LLM_RUN_ESCALATION="$value" ;;
154
158
  ideate) LLM_IDEATE="$value" ;;
155
- ideate_fallback) LLM_IDEATE_FALLBACK="$value" ;;
159
+ ideate_escalation) LLM_IDEATE_ESCALATION="$value" ;;
160
+ # Legacy fallback keys map to escalation for backward compatibility
161
+ ideate_fallback) LLM_IDEATE_ESCALATION="$value" ;;
156
162
  esac
157
163
  else
158
164
  value=$(echo "$value" | sed "s/^'//;s/'$//")
@@ -223,9 +229,9 @@ load_config() {
223
229
  WORKER_MAX_CANDIDATES="$DEFAULT_WORKER_MAX_CANDIDATES"
224
230
 
225
231
  LLM_RUN="$DEFAULT_LLM_RUN"
226
- LLM_RUN_FALLBACK="$DEFAULT_LLM_RUN_FALLBACK"
232
+ LLM_RUN_ESCALATION="$DEFAULT_LLM_RUN_ESCALATION"
227
233
  LLM_IDEATE="$DEFAULT_LLM_IDEATE"
228
- LLM_IDEATE_FALLBACK="$DEFAULT_LLM_IDEATE_FALLBACK"
234
+ LLM_IDEATE_ESCALATION="$DEFAULT_LLM_IDEATE_ESCALATION"
229
235
 
230
236
  # Determine local config file path relative to EVOLUTION_DIR
231
237
  local local_config_file="$EVOLUTION_DIR/config.yaml"
@@ -318,7 +324,7 @@ show_config() {
318
324
  echo " Memory limit: ${MEMORY_LIMIT_MB}MB"
319
325
  echo " Worker max candidates: $WORKER_MAX_CANDIDATES"
320
326
  echo " LLM for run: $LLM_RUN"
321
- echo " LLM for run (fallback): $LLM_RUN_FALLBACK"
327
+ echo " LLM for run (escalation): $LLM_RUN_ESCALATION"
322
328
  echo " LLM for ideate: $LLM_IDEATE"
323
- echo " LLM for ideate (fallback): $LLM_IDEATE_FALLBACK"
329
+ echo " LLM for ideate (escalation): $LLM_IDEATE_ESCALATION"
324
330
  }
package/lib/csv-lock.sh CHANGED
File without changes
package/lib/editor.sh CHANGED
File without changes
File without changes
File without changes
File without changes
@@ -35,8 +35,8 @@ set_prefix("WORKER")
35
35
 
36
36
  from lib.evolution_csv import EvolutionCSV
37
37
  from lib.ai_cli import (
38
- call_ai_with_backoff, call_ai_model, get_models_for_command,
39
- get_git_protection_warning, AIError
38
+ call_ai_with_backoff, call_ai_escalation, call_ai_model,
39
+ get_models_for_command, get_git_protection_warning, AIError
40
40
  )
41
41
  from lib.llm_bandit import LLMBandit
42
42
 
@@ -170,11 +170,12 @@ CRITICAL: If you do not know how to implement what was asked for, or if the requ
170
170
 
171
171
  def _call_ai_with_backoff(self, prompt: str, target_file: Path) -> Tuple[bool, str]:
172
172
  """
173
- Call AI with bandit-based model selection and fallback.
173
+ Call AI with bandit-based model selection from the primary (cheap) tier.
174
174
 
175
175
  AIDEV-NOTE: First tries model selected by UCB bandit.
176
- If that fails, falls back to round-robin retry approach.
177
- The bandit learns which models produce better algorithm improvements.
176
+ If that fails, retries with round-robin across all primary models.
177
+ The bandit learns which cheap models produce better algorithm improvements.
178
+ Escalation to big models is handled separately on quality failures.
178
179
 
179
180
  Returns:
180
181
  Tuple of (success, model_name)
@@ -203,14 +204,14 @@ CRITICAL: If you do not know how to implement what was asked for, or if the requ
203
204
  else:
204
205
  # AIDEV-NOTE: Log output so we can diagnose why file wasn't modified
205
206
  preview = output[-300:] if output else "(empty)"
206
- log(f"Bandit model {selected_model} completed but didn't modify file ({len(output)} chars), trying fallback...")
207
+ log(f"Bandit model {selected_model} completed but didn't modify file ({len(output)} chars), trying other primary models...")
207
208
  log(f"AI output preview: {preview}")
208
209
  # AIDEV-NOTE: Report no-modification as failure to bandit
209
210
  self.bandit.update(selected_model, child_score=None, parent_score=self._parent_score)
210
211
  log(f"Bandit update: {selected_model} no file modification")
211
212
 
212
213
  except AIError as e:
213
- log(f"Bandit model {selected_model} failed: {e}, trying fallback...")
214
+ log(f"Bandit model {selected_model} failed: {e}, trying other primary models...")
214
215
  # AIDEV-NOTE: Report AI-level failure to bandit so it learns to avoid broken models
215
216
  self.bandit.update(selected_model, child_score=None, parent_score=self._parent_score)
216
217
  log(f"Bandit update: {selected_model} AI call failed")
@@ -244,6 +245,42 @@ CRITICAL: If you do not know how to implement what was asked for, or if the requ
244
245
  log_error(f"All AI retries exhausted: {e}")
245
246
  return False, ""
246
247
 
248
+ def _call_ai_escalated(self, prompt: str, target_file: Path) -> Tuple[bool, str]:
249
+ """
250
+ Call escalation-tier AI (big commercial models) to fix code quality issues.
251
+
252
+ AIDEV-NOTE: Quality-triggered escalation. Only called when cheap primary models
253
+ produced code with syntax or validation errors. Each escalation model gets one
254
+ shot — no backoff loops. If all escalation models fail, the candidate should be
255
+ marked failed-ai-retry (API limits) not failed-validation (bad idea).
256
+
257
+ Returns:
258
+ Tuple of (success, model_name)
259
+ """
260
+ hash_before = self._file_hash(target_file) if target_file.exists() else None
261
+
262
+ try:
263
+ output, model = call_ai_escalation(
264
+ prompt,
265
+ command="run",
266
+ working_dir=self.config.evolution_dir
267
+ )
268
+
269
+ hash_after = self._file_hash(target_file) if target_file.exists() else None
270
+
271
+ if hash_before != hash_after and hash_after is not None:
272
+ log(f"Escalation AI successfully modified file (model: {model})")
273
+ return True, model
274
+ else:
275
+ preview = output[-300:] if output else "(empty)"
276
+ log(f"Escalation AI completed but did not modify file ({len(output)} chars)")
277
+ log(f"AI output preview: {preview}")
278
+ return False, model
279
+
280
+ except AIError as e:
281
+ log_error(f"All escalation models failed: {e}")
282
+ return False, ""
283
+
247
284
  def _file_hash(self, path: Path) -> Optional[str]:
248
285
  """Get file hash."""
249
286
  try:
@@ -550,59 +587,93 @@ python validator.py {target_basename}
550
587
  with EvolutionCSV(self.config.csv_path) as csv:
551
588
  csv.update_candidate_field(candidate.id, 'run-LLM', model)
552
589
 
590
+ # AIDEV-NOTE: Quality-triggered escalation system.
591
+ # Phase 1: Check syntax from cheap model output
592
+ # Phase 2: If syntax fails, escalate to big model with error context
593
+ # Phase 3: Validate, if fails escalate to big model with error context
594
+ # If escalation models also fail (API limits), mark failed-ai-retry.
595
+ # If escalation models produce code but it's still bad, mark failed-validation.
596
+
553
597
  # Check syntax
554
598
  if not self._check_syntax(target_file):
555
- log_error("Syntax error in generated file")
556
- target_file.unlink(missing_ok=True)
557
- with EvolutionCSV(self.config.csv_path) as csv:
558
- csv.update_candidate_status(candidate.id, 'pending')
559
- return 0 # Will retry
599
+ log("Syntax error from primary model, escalating to big model...")
600
+ # Get the syntax error details for context
601
+ syntax_result = subprocess.run(
602
+ [self.config.python_cmd, "-m", "py_compile", str(target_file)],
603
+ capture_output=True, text=True
604
+ )
605
+ syntax_error = syntax_result.stderr.strip()
560
606
 
561
- # Run validator with retry loop
562
- # AIDEV-NOTE: Validator catches structural errors before expensive full evaluation.
563
- # If validation fails, we give the AI feedback and ask it to fix the code.
564
- validation_passed = False
565
- for validation_attempt in range(self.config.max_validation_retries + 1):
566
- valid, error_info = self._run_validator(candidate.id)
607
+ fix_prompt = self._build_fix_prompt(
608
+ candidate, target_file.name,
609
+ {'error_type': 'syntax', 'error': syntax_error}
610
+ )
611
+ success, fix_model = self._call_ai_escalated(fix_prompt, target_file)
567
612
 
568
- if valid:
569
- validation_passed = True
570
- break
613
+ if not success:
614
+ log_error("Escalation models failed to fix syntax error")
615
+ target_file.unlink(missing_ok=True)
616
+ with EvolutionCSV(self.config.csv_path) as csv:
617
+ csv.update_candidate_status(candidate.id, 'failed-ai-retry')
618
+ return 77
571
619
 
572
- if validation_attempt >= self.config.max_validation_retries:
573
- log_error(f"Validation failed after {self.config.max_validation_retries} fix attempts")
574
- break
620
+ # Record escalation model
621
+ if fix_model:
622
+ with EvolutionCSV(self.config.csv_path) as csv:
623
+ current_llm = csv.get_candidate_info(candidate.id).get('run-LLM', '')
624
+ new_llm = f"{current_llm}+ESC:{fix_model}" if current_llm else f"ESC:{fix_model}"
625
+ csv.update_candidate_field(candidate.id, 'run-LLM', new_llm)
626
+
627
+ # Re-check syntax after escalation fix
628
+ if not self._check_syntax(target_file):
629
+ log_error("Escalation model also produced syntax error — idea too hard")
630
+ target_file.unlink(missing_ok=True)
631
+ with EvolutionCSV(self.config.csv_path) as csv:
632
+ csv.update_candidate_status(candidate.id, 'failed-validation')
633
+ return 1
634
+
635
+ # Run validator with escalation on failure
636
+ # AIDEV-NOTE: Validator catches structural errors before expensive full evaluation.
637
+ # First attempt uses the code as-is from primary model. On failure, escalate once.
638
+ valid, error_info = self._run_validator(candidate.id)
575
639
 
576
- # Ask AI to fix the validation error
577
- log(f"Validation failed (attempt {validation_attempt + 1}), asking AI to fix...")
640
+ if not valid:
641
+ log("Validation failed from primary model, escalating to big model...")
578
642
  fix_prompt = self._build_fix_prompt(candidate, target_file.name, error_info)
579
- success, fix_model = self._call_ai_with_backoff(fix_prompt, target_file)
643
+ success, fix_model = self._call_ai_escalated(fix_prompt, target_file)
580
644
 
581
645
  if not success:
582
- log_error("AI failed to fix validation error")
583
- break
646
+ log_error("Escalation models failed to fix validation error")
647
+ target_file.unlink(missing_ok=True)
648
+ with EvolutionCSV(self.config.csv_path) as csv:
649
+ csv.update_candidate_status(candidate.id, 'failed-ai-retry')
650
+ return 77
584
651
 
585
- # Record that we used an additional model call for fixing
652
+ # Record escalation model
586
653
  if fix_model:
587
654
  with EvolutionCSV(self.config.csv_path) as csv:
588
655
  current_llm = csv.get_candidate_info(candidate.id).get('run-LLM', '')
589
- new_llm = f"{current_llm}+{fix_model}" if current_llm else fix_model
656
+ new_llm = f"{current_llm}+ESC:{fix_model}" if current_llm else f"ESC:{fix_model}"
590
657
  csv.update_candidate_field(candidate.id, 'run-LLM', new_llm)
591
658
 
592
- # Re-check syntax after fix
659
+ # Check syntax after escalation fix (escalation might break it)
593
660
  if not self._check_syntax(target_file):
594
- log_error("Fix introduced syntax error")
595
- # Don't break - try again if we have retries left
661
+ log_error("Escalation fix introduced syntax error")
662
+ target_file.unlink(missing_ok=True)
663
+ with EvolutionCSV(self.config.csv_path) as csv:
664
+ csv.update_candidate_status(candidate.id, 'failed-validation')
665
+ return 1
596
666
 
597
- if not validation_passed:
598
- # Validation failed after all retries
599
- with EvolutionCSV(self.config.csv_path) as csv:
600
- csv.update_candidate_status(candidate.id, 'failed-validation')
601
- # Store the last error for debugging
602
- if error_info:
603
- error_summary = f"{error_info.get('error_type', 'unknown')}: {error_info.get('error', '')[:100]}"
604
- csv.update_candidate_field(candidate.id, 'validation_error', error_summary)
605
- return 1
667
+ # Re-validate after escalation fix
668
+ valid, error_info = self._run_validator(candidate.id)
669
+ if not valid:
670
+ log_error("Validation still fails after escalation — idea too hard")
671
+ with EvolutionCSV(self.config.csv_path) as csv:
672
+ csv.update_candidate_status(candidate.id, 'failed-validation')
673
+ if error_info:
674
+ error_summary = f"{error_info.get('error_type', 'unknown')}: {error_info.get('error', '')[:100]}"
675
+ csv.update_candidate_field(candidate.id, 'validation_error', error_summary)
676
+ return 1
606
677
 
607
678
  # Run evaluator
608
679
  log("Running evaluator...")
package/lib/llm_bandit.py CHANGED
File without changes
package/lib/log.py CHANGED
File without changes
File without changes
package/lib/sandbox.sb CHANGED
File without changes
File without changes
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "claude-evolve",
3
- "version": "1.12.0",
3
+ "version": "1.14.0",
4
4
  "bin": {
5
5
  "claude-evolve": "bin/claude-evolve",
6
6
  "claude-evolve-main": "bin/claude-evolve-main",
File without changes
File without changes
@@ -86,47 +86,49 @@ parallel:
86
86
  # Timeout in seconds when waiting for CSV locks
87
87
  lock_timeout: 30
88
88
 
89
- # LLM/AI CLI configuration
89
+ # LLM/AI CLI configuration — quality-triggered escalation system
90
+ # AIDEV-NOTE: Two tiers for coding:
91
+ # Primary (run): Cheap/open models for normal code generation
92
+ # Escalation (run_escalation): Big models activated ONLY on syntax/validation failure
93
+ # No fallback tier — if all primary models are down, fail fast so monitoring catches it.
90
94
  llm_cli:
91
- # What to run for each sub-command
92
- # Models are tried in order, with round-robin distribution across candidates
93
- # You can repeat models for weighted selection (e.g., "gemini-pro gemini-pro sonnet" for 2:1 ratio)
95
+ # Models are selected randomly, repeat for weighted selection (e.g., "model model" = 2x weight)
96
+ # Commented out because defaults change over time; uncomment to override
94
97
 
95
- # Default configuration: sonnet at ~11%, rest doubled for cost savings
96
- # Commented out because these change over time; uncomment to override
97
- #run: gemini-pro gemini-pro glm-5-zai glm-5-zai kimi-coder kimi-coder codex-coding codex-coding sonnet
98
- #ideate: opus-think glm-5-zai gemini-pro kimi-coder gpt-5.2 codex-think qwen-openrouter
99
- #run_fallback: haiku glm-5-zai gemini-5-flash codex-spark
100
- #ideate_fallback: haiku glm-5-zai gemini-5-flash codex-spark
98
+ # Coding: cheap/open models (flat-rate or low per-token cost)
99
+ #run: ollama-glm ollama-glm ollama-qwen ollama-qwen ollama-minimax ollama-minimax ollama-gemma ollama-gemma kimi-coder kimi-coder glm-zai qwen-coder minimax
100
+
101
+ # Coding escalation: big commercial models, only used when primary produces bad code
102
+ #run_escalation: sonnet codex-coding gemini-pro
103
+
104
+ # Ideation: strong thinking models for creative work
105
+ #ideate: opus-think ollama-glm ollama-glm gemini-pro ollama-qwen ollama-minimax ollama-gemma kimi-coder gpt codex-think glm-zai qwen-coder minimax qwen
106
+
107
+ # Ideation escalation: not currently used
108
+ #ideate_escalation:
101
109
 
102
110
  # Available models:
103
- # Claude (subscription-based, watch usage limits):
104
- # - sonnet: Claude Sonnet via Claude CLI
105
- # - sonnet-think: Claude Sonnet with extended thinking (ultrathink prompt)
106
- # - opus: Claude Opus via Claude CLI
107
- # - opus-think: Claude Opus with extended thinking (ultrathink prompt)
108
- # - haiku: Claude Haiku via Claude CLI (cheap fallback)
109
111
  #
110
- # Codex/OpenAI (subscription-based):
111
- # - codex-think: GPT-5.4 high reasoning effort (ideation)
112
- # - codex-coding: GPT-5.4 medium reasoning effort (coding/run)
113
- # - codex-spark: GPT-5.1 Codex Mini (lightweight fallback)
114
- # - gpt-5.4: GPT-5.4 no reasoning effort override via Codex CLI
115
- # - gpt-5.2: GPT-5.2 via Codex CLI
116
- # - gpt-5.3-codex: GPT-5.3 Codex (code-specialized) via Codex CLI
117
- # - gpt5: GPT-5 via Codex CLI (legacy alias)
118
- # - gpt5high: GPT-5 via Codex CLI (high reasoning)
119
- # - o3high: O3 via Codex CLI (high reasoning)
112
+ # --- Cheap/open (primary coding tier) ---
113
+ # Ollama cloud (flat-rate $20/mo subscription):
114
+ # - ollama-glm: GLM 5.1 via Ollama cloud
115
+ # - ollama-qwen: Qwen 3.6 via Ollama cloud
116
+ # - ollama-minimax: MiniMax M2.7 via Ollama cloud
117
+ # - ollama-gemma: Gemma 4 31B via Ollama cloud
118
+ #
119
+ # Low-cost APIs:
120
+ # - kimi-coder: Kimi for Coding via kimi CLI (subscription)
121
+ # - glm-zai: GLM 5 via Z.AI agentic mode
122
+ # - qwen-coder: Qwen 3 Coder via OpenRouter
123
+ # - minimax: MiniMax M2.7 via OpenRouter
124
+ # - qwen: Qwen 3.6 Plus via OpenRouter
120
125
  #
121
- # Gemini (free tier available):
122
- # - gemini-pro: Gemini 3 Pro Preview via Gemini CLI
123
- # - gemini-5-flash: Gemini 5 Flash via Gemini CLI (cheap fallback)
124
- # - gemini-flash: Gemini 2.5 Flash via Gemini CLI (legacy)
126
+ # --- Big commercial (escalation tier) ---
127
+ # - sonnet: Claude Sonnet via Claude CLI
128
+ # - codex-coding: GPT-5.4 medium reasoning via Codex CLI
129
+ # - gemini-pro: Gemini 3 Pro via Gemini CLI
125
130
  #
126
- # Other free/cheap models:
127
- # - glm-5-zai: GLM-5 via Z.AI agentic mode
128
- # - kimi-coder: Kimi for Coding via kimi CLI (fast, good for code gen)
129
- # - kimi-k2-openrouter: Kimi K2 Thinking via OpenRouter
130
- # - codex-oss-local: Local model via Codex + Ollama
131
- # - cursor-sonnet: Claude Sonnet via Cursor Agent CLI
132
- # - cursor-opus: Claude Opus via Cursor Agent CLI
131
+ # --- Ideation models ---
132
+ # - opus-think: Claude Opus with extended thinking
133
+ # - codex-think: GPT-5.5 high reasoning effort
134
+ # - gpt: GPT-5.5 via Codex CLI
File without changes
Binary file