npm - claude-evolve - Versions diffs - 1.13.0 → 1.14.0 - Mend

claude-evolve 1.13.0 → 1.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41)

package/README.md +0 -0
package/lib/__pycache__/ai_cli.cpython-311.pyc +0 -0
package/lib/__pycache__/ai_cli.cpython-314.pyc +0 -0
package/lib/__pycache__/embedding.cpython-314.pyc +0 -0
package/lib/__pycache__/evolution_csv.cpython-311.pyc +0 -0
package/lib/__pycache__/evolution_csv.cpython-313.pyc +0 -0
package/lib/__pycache__/evolution_csv.cpython-314.pyc +0 -0
package/lib/__pycache__/evolve_ideate.cpython-314.pyc +0 -0
package/lib/__pycache__/evolve_run.cpython-311.pyc +0 -0
package/lib/__pycache__/evolve_run.cpython-314.pyc +0 -0
package/lib/__pycache__/evolve_worker.cpython-314.pyc +0 -0
package/lib/__pycache__/llm_bandit.cpython-314.pyc +0 -0
package/lib/__pycache__/log.cpython-311.pyc +0 -0
package/lib/__pycache__/log.cpython-314.pyc +0 -0
package/lib/__pycache__/meta_learning.cpython-314.pyc +0 -0
package/lib/__pycache__/sandbox_wrapper.cpython-314.pyc +0 -0
package/lib/ai-cli.sh +11 -9
package/lib/ai_cli.py +64 -34
package/lib/config.py +0 -0
package/lib/config.sh +24 -18
package/lib/csv-lock.sh +0 -0
package/lib/editor.sh +0 -0
package/lib/evolution_csv.py +0 -0
package/lib/evolution_processor.py +0 -0
package/lib/evolve_ideate.py +0 -0
package/lib/evolve_worker.py +114 -43
package/lib/llm_bandit.py +0 -0
package/lib/log.py +0 -0
package/lib/meta_learning.py +0 -0
package/lib/sandbox.sb +0 -0
package/lib/sandbox_wrapper.py +0 -0
package/package.json +1 -1
package/templates/BRIEF.md +0 -0
package/templates/algorithm.py +0 -0
package/templates/config.yaml +39 -37
package/templates/evaluator.py +0 -0
package/lib/__pycache__/ai_cli.cpython-310.pyc +0 -0
package/lib/__pycache__/embedding.cpython-310.pyc +0 -0
package/lib/__pycache__/evolution_csv.cpython-310.pyc +0 -0
package/lib/__pycache__/evolve_ideate.cpython-310.pyc +0 -0
package/lib/__pycache__/log.cpython-310.pyc +0 -0

package/README.md CHANGED Viewed

File without changes

package/lib/__pycache__/ai_cli.cpython-311.pyc ADDED Viewed

Binary file

package/lib/__pycache__/ai_cli.cpython-314.pyc CHANGED Viewed

Binary file

package/lib/__pycache__/embedding.cpython-314.pyc CHANGED Viewed

Binary file

package/lib/__pycache__/evolution_csv.cpython-311.pyc ADDED Viewed

Binary file

package/lib/__pycache__/evolution_csv.cpython-313.pyc ADDED Viewed

Binary file

package/lib/__pycache__/evolution_csv.cpython-314.pyc CHANGED Viewed

Binary file

package/lib/__pycache__/evolve_ideate.cpython-314.pyc CHANGED Viewed

Binary file

package/lib/__pycache__/evolve_run.cpython-311.pyc ADDED Viewed

Binary file

package/lib/__pycache__/evolve_run.cpython-314.pyc ADDED Viewed

Binary file

package/lib/__pycache__/evolve_worker.cpython-314.pyc ADDED Viewed

Binary file

package/lib/__pycache__/llm_bandit.cpython-314.pyc ADDED Viewed

Binary file

package/lib/__pycache__/log.cpython-311.pyc ADDED Viewed

Binary file

package/lib/__pycache__/log.cpython-314.pyc ADDED Viewed

Binary file

package/lib/__pycache__/meta_learning.cpython-314.pyc ADDED Viewed

Binary file

package/lib/__pycache__/sandbox_wrapper.cpython-314.pyc ADDED Viewed

Binary file

package/lib/ai-cli.sh CHANGED Viewed

@@ -53,7 +53,7 @@ EOF
 call_ai_model_configured() {
   local model_name="$1"
   local prompt="$2"
-  local codex_gpt_model="${CODEX_GPT_MODEL:-${CODEX_GPT5_MODEL:-gpt-5.2}}"
+  local codex_gpt_model="${CODEX_GPT_MODEL:-${CODEX_GPT5_MODEL:-gpt-5.5}}"
   # Record start time
   local start_time=$(date +%s)
@@ -123,7 +123,7 @@ $prompt"
     codex-think)
       local ai_output
       # High reasoning - for ideation tasks requiring deep thinking
-      ai_output=$(codex exec -m gpt-5.4 -c model_reasoning_effort="high" --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
+      ai_output=$(codex exec -m gpt-5.5 -c model_reasoning_effort="high" --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
       local ai_exit_code=$?
       ;;
     codex-coding)
@@ -135,7 +135,7 @@ $prompt"
     codex-spark)
       local ai_output
       # Cheap/fast lightweight fallback
-      ai_output=$(codex exec -m gpt-5.1-codex-mini --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
+      ai_output=$(codex exec -m gpt-5.4-mini --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
       local ai_exit_code=$?
       ;;
     # --- Gemini (subscription) ---
@@ -365,19 +365,21 @@ get_models_for_command() {
   echo "$model_list"
 }
-# Get fallback models for a specific command (run or ideate)
-# Usage: get_fallback_models_for_command <command>
-# Returns: Space-separated list of fallback model names
-get_fallback_models_for_command() {
+# Get escalation models for a specific command (run or ideate)
+# AIDEV-NOTE: Escalation models are big/commercial models used only when
+# cheap primary models produce code with syntax or validation errors.
+# Usage: get_escalation_models_for_command <command>
+# Returns: Space-separated list of escalation model names
+get_escalation_models_for_command() {
   local command="$1"
   local model_list=""
   case "$command" in
     run)
-      model_list="$LLM_RUN_FALLBACK"
+      model_list="$LLM_RUN_ESCALATION"
       ;;
     ideate)
-      model_list="$LLM_IDEATE_FALLBACK"
+      model_list="$LLM_IDEATE_ESCALATION"
       ;;
     *)
       echo "[ERROR] Unknown command: $command" >&2

package/lib/ai_cli.py CHANGED Viewed

@@ -202,22 +202,26 @@ def get_models_for_command(command: str) -> List[str]:
     return model_list.split()
-def get_fallback_models_for_command(command: str) -> List[str]:
+def get_escalation_models_for_command(command: str) -> List[str]:
     """
-    Get the list of fallback models for a command.
+    Get the list of escalation models for a command.
+    AIDEV-NOTE: Escalation models are big/commercial models (sonnet, codex, gemini)
+    used only when cheap primary models produce code with syntax or validation errors.
+    They are NOT for API-down situations — if primary is down, fail fast.
     Args:
         command: Either "run" or "ideate"
     Returns:
-        List of fallback model names
+        List of escalation model names
     """
     bash_script = f'''
         source "{SCRIPT_DIR}/config.sh"
         load_config
         case "$1" in
-            run) echo "$LLM_RUN_FALLBACK" ;;
-            ideate) echo "$LLM_IDEATE_FALLBACK" ;;
+            run) echo "$LLM_RUN_ESCALATION" ;;
+            ideate) echo "$LLM_IDEATE_ESCALATION" ;;
         esac
     '''
@@ -423,34 +427,30 @@ def call_ai_with_backoff(
     env_vars: Optional[dict] = None,
     max_rounds: int = 10,
     initial_wait: int = 60,
-    max_wait: int = 600,
-    use_fallback: bool = True
+    max_wait: int = 600
 ) -> Tuple[str, str]:
     """
-    Call AI with tiered fallback and round-based retries with exponential backoff.
+    Call AI with round-based retries and exponential backoff.
-    AIDEV-NOTE: Tiered fallback system:
-    1. First tries all primary models with backoff
-    2. If primary exhausted and use_fallback=True, tries fallback models
-    3. Fallback models are cheaper/simpler backups (haiku, flash, etc.)
+    AIDEV-NOTE: No fallback tier — if all primary models are down, fail fast
+    so monitoring catches it. Escalation to big models is handled separately
+    by evolve_worker when code quality issues are detected.
     Args:
         prompt: The prompt to send
         command: "run" or "ideate" - determines model pool
         working_dir: Directory for file operations
         env_vars: Additional environment variables
-        max_rounds: Maximum number of full rounds per tier
+        max_rounds: Maximum number of full rounds
         initial_wait: Initial wait time in seconds after first failed round
         max_wait: Maximum wait time in seconds between rounds
-        use_fallback: Whether to try fallback tier if primary fails
     Returns:
         Tuple of (output, model_name)
     Raises:
-        AIError: If all tiers exhausted without success
+        AIError: If all models exhausted without success
     """
-    # Try primary tier first
     primary_models = get_models_for_command(command)
     if not primary_models:
         raise AIError(f"No primary models configured for command: {command}")
@@ -463,28 +463,58 @@ def call_ai_with_backoff(
     if output is not None:
         return output, model_name
-    # Primary exhausted - try fallback if enabled
-    if use_fallback:
-        fallback_models = get_fallback_models_for_command(command)
-        if fallback_models:
-            _log(f"Primary tier exhausted, trying {len(fallback_models)} fallback models...")
+    # All models exhausted — fail fast, let monitoring catch it
+    error_summary = "; ".join(f"{m}: {e[:50]}" for m, e in list(primary_errors.items())[:3])
+    raise AIError(f"All primary models exhausted ({max_rounds} rounds). Last errors: {error_summary}")
-            output, model_name, fallback_errors = _try_models_with_backoff(
-                prompt, fallback_models, "Fallback",
-                working_dir, env_vars, max_rounds, initial_wait, max_wait
-            )
-            if output is not None:
-                return output, model_name
+def call_ai_escalation(
+    prompt: str,
+    command: str = "run",
+    working_dir: Optional[str] = None,
+    env_vars: Optional[dict] = None
+) -> Tuple[str, str]:
+    """
+    Try each escalation model once (no backoff). Used when cheap models produced
+    bad code (syntax/validation errors) and we need a smarter model to fix it.
-            # Both tiers exhausted
-            all_errors = {**primary_errors, **fallback_errors}
-            error_summary = "; ".join(f"{m}: {e[:50]}" for m, e in list(all_errors.items())[:3])
-            raise AIError(f"All tiers exhausted ({max_rounds} rounds each). Last errors: {error_summary}")
+    AIDEV-NOTE: Quality-triggered escalation. Each model gets one shot.
+    If all escalation models fail (rate limits etc), raise AIError — the candidate
+    will be marked failed-ai-retry so it can be retried when limits reset.
-    # Primary exhausted, no fallback
-    error_summary = "; ".join(f"{m}: {e[:50]}" for m, e in list(primary_errors.items())[:3])
-    raise AIError(f"Primary tier exhausted ({max_rounds} rounds). Last errors: {error_summary}")
+    Args:
+        prompt: The prompt to send (should include error context)
+        command: "run" or "ideate" - determines escalation model pool
+        working_dir: Directory for file operations
+        env_vars: Additional environment variables
+    Returns:
+        Tuple of (output, model_name)
+    Raises:
+        AIError: If no escalation models available or all failed
+    """
+    escalation_models = get_escalation_models_for_command(command)
+    if not escalation_models:
+        raise AIError(f"No escalation models configured for command: {command}")
+    # Shuffle so we don't always burn the same model first
+    models = escalation_models.copy()
+    random.shuffle(models)
+    last_errors = {}
+    for model in models:
+        try:
+            _log(f"Escalation: trying {model}...")
+            output, model_name = call_ai_model(prompt, model, working_dir, env_vars)
+            _log(f"Escalation succeeded with {model}")
+            return output, model_name
+        except AIError as e:
+            _log(f"Escalation {model} failed: {str(e)[:60]}...")
+            last_errors[model] = str(e)
+    error_summary = "; ".join(f"{m}: {e[:50]}" for m, e in last_errors.items())
+    raise AIError(f"All escalation models failed. Errors: {error_summary}")
 def call_ai_for_file_edit(

package/lib/config.py CHANGED Viewed

File without changes

package/lib/config.sh CHANGED Viewed

@@ -57,20 +57,22 @@ DEFAULT_MEMORY_LIMIT_MB=12288
 # Workers will exit after processing this many candidates to pick up library updates
 DEFAULT_WORKER_MAX_CANDIDATES=3
-# Default LLM CLI configuration - tiered fallback system
-# Primary: Strong models used in normal operation
-# Fallback: Cheap/backup models used only when primary tier exhausted
+# Default LLM CLI configuration - quality-triggered escalation system
+# AIDEV-NOTE: Two-tier design for coding (run):
+#   Primary: Cheap/open models handle normal code generation
+#   Escalation: Big commercial models only activated on syntax/validation failure
+# Ideation keeps its own primary list (thinking models for creative work)
+# No fallback tier — if all models are down, fail fast so monitoring catches it.
 #
-# Run: Subscription-based agentic models for code generation
-# All CLI tools (opencode, claude, gemini, kimi) are agentic and can edit files
-# Ollama cloud models are flat-rate (subscription), so prefer them over per-token OpenRouter
-DEFAULT_LLM_RUN="gemini-pro gemini-pro ollama-glm ollama-glm ollama-qwen ollama-qwen ollama-minimax ollama-minimax ollama-gemma ollama-gemma kimi-coder kimi-coder codex-coding codex-coding glm-zai qwen-coder minimax sonnet"
-DEFAULT_LLM_RUN_FALLBACK="haiku ollama-glm ollama-gemma ollama-minimax ollama-qwen glm-zai gemini-cheap codex-spark qwen"
+# Run: Cheap/open models for code generation (flat-rate or low-cost)
+DEFAULT_LLM_RUN="ollama-glm ollama-glm ollama-qwen ollama-qwen ollama-minimax ollama-minimax ollama-gemma ollama-gemma kimi-coder kimi-coder glm-zai qwen-coder minimax"
+# Run escalation: Big models activated only when cheap models produce bad code
+DEFAULT_LLM_RUN_ESCALATION="sonnet codex-coding gemini-pro"
 #
-# Ideate: Agentic models that can edit files for ideation
-# All CLI tools (opencode, claude, gemini, kimi) are agentic and can edit files
+# Ideate: Strong models for creative ideation
 DEFAULT_LLM_IDEATE="opus-think ollama-glm ollama-glm gemini-pro ollama-qwen ollama-minimax ollama-gemma kimi-coder gpt codex-think glm-zai qwen-coder minimax qwen"
-DEFAULT_LLM_IDEATE_FALLBACK="haiku ollama-glm ollama-gemma ollama-minimax ollama-qwen glm-zai gemini-cheap codex-spark qwen"
+# Ideate escalation: not currently used but available for future use
+DEFAULT_LLM_IDEATE_ESCALATION=""
 # Load configuration from a YAML file and update variables
 _load_yaml_config() {
@@ -147,12 +149,16 @@ _load_yaml_config() {
         lock_timeout) LOCK_TIMEOUT="$value" ;;
       esac
     elif [[ $in_llm_cli_section == true ]]; then
-      if [[ $key == "run" || $key == "ideate" || $key == "run_fallback" || $key == "ideate_fallback" ]]; then
+      if [[ $key == "run" || $key == "ideate" || $key == "run_escalation" || $key == "ideate_escalation" || $key == "run_fallback" || $key == "ideate_fallback" ]]; then
         case $key in
           run) LLM_RUN="$value" ;;
-          run_fallback) LLM_RUN_FALLBACK="$value" ;;
+          run_escalation) LLM_RUN_ESCALATION="$value" ;;
+          # Legacy fallback keys map to escalation for backward compatibility
+          run_fallback) LLM_RUN_ESCALATION="$value" ;;
           ideate) LLM_IDEATE="$value" ;;
-          ideate_fallback) LLM_IDEATE_FALLBACK="$value" ;;
+          ideate_escalation) LLM_IDEATE_ESCALATION="$value" ;;
+          # Legacy fallback keys map to escalation for backward compatibility
+          ideate_fallback) LLM_IDEATE_ESCALATION="$value" ;;
         esac
       else
         value=$(echo "$value" | sed "s/^'//;s/'$//")
@@ -223,9 +229,9 @@ load_config() {
   WORKER_MAX_CANDIDATES="$DEFAULT_WORKER_MAX_CANDIDATES"
   LLM_RUN="$DEFAULT_LLM_RUN"
-  LLM_RUN_FALLBACK="$DEFAULT_LLM_RUN_FALLBACK"
+  LLM_RUN_ESCALATION="$DEFAULT_LLM_RUN_ESCALATION"
   LLM_IDEATE="$DEFAULT_LLM_IDEATE"
-  LLM_IDEATE_FALLBACK="$DEFAULT_LLM_IDEATE_FALLBACK"
+  LLM_IDEATE_ESCALATION="$DEFAULT_LLM_IDEATE_ESCALATION"
   # Determine local config file path relative to EVOLUTION_DIR
   local local_config_file="$EVOLUTION_DIR/config.yaml"
@@ -318,7 +324,7 @@ show_config() {
   echo "  Memory limit: ${MEMORY_LIMIT_MB}MB"
   echo "  Worker max candidates: $WORKER_MAX_CANDIDATES"
   echo "  LLM for run: $LLM_RUN"
-  echo "  LLM for run (fallback): $LLM_RUN_FALLBACK"
+  echo "  LLM for run (escalation): $LLM_RUN_ESCALATION"
   echo "  LLM for ideate: $LLM_IDEATE"
-  echo "  LLM for ideate (fallback): $LLM_IDEATE_FALLBACK"
+  echo "  LLM for ideate (escalation): $LLM_IDEATE_ESCALATION"
 }

package/lib/csv-lock.sh CHANGED Viewed

File without changes

package/lib/editor.sh CHANGED Viewed

File without changes

package/lib/evolution_csv.py CHANGED Viewed

File without changes

package/lib/evolution_processor.py CHANGED Viewed

File without changes

package/lib/evolve_ideate.py CHANGED Viewed

File without changes

package/lib/evolve_worker.py CHANGED Viewed

@@ -35,8 +35,8 @@ set_prefix("WORKER")
 from lib.evolution_csv import EvolutionCSV
 from lib.ai_cli import (
-    call_ai_with_backoff, call_ai_model, get_models_for_command,
-    get_git_protection_warning, AIError
+    call_ai_with_backoff, call_ai_escalation, call_ai_model,
+    get_models_for_command, get_git_protection_warning, AIError
 )
 from lib.llm_bandit import LLMBandit
@@ -170,11 +170,12 @@ CRITICAL: If you do not know how to implement what was asked for, or if the requ
     def _call_ai_with_backoff(self, prompt: str, target_file: Path) -> Tuple[bool, str]:
         """
-        Call AI with bandit-based model selection and fallback.
+        Call AI with bandit-based model selection from the primary (cheap) tier.
         AIDEV-NOTE: First tries model selected by UCB bandit.
-        If that fails, falls back to round-robin retry approach.
-        The bandit learns which models produce better algorithm improvements.
+        If that fails, retries with round-robin across all primary models.
+        The bandit learns which cheap models produce better algorithm improvements.
+        Escalation to big models is handled separately on quality failures.
         Returns:
             Tuple of (success, model_name)
@@ -203,14 +204,14 @@ CRITICAL: If you do not know how to implement what was asked for, or if the requ
                 else:
                     # AIDEV-NOTE: Log output so we can diagnose why file wasn't modified
                     preview = output[-300:] if output else "(empty)"
-                    log(f"Bandit model {selected_model} completed but didn't modify file ({len(output)} chars), trying fallback...")
+                    log(f"Bandit model {selected_model} completed but didn't modify file ({len(output)} chars), trying other primary models...")
                     log(f"AI output preview: {preview}")
                     # AIDEV-NOTE: Report no-modification as failure to bandit
                     self.bandit.update(selected_model, child_score=None, parent_score=self._parent_score)
                     log(f"Bandit update: {selected_model} no file modification")
             except AIError as e:
-                log(f"Bandit model {selected_model} failed: {e}, trying fallback...")
+                log(f"Bandit model {selected_model} failed: {e}, trying other primary models...")
                 # AIDEV-NOTE: Report AI-level failure to bandit so it learns to avoid broken models
                 self.bandit.update(selected_model, child_score=None, parent_score=self._parent_score)
                 log(f"Bandit update: {selected_model} AI call failed")
@@ -244,6 +245,42 @@ CRITICAL: If you do not know how to implement what was asked for, or if the requ
             log_error(f"All AI retries exhausted: {e}")
             return False, ""
+    def _call_ai_escalated(self, prompt: str, target_file: Path) -> Tuple[bool, str]:
+        """
+        Call escalation-tier AI (big commercial models) to fix code quality issues.
+        AIDEV-NOTE: Quality-triggered escalation. Only called when cheap primary models
+        produced code with syntax or validation errors. Each escalation model gets one
+        shot — no backoff loops. If all escalation models fail, the candidate should be
+        marked failed-ai-retry (API limits) not failed-validation (bad idea).
+        Returns:
+            Tuple of (success, model_name)
+        """
+        hash_before = self._file_hash(target_file) if target_file.exists() else None
+        try:
+            output, model = call_ai_escalation(
+                prompt,
+                command="run",
+                working_dir=self.config.evolution_dir
+            )
+            hash_after = self._file_hash(target_file) if target_file.exists() else None
+            if hash_before != hash_after and hash_after is not None:
+                log(f"Escalation AI successfully modified file (model: {model})")
+                return True, model
+            else:
+                preview = output[-300:] if output else "(empty)"
+                log(f"Escalation AI completed but did not modify file ({len(output)} chars)")
+                log(f"AI output preview: {preview}")
+                return False, model
+        except AIError as e:
+            log_error(f"All escalation models failed: {e}")
+            return False, ""
     def _file_hash(self, path: Path) -> Optional[str]:
         """Get file hash."""
         try:
@@ -550,59 +587,93 @@ python validator.py {target_basename}
                 with EvolutionCSV(self.config.csv_path) as csv:
                     csv.update_candidate_field(candidate.id, 'run-LLM', model)
+            # AIDEV-NOTE: Quality-triggered escalation system.
+            # Phase 1: Check syntax from cheap model output
+            # Phase 2: If syntax fails, escalate to big model with error context
+            # Phase 3: Validate, if fails escalate to big model with error context
+            # If escalation models also fail (API limits), mark failed-ai-retry.
+            # If escalation models produce code but it's still bad, mark failed-validation.
             # Check syntax
             if not self._check_syntax(target_file):
-                log_error("Syntax error in generated file")
-                target_file.unlink(missing_ok=True)
-                with EvolutionCSV(self.config.csv_path) as csv:
-                    csv.update_candidate_status(candidate.id, 'pending')
-                return 0  # Will retry
+                log("Syntax error from primary model, escalating to big model...")
+                # Get the syntax error details for context
+                syntax_result = subprocess.run(
+                    [self.config.python_cmd, "-m", "py_compile", str(target_file)],
+                    capture_output=True, text=True
+                )
+                syntax_error = syntax_result.stderr.strip()
-            # Run validator with retry loop
-            # AIDEV-NOTE: Validator catches structural errors before expensive full evaluation.
-            # If validation fails, we give the AI feedback and ask it to fix the code.
-            validation_passed = False
-            for validation_attempt in range(self.config.max_validation_retries + 1):
-                valid, error_info = self._run_validator(candidate.id)
+                fix_prompt = self._build_fix_prompt(
+                    candidate, target_file.name,
+                    {'error_type': 'syntax', 'error': syntax_error}
+                )
+                success, fix_model = self._call_ai_escalated(fix_prompt, target_file)
-                if valid:
-                    validation_passed = True
-                    break
+                if not success:
+                    log_error("Escalation models failed to fix syntax error")
+                    target_file.unlink(missing_ok=True)
+                    with EvolutionCSV(self.config.csv_path) as csv:
+                        csv.update_candidate_status(candidate.id, 'failed-ai-retry')
+                    return 77
-                if validation_attempt >= self.config.max_validation_retries:
-                    log_error(f"Validation failed after {self.config.max_validation_retries} fix attempts")
-                    break
+                # Record escalation model
+                if fix_model:
+                    with EvolutionCSV(self.config.csv_path) as csv:
+                        current_llm = csv.get_candidate_info(candidate.id).get('run-LLM', '')
+                        new_llm = f"{current_llm}+ESC:{fix_model}" if current_llm else f"ESC:{fix_model}"
+                        csv.update_candidate_field(candidate.id, 'run-LLM', new_llm)
+                # Re-check syntax after escalation fix
+                if not self._check_syntax(target_file):
+                    log_error("Escalation model also produced syntax error — idea too hard")
+                    target_file.unlink(missing_ok=True)
+                    with EvolutionCSV(self.config.csv_path) as csv:
+                        csv.update_candidate_status(candidate.id, 'failed-validation')
+                    return 1
+            # Run validator with escalation on failure
+            # AIDEV-NOTE: Validator catches structural errors before expensive full evaluation.
+            # First attempt uses the code as-is from primary model. On failure, escalate once.
+            valid, error_info = self._run_validator(candidate.id)
-                # Ask AI to fix the validation error
-                log(f"Validation failed (attempt {validation_attempt + 1}), asking AI to fix...")
+            if not valid:
+                log("Validation failed from primary model, escalating to big model...")
                 fix_prompt = self._build_fix_prompt(candidate, target_file.name, error_info)
-                success, fix_model = self._call_ai_with_backoff(fix_prompt, target_file)
+                success, fix_model = self._call_ai_escalated(fix_prompt, target_file)
                 if not success:
-                    log_error("AI failed to fix validation error")
-                    break
+                    log_error("Escalation models failed to fix validation error")
+                    target_file.unlink(missing_ok=True)
+                    with EvolutionCSV(self.config.csv_path) as csv:
+                        csv.update_candidate_status(candidate.id, 'failed-ai-retry')
+                    return 77
-                # Record that we used an additional model call for fixing
+                # Record escalation model
                 if fix_model:
                     with EvolutionCSV(self.config.csv_path) as csv:
                         current_llm = csv.get_candidate_info(candidate.id).get('run-LLM', '')
-                        new_llm = f"{current_llm}+{fix_model}" if current_llm else fix_model
+                        new_llm = f"{current_llm}+ESC:{fix_model}" if current_llm else f"ESC:{fix_model}"
                         csv.update_candidate_field(candidate.id, 'run-LLM', new_llm)
-                # Re-check syntax after fix
+                # Check syntax after escalation fix (escalation might break it)
                 if not self._check_syntax(target_file):
-                    log_error("Fix introduced syntax error")
-                    # Don't break - try again if we have retries left
+                    log_error("Escalation fix introduced syntax error")
+                    target_file.unlink(missing_ok=True)
+                    with EvolutionCSV(self.config.csv_path) as csv:
+                        csv.update_candidate_status(candidate.id, 'failed-validation')
+                    return 1
-            if not validation_passed:
-                # Validation failed after all retries
-                with EvolutionCSV(self.config.csv_path) as csv:
-                    csv.update_candidate_status(candidate.id, 'failed-validation')
-                    # Store the last error for debugging
-                    if error_info:
-                        error_summary = f"{error_info.get('error_type', 'unknown')}: {error_info.get('error', '')[:100]}"
-                        csv.update_candidate_field(candidate.id, 'validation_error', error_summary)
-                return 1
+                # Re-validate after escalation fix
+                valid, error_info = self._run_validator(candidate.id)
+                if not valid:
+                    log_error("Validation still fails after escalation — idea too hard")
+                    with EvolutionCSV(self.config.csv_path) as csv:
+                        csv.update_candidate_status(candidate.id, 'failed-validation')
+                        if error_info:
+                            error_summary = f"{error_info.get('error_type', 'unknown')}: {error_info.get('error', '')[:100]}"
+                            csv.update_candidate_field(candidate.id, 'validation_error', error_summary)
+                    return 1
         # Run evaluator
         log("Running evaluator...")

package/lib/llm_bandit.py CHANGED Viewed

File without changes

package/lib/log.py CHANGED Viewed

File without changes

package/lib/meta_learning.py CHANGED Viewed

File without changes

package/lib/sandbox.sb CHANGED Viewed

File without changes

package/lib/sandbox_wrapper.py CHANGED Viewed

File without changes

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "claude-evolve",
-  "version": "1.13.0",
+  "version": "1.14.0",
   "bin": {
     "claude-evolve": "bin/claude-evolve",
     "claude-evolve-main": "bin/claude-evolve-main",

package/templates/BRIEF.md CHANGED Viewed

File without changes

package/templates/algorithm.py CHANGED Viewed

File without changes

package/templates/config.yaml CHANGED Viewed

@@ -86,47 +86,49 @@ parallel:
   # Timeout in seconds when waiting for CSV locks
   lock_timeout: 30
-# LLM/AI CLI configuration
+# LLM/AI CLI configuration — quality-triggered escalation system
+# AIDEV-NOTE: Two tiers for coding:
+#   Primary (run): Cheap/open models for normal code generation
+#   Escalation (run_escalation): Big models activated ONLY on syntax/validation failure
+# No fallback tier — if all primary models are down, fail fast so monitoring catches it.
 llm_cli:
-  # What to run for each sub-command
-  # Models are tried in order, with round-robin distribution across candidates
-  # You can repeat models for weighted selection (e.g., "gemini-pro gemini-pro sonnet" for 2:1 ratio)
+  # Models are selected randomly, repeat for weighted selection (e.g., "model model" = 2x weight)
+  # Commented out because defaults change over time; uncomment to override
-  # Default configuration: sonnet at ~11%, rest doubled for cost savings
-  # Commented out because these change over time; uncomment to override
-  #run: gemini-pro gemini-pro glm-5-zai glm-5-zai kimi-coder kimi-coder codex-coding codex-coding sonnet
-  #ideate: opus-think glm-5-zai gemini-pro kimi-coder gpt-5.2 codex-think qwen-openrouter
-  #run_fallback: haiku glm-5-zai gemini-5-flash codex-spark
-  #ideate_fallback: haiku glm-5-zai gemini-5-flash codex-spark
+  # Coding: cheap/open models (flat-rate or low per-token cost)
+  #run: ollama-glm ollama-glm ollama-qwen ollama-qwen ollama-minimax ollama-minimax ollama-gemma ollama-gemma kimi-coder kimi-coder glm-zai qwen-coder minimax
+  # Coding escalation: big commercial models, only used when primary produces bad code
+  #run_escalation: sonnet codex-coding gemini-pro
+  # Ideation: strong thinking models for creative work
+  #ideate: opus-think ollama-glm ollama-glm gemini-pro ollama-qwen ollama-minimax ollama-gemma kimi-coder gpt codex-think glm-zai qwen-coder minimax qwen
+  # Ideation escalation: not currently used
+  #ideate_escalation:
   # Available models:
-  # Claude (subscription-based, watch usage limits):
-  # - sonnet: Claude Sonnet via Claude CLI
-  # - sonnet-think: Claude Sonnet with extended thinking (ultrathink prompt)
-  # - opus: Claude Opus via Claude CLI
-  # - opus-think: Claude Opus with extended thinking (ultrathink prompt)
-  # - haiku: Claude Haiku via Claude CLI (cheap fallback)
   #
-  # Codex/OpenAI (subscription-based):
-  # - codex-think: GPT-5.4 high reasoning effort (ideation)
-  # - codex-coding: GPT-5.4 medium reasoning effort (coding/run)
-  # - codex-spark: GPT-5.1 Codex Mini (lightweight fallback)
-  # - gpt-5.4: GPT-5.4 no reasoning effort override via Codex CLI
-  # - gpt-5.2: GPT-5.2 via Codex CLI
-  # - gpt-5.3-codex: GPT-5.3 Codex (code-specialized) via Codex CLI
-  # - gpt5: GPT-5 via Codex CLI (legacy alias)
-  # - gpt5high: GPT-5 via Codex CLI (high reasoning)
-  # - o3high: O3 via Codex CLI (high reasoning)
+  # --- Cheap/open (primary coding tier) ---
+  # Ollama cloud (flat-rate $20/mo subscription):
+  # - ollama-glm: GLM 5.1 via Ollama cloud
+  # - ollama-qwen: Qwen 3.6 via Ollama cloud
+  # - ollama-minimax: MiniMax M2.7 via Ollama cloud
+  # - ollama-gemma: Gemma 4 31B via Ollama cloud
+  #
+  # Low-cost APIs:
+  # - kimi-coder: Kimi for Coding via kimi CLI (subscription)
+  # - glm-zai: GLM 5 via Z.AI agentic mode
+  # - qwen-coder: Qwen 3 Coder via OpenRouter
+  # - minimax: MiniMax M2.7 via OpenRouter
+  # - qwen: Qwen 3.6 Plus via OpenRouter
   #
-  # Gemini (free tier available):
-  # - gemini-pro: Gemini 3 Pro Preview via Gemini CLI
-  # - gemini-5-flash: Gemini 5 Flash via Gemini CLI (cheap fallback)
-  # - gemini-flash: Gemini 2.5 Flash via Gemini CLI (legacy)
+  # --- Big commercial (escalation tier) ---
+  # - sonnet: Claude Sonnet via Claude CLI
+  # - codex-coding: GPT-5.4 medium reasoning via Codex CLI
+  # - gemini-pro: Gemini 3 Pro via Gemini CLI
   #
-  # Other free/cheap models:
-  # - glm-5-zai: GLM-5 via Z.AI agentic mode
-  # - kimi-coder: Kimi for Coding via kimi CLI (fast, good for code gen)
-  # - kimi-k2-openrouter: Kimi K2 Thinking via OpenRouter
-  # - codex-oss-local: Local model via Codex + Ollama
-  # - cursor-sonnet: Claude Sonnet via Cursor Agent CLI
-  # - cursor-opus: Claude Opus via Cursor Agent CLI
+  # --- Ideation models ---
+  # - opus-think: Claude Opus with extended thinking
+  # - codex-think: GPT-5.5 high reasoning effort
+  # - gpt: GPT-5.5 via Codex CLI

package/templates/evaluator.py CHANGED Viewed

File without changes

package/lib/__pycache__/ai_cli.cpython-310.pyc DELETED Viewed

Binary file

package/lib/__pycache__/embedding.cpython-310.pyc DELETED Viewed

Binary file

package/lib/__pycache__/evolution_csv.cpython-310.pyc DELETED Viewed

Binary file

package/lib/__pycache__/evolve_ideate.cpython-310.pyc DELETED Viewed

Binary file

package/lib/__pycache__/log.cpython-310.pyc DELETED Viewed

Binary file