claude-evolve 1.13.0 → 1.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -0
- package/lib/__pycache__/ai_cli.cpython-311.pyc +0 -0
- package/lib/__pycache__/ai_cli.cpython-314.pyc +0 -0
- package/lib/__pycache__/embedding.cpython-314.pyc +0 -0
- package/lib/__pycache__/evolution_csv.cpython-311.pyc +0 -0
- package/lib/__pycache__/evolution_csv.cpython-313.pyc +0 -0
- package/lib/__pycache__/evolution_csv.cpython-314.pyc +0 -0
- package/lib/__pycache__/evolve_ideate.cpython-314.pyc +0 -0
- package/lib/__pycache__/evolve_run.cpython-311.pyc +0 -0
- package/lib/__pycache__/evolve_run.cpython-314.pyc +0 -0
- package/lib/__pycache__/evolve_worker.cpython-314.pyc +0 -0
- package/lib/__pycache__/llm_bandit.cpython-314.pyc +0 -0
- package/lib/__pycache__/log.cpython-311.pyc +0 -0
- package/lib/__pycache__/log.cpython-314.pyc +0 -0
- package/lib/__pycache__/meta_learning.cpython-314.pyc +0 -0
- package/lib/__pycache__/sandbox_wrapper.cpython-314.pyc +0 -0
- package/lib/ai-cli.sh +11 -9
- package/lib/ai_cli.py +64 -34
- package/lib/config.py +0 -0
- package/lib/config.sh +24 -18
- package/lib/csv-lock.sh +0 -0
- package/lib/editor.sh +0 -0
- package/lib/evolution_csv.py +0 -0
- package/lib/evolution_processor.py +0 -0
- package/lib/evolve_ideate.py +0 -0
- package/lib/evolve_worker.py +114 -43
- package/lib/llm_bandit.py +0 -0
- package/lib/log.py +0 -0
- package/lib/meta_learning.py +0 -0
- package/lib/sandbox.sb +0 -0
- package/lib/sandbox_wrapper.py +0 -0
- package/package.json +1 -1
- package/templates/BRIEF.md +0 -0
- package/templates/algorithm.py +0 -0
- package/templates/config.yaml +39 -37
- package/templates/evaluator.py +0 -0
- package/lib/__pycache__/ai_cli.cpython-310.pyc +0 -0
- package/lib/__pycache__/embedding.cpython-310.pyc +0 -0
- package/lib/__pycache__/evolution_csv.cpython-310.pyc +0 -0
- package/lib/__pycache__/evolve_ideate.cpython-310.pyc +0 -0
- package/lib/__pycache__/log.cpython-310.pyc +0 -0
package/README.md
CHANGED
|
File without changes
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/lib/ai-cli.sh
CHANGED
|
@@ -53,7 +53,7 @@ EOF
|
|
|
53
53
|
call_ai_model_configured() {
|
|
54
54
|
local model_name="$1"
|
|
55
55
|
local prompt="$2"
|
|
56
|
-
local codex_gpt_model="${CODEX_GPT_MODEL:-${CODEX_GPT5_MODEL:-gpt-5.
|
|
56
|
+
local codex_gpt_model="${CODEX_GPT_MODEL:-${CODEX_GPT5_MODEL:-gpt-5.5}}"
|
|
57
57
|
|
|
58
58
|
# Record start time
|
|
59
59
|
local start_time=$(date +%s)
|
|
@@ -123,7 +123,7 @@ $prompt"
|
|
|
123
123
|
codex-think)
|
|
124
124
|
local ai_output
|
|
125
125
|
# High reasoning - for ideation tasks requiring deep thinking
|
|
126
|
-
ai_output=$(codex exec -m gpt-5.
|
|
126
|
+
ai_output=$(codex exec -m gpt-5.5 -c model_reasoning_effort="high" --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
|
|
127
127
|
local ai_exit_code=$?
|
|
128
128
|
;;
|
|
129
129
|
codex-coding)
|
|
@@ -135,7 +135,7 @@ $prompt"
|
|
|
135
135
|
codex-spark)
|
|
136
136
|
local ai_output
|
|
137
137
|
# Cheap/fast lightweight fallback
|
|
138
|
-
ai_output=$(codex exec -m gpt-5.
|
|
138
|
+
ai_output=$(codex exec -m gpt-5.4-mini --dangerously-bypass-approvals-and-sandbox "$prompt" 2>&1)
|
|
139
139
|
local ai_exit_code=$?
|
|
140
140
|
;;
|
|
141
141
|
# --- Gemini (subscription) ---
|
|
@@ -365,19 +365,21 @@ get_models_for_command() {
|
|
|
365
365
|
echo "$model_list"
|
|
366
366
|
}
|
|
367
367
|
|
|
368
|
-
# Get
|
|
369
|
-
#
|
|
370
|
-
#
|
|
371
|
-
|
|
368
|
+
# Get escalation models for a specific command (run or ideate)
|
|
369
|
+
# AIDEV-NOTE: Escalation models are big/commercial models used only when
|
|
370
|
+
# cheap primary models produce code with syntax or validation errors.
|
|
371
|
+
# Usage: get_escalation_models_for_command <command>
|
|
372
|
+
# Returns: Space-separated list of escalation model names
|
|
373
|
+
get_escalation_models_for_command() {
|
|
372
374
|
local command="$1"
|
|
373
375
|
local model_list=""
|
|
374
376
|
|
|
375
377
|
case "$command" in
|
|
376
378
|
run)
|
|
377
|
-
model_list="$
|
|
379
|
+
model_list="$LLM_RUN_ESCALATION"
|
|
378
380
|
;;
|
|
379
381
|
ideate)
|
|
380
|
-
model_list="$
|
|
382
|
+
model_list="$LLM_IDEATE_ESCALATION"
|
|
381
383
|
;;
|
|
382
384
|
*)
|
|
383
385
|
echo "[ERROR] Unknown command: $command" >&2
|
package/lib/ai_cli.py
CHANGED
|
@@ -202,22 +202,26 @@ def get_models_for_command(command: str) -> List[str]:
|
|
|
202
202
|
return model_list.split()
|
|
203
203
|
|
|
204
204
|
|
|
205
|
-
def
|
|
205
|
+
def get_escalation_models_for_command(command: str) -> List[str]:
|
|
206
206
|
"""
|
|
207
|
-
Get the list of
|
|
207
|
+
Get the list of escalation models for a command.
|
|
208
|
+
|
|
209
|
+
AIDEV-NOTE: Escalation models are big/commercial models (sonnet, codex, gemini)
|
|
210
|
+
used only when cheap primary models produce code with syntax or validation errors.
|
|
211
|
+
They are NOT for API-down situations — if primary is down, fail fast.
|
|
208
212
|
|
|
209
213
|
Args:
|
|
210
214
|
command: Either "run" or "ideate"
|
|
211
215
|
|
|
212
216
|
Returns:
|
|
213
|
-
List of
|
|
217
|
+
List of escalation model names
|
|
214
218
|
"""
|
|
215
219
|
bash_script = f'''
|
|
216
220
|
source "{SCRIPT_DIR}/config.sh"
|
|
217
221
|
load_config
|
|
218
222
|
case "$1" in
|
|
219
|
-
run) echo "$
|
|
220
|
-
ideate) echo "$
|
|
223
|
+
run) echo "$LLM_RUN_ESCALATION" ;;
|
|
224
|
+
ideate) echo "$LLM_IDEATE_ESCALATION" ;;
|
|
221
225
|
esac
|
|
222
226
|
'''
|
|
223
227
|
|
|
@@ -423,34 +427,30 @@ def call_ai_with_backoff(
|
|
|
423
427
|
env_vars: Optional[dict] = None,
|
|
424
428
|
max_rounds: int = 10,
|
|
425
429
|
initial_wait: int = 60,
|
|
426
|
-
max_wait: int = 600
|
|
427
|
-
use_fallback: bool = True
|
|
430
|
+
max_wait: int = 600
|
|
428
431
|
) -> Tuple[str, str]:
|
|
429
432
|
"""
|
|
430
|
-
Call AI with
|
|
433
|
+
Call AI with round-based retries and exponential backoff.
|
|
431
434
|
|
|
432
|
-
AIDEV-NOTE:
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
3. Fallback models are cheaper/simpler backups (haiku, flash, etc.)
|
|
435
|
+
AIDEV-NOTE: No fallback tier — if all primary models are down, fail fast
|
|
436
|
+
so monitoring catches it. Escalation to big models is handled separately
|
|
437
|
+
by evolve_worker when code quality issues are detected.
|
|
436
438
|
|
|
437
439
|
Args:
|
|
438
440
|
prompt: The prompt to send
|
|
439
441
|
command: "run" or "ideate" - determines model pool
|
|
440
442
|
working_dir: Directory for file operations
|
|
441
443
|
env_vars: Additional environment variables
|
|
442
|
-
max_rounds: Maximum number of full rounds
|
|
444
|
+
max_rounds: Maximum number of full rounds
|
|
443
445
|
initial_wait: Initial wait time in seconds after first failed round
|
|
444
446
|
max_wait: Maximum wait time in seconds between rounds
|
|
445
|
-
use_fallback: Whether to try fallback tier if primary fails
|
|
446
447
|
|
|
447
448
|
Returns:
|
|
448
449
|
Tuple of (output, model_name)
|
|
449
450
|
|
|
450
451
|
Raises:
|
|
451
|
-
AIError: If all
|
|
452
|
+
AIError: If all models exhausted without success
|
|
452
453
|
"""
|
|
453
|
-
# Try primary tier first
|
|
454
454
|
primary_models = get_models_for_command(command)
|
|
455
455
|
if not primary_models:
|
|
456
456
|
raise AIError(f"No primary models configured for command: {command}")
|
|
@@ -463,28 +463,58 @@ def call_ai_with_backoff(
|
|
|
463
463
|
if output is not None:
|
|
464
464
|
return output, model_name
|
|
465
465
|
|
|
466
|
-
#
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
if fallback_models:
|
|
470
|
-
_log(f"Primary tier exhausted, trying {len(fallback_models)} fallback models...")
|
|
466
|
+
# All models exhausted — fail fast, let monitoring catch it
|
|
467
|
+
error_summary = "; ".join(f"{m}: {e[:50]}" for m, e in list(primary_errors.items())[:3])
|
|
468
|
+
raise AIError(f"All primary models exhausted ({max_rounds} rounds). Last errors: {error_summary}")
|
|
471
469
|
|
|
472
|
-
output, model_name, fallback_errors = _try_models_with_backoff(
|
|
473
|
-
prompt, fallback_models, "Fallback",
|
|
474
|
-
working_dir, env_vars, max_rounds, initial_wait, max_wait
|
|
475
|
-
)
|
|
476
470
|
|
|
477
|
-
|
|
478
|
-
|
|
471
|
+
def call_ai_escalation(
|
|
472
|
+
prompt: str,
|
|
473
|
+
command: str = "run",
|
|
474
|
+
working_dir: Optional[str] = None,
|
|
475
|
+
env_vars: Optional[dict] = None
|
|
476
|
+
) -> Tuple[str, str]:
|
|
477
|
+
"""
|
|
478
|
+
Try each escalation model once (no backoff). Used when cheap models produced
|
|
479
|
+
bad code (syntax/validation errors) and we need a smarter model to fix it.
|
|
479
480
|
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
raise AIError(f"All tiers exhausted ({max_rounds} rounds each). Last errors: {error_summary}")
|
|
481
|
+
AIDEV-NOTE: Quality-triggered escalation. Each model gets one shot.
|
|
482
|
+
If all escalation models fail (rate limits etc), raise AIError — the candidate
|
|
483
|
+
will be marked failed-ai-retry so it can be retried when limits reset.
|
|
484
484
|
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
485
|
+
Args:
|
|
486
|
+
prompt: The prompt to send (should include error context)
|
|
487
|
+
command: "run" or "ideate" - determines escalation model pool
|
|
488
|
+
working_dir: Directory for file operations
|
|
489
|
+
env_vars: Additional environment variables
|
|
490
|
+
|
|
491
|
+
Returns:
|
|
492
|
+
Tuple of (output, model_name)
|
|
493
|
+
|
|
494
|
+
Raises:
|
|
495
|
+
AIError: If no escalation models available or all failed
|
|
496
|
+
"""
|
|
497
|
+
escalation_models = get_escalation_models_for_command(command)
|
|
498
|
+
if not escalation_models:
|
|
499
|
+
raise AIError(f"No escalation models configured for command: {command}")
|
|
500
|
+
|
|
501
|
+
# Shuffle so we don't always burn the same model first
|
|
502
|
+
models = escalation_models.copy()
|
|
503
|
+
random.shuffle(models)
|
|
504
|
+
|
|
505
|
+
last_errors = {}
|
|
506
|
+
for model in models:
|
|
507
|
+
try:
|
|
508
|
+
_log(f"Escalation: trying {model}...")
|
|
509
|
+
output, model_name = call_ai_model(prompt, model, working_dir, env_vars)
|
|
510
|
+
_log(f"Escalation succeeded with {model}")
|
|
511
|
+
return output, model_name
|
|
512
|
+
except AIError as e:
|
|
513
|
+
_log(f"Escalation {model} failed: {str(e)[:60]}...")
|
|
514
|
+
last_errors[model] = str(e)
|
|
515
|
+
|
|
516
|
+
error_summary = "; ".join(f"{m}: {e[:50]}" for m, e in last_errors.items())
|
|
517
|
+
raise AIError(f"All escalation models failed. Errors: {error_summary}")
|
|
488
518
|
|
|
489
519
|
|
|
490
520
|
def call_ai_for_file_edit(
|
package/lib/config.py
CHANGED
|
File without changes
|
package/lib/config.sh
CHANGED
|
@@ -57,20 +57,22 @@ DEFAULT_MEMORY_LIMIT_MB=12288
|
|
|
57
57
|
# Workers will exit after processing this many candidates to pick up library updates
|
|
58
58
|
DEFAULT_WORKER_MAX_CANDIDATES=3
|
|
59
59
|
|
|
60
|
-
# Default LLM CLI configuration -
|
|
61
|
-
#
|
|
62
|
-
#
|
|
60
|
+
# Default LLM CLI configuration - quality-triggered escalation system
|
|
61
|
+
# AIDEV-NOTE: Two-tier design for coding (run):
|
|
62
|
+
# Primary: Cheap/open models handle normal code generation
|
|
63
|
+
# Escalation: Big commercial models only activated on syntax/validation failure
|
|
64
|
+
# Ideation keeps its own primary list (thinking models for creative work)
|
|
65
|
+
# No fallback tier — if all models are down, fail fast so monitoring catches it.
|
|
63
66
|
#
|
|
64
|
-
# Run:
|
|
65
|
-
|
|
66
|
-
#
|
|
67
|
-
|
|
68
|
-
DEFAULT_LLM_RUN_FALLBACK="haiku ollama-glm ollama-gemma ollama-minimax ollama-qwen glm-zai gemini-cheap codex-spark qwen"
|
|
67
|
+
# Run: Cheap/open models for code generation (flat-rate or low-cost)
|
|
68
|
+
DEFAULT_LLM_RUN="ollama-glm ollama-glm ollama-qwen ollama-qwen ollama-minimax ollama-minimax ollama-gemma ollama-gemma kimi-coder kimi-coder glm-zai qwen-coder minimax"
|
|
69
|
+
# Run escalation: Big models activated only when cheap models produce bad code
|
|
70
|
+
DEFAULT_LLM_RUN_ESCALATION="sonnet codex-coding gemini-pro"
|
|
69
71
|
#
|
|
70
|
-
# Ideate:
|
|
71
|
-
# All CLI tools (opencode, claude, gemini, kimi) are agentic and can edit files
|
|
72
|
+
# Ideate: Strong models for creative ideation
|
|
72
73
|
DEFAULT_LLM_IDEATE="opus-think ollama-glm ollama-glm gemini-pro ollama-qwen ollama-minimax ollama-gemma kimi-coder gpt codex-think glm-zai qwen-coder minimax qwen"
|
|
73
|
-
|
|
74
|
+
# Ideate escalation: not currently used but available for future use
|
|
75
|
+
DEFAULT_LLM_IDEATE_ESCALATION=""
|
|
74
76
|
|
|
75
77
|
# Load configuration from a YAML file and update variables
|
|
76
78
|
_load_yaml_config() {
|
|
@@ -147,12 +149,16 @@ _load_yaml_config() {
|
|
|
147
149
|
lock_timeout) LOCK_TIMEOUT="$value" ;;
|
|
148
150
|
esac
|
|
149
151
|
elif [[ $in_llm_cli_section == true ]]; then
|
|
150
|
-
if [[ $key == "run" || $key == "ideate" || $key == "run_fallback" || $key == "ideate_fallback" ]]; then
|
|
152
|
+
if [[ $key == "run" || $key == "ideate" || $key == "run_escalation" || $key == "ideate_escalation" || $key == "run_fallback" || $key == "ideate_fallback" ]]; then
|
|
151
153
|
case $key in
|
|
152
154
|
run) LLM_RUN="$value" ;;
|
|
153
|
-
|
|
155
|
+
run_escalation) LLM_RUN_ESCALATION="$value" ;;
|
|
156
|
+
# Legacy fallback keys map to escalation for backward compatibility
|
|
157
|
+
run_fallback) LLM_RUN_ESCALATION="$value" ;;
|
|
154
158
|
ideate) LLM_IDEATE="$value" ;;
|
|
155
|
-
|
|
159
|
+
ideate_escalation) LLM_IDEATE_ESCALATION="$value" ;;
|
|
160
|
+
# Legacy fallback keys map to escalation for backward compatibility
|
|
161
|
+
ideate_fallback) LLM_IDEATE_ESCALATION="$value" ;;
|
|
156
162
|
esac
|
|
157
163
|
else
|
|
158
164
|
value=$(echo "$value" | sed "s/^'//;s/'$//")
|
|
@@ -223,9 +229,9 @@ load_config() {
|
|
|
223
229
|
WORKER_MAX_CANDIDATES="$DEFAULT_WORKER_MAX_CANDIDATES"
|
|
224
230
|
|
|
225
231
|
LLM_RUN="$DEFAULT_LLM_RUN"
|
|
226
|
-
|
|
232
|
+
LLM_RUN_ESCALATION="$DEFAULT_LLM_RUN_ESCALATION"
|
|
227
233
|
LLM_IDEATE="$DEFAULT_LLM_IDEATE"
|
|
228
|
-
|
|
234
|
+
LLM_IDEATE_ESCALATION="$DEFAULT_LLM_IDEATE_ESCALATION"
|
|
229
235
|
|
|
230
236
|
# Determine local config file path relative to EVOLUTION_DIR
|
|
231
237
|
local local_config_file="$EVOLUTION_DIR/config.yaml"
|
|
@@ -318,7 +324,7 @@ show_config() {
|
|
|
318
324
|
echo " Memory limit: ${MEMORY_LIMIT_MB}MB"
|
|
319
325
|
echo " Worker max candidates: $WORKER_MAX_CANDIDATES"
|
|
320
326
|
echo " LLM for run: $LLM_RUN"
|
|
321
|
-
echo " LLM for run (
|
|
327
|
+
echo " LLM for run (escalation): $LLM_RUN_ESCALATION"
|
|
322
328
|
echo " LLM for ideate: $LLM_IDEATE"
|
|
323
|
-
echo " LLM for ideate (
|
|
329
|
+
echo " LLM for ideate (escalation): $LLM_IDEATE_ESCALATION"
|
|
324
330
|
}
|
package/lib/csv-lock.sh
CHANGED
|
File without changes
|
package/lib/editor.sh
CHANGED
|
File without changes
|
package/lib/evolution_csv.py
CHANGED
|
File without changes
|
|
File without changes
|
package/lib/evolve_ideate.py
CHANGED
|
File without changes
|
package/lib/evolve_worker.py
CHANGED
|
@@ -35,8 +35,8 @@ set_prefix("WORKER")
|
|
|
35
35
|
|
|
36
36
|
from lib.evolution_csv import EvolutionCSV
|
|
37
37
|
from lib.ai_cli import (
|
|
38
|
-
call_ai_with_backoff,
|
|
39
|
-
get_git_protection_warning, AIError
|
|
38
|
+
call_ai_with_backoff, call_ai_escalation, call_ai_model,
|
|
39
|
+
get_models_for_command, get_git_protection_warning, AIError
|
|
40
40
|
)
|
|
41
41
|
from lib.llm_bandit import LLMBandit
|
|
42
42
|
|
|
@@ -170,11 +170,12 @@ CRITICAL: If you do not know how to implement what was asked for, or if the requ
|
|
|
170
170
|
|
|
171
171
|
def _call_ai_with_backoff(self, prompt: str, target_file: Path) -> Tuple[bool, str]:
|
|
172
172
|
"""
|
|
173
|
-
Call AI with bandit-based model selection
|
|
173
|
+
Call AI with bandit-based model selection from the primary (cheap) tier.
|
|
174
174
|
|
|
175
175
|
AIDEV-NOTE: First tries model selected by UCB bandit.
|
|
176
|
-
If that fails,
|
|
177
|
-
The bandit learns which models produce better algorithm improvements.
|
|
176
|
+
If that fails, retries with round-robin across all primary models.
|
|
177
|
+
The bandit learns which cheap models produce better algorithm improvements.
|
|
178
|
+
Escalation to big models is handled separately on quality failures.
|
|
178
179
|
|
|
179
180
|
Returns:
|
|
180
181
|
Tuple of (success, model_name)
|
|
@@ -203,14 +204,14 @@ CRITICAL: If you do not know how to implement what was asked for, or if the requ
|
|
|
203
204
|
else:
|
|
204
205
|
# AIDEV-NOTE: Log output so we can diagnose why file wasn't modified
|
|
205
206
|
preview = output[-300:] if output else "(empty)"
|
|
206
|
-
log(f"Bandit model {selected_model} completed but didn't modify file ({len(output)} chars), trying
|
|
207
|
+
log(f"Bandit model {selected_model} completed but didn't modify file ({len(output)} chars), trying other primary models...")
|
|
207
208
|
log(f"AI output preview: {preview}")
|
|
208
209
|
# AIDEV-NOTE: Report no-modification as failure to bandit
|
|
209
210
|
self.bandit.update(selected_model, child_score=None, parent_score=self._parent_score)
|
|
210
211
|
log(f"Bandit update: {selected_model} no file modification")
|
|
211
212
|
|
|
212
213
|
except AIError as e:
|
|
213
|
-
log(f"Bandit model {selected_model} failed: {e}, trying
|
|
214
|
+
log(f"Bandit model {selected_model} failed: {e}, trying other primary models...")
|
|
214
215
|
# AIDEV-NOTE: Report AI-level failure to bandit so it learns to avoid broken models
|
|
215
216
|
self.bandit.update(selected_model, child_score=None, parent_score=self._parent_score)
|
|
216
217
|
log(f"Bandit update: {selected_model} AI call failed")
|
|
@@ -244,6 +245,42 @@ CRITICAL: If you do not know how to implement what was asked for, or if the requ
|
|
|
244
245
|
log_error(f"All AI retries exhausted: {e}")
|
|
245
246
|
return False, ""
|
|
246
247
|
|
|
248
|
+
def _call_ai_escalated(self, prompt: str, target_file: Path) -> Tuple[bool, str]:
|
|
249
|
+
"""
|
|
250
|
+
Call escalation-tier AI (big commercial models) to fix code quality issues.
|
|
251
|
+
|
|
252
|
+
AIDEV-NOTE: Quality-triggered escalation. Only called when cheap primary models
|
|
253
|
+
produced code with syntax or validation errors. Each escalation model gets one
|
|
254
|
+
shot — no backoff loops. If all escalation models fail, the candidate should be
|
|
255
|
+
marked failed-ai-retry (API limits) not failed-validation (bad idea).
|
|
256
|
+
|
|
257
|
+
Returns:
|
|
258
|
+
Tuple of (success, model_name)
|
|
259
|
+
"""
|
|
260
|
+
hash_before = self._file_hash(target_file) if target_file.exists() else None
|
|
261
|
+
|
|
262
|
+
try:
|
|
263
|
+
output, model = call_ai_escalation(
|
|
264
|
+
prompt,
|
|
265
|
+
command="run",
|
|
266
|
+
working_dir=self.config.evolution_dir
|
|
267
|
+
)
|
|
268
|
+
|
|
269
|
+
hash_after = self._file_hash(target_file) if target_file.exists() else None
|
|
270
|
+
|
|
271
|
+
if hash_before != hash_after and hash_after is not None:
|
|
272
|
+
log(f"Escalation AI successfully modified file (model: {model})")
|
|
273
|
+
return True, model
|
|
274
|
+
else:
|
|
275
|
+
preview = output[-300:] if output else "(empty)"
|
|
276
|
+
log(f"Escalation AI completed but did not modify file ({len(output)} chars)")
|
|
277
|
+
log(f"AI output preview: {preview}")
|
|
278
|
+
return False, model
|
|
279
|
+
|
|
280
|
+
except AIError as e:
|
|
281
|
+
log_error(f"All escalation models failed: {e}")
|
|
282
|
+
return False, ""
|
|
283
|
+
|
|
247
284
|
def _file_hash(self, path: Path) -> Optional[str]:
|
|
248
285
|
"""Get file hash."""
|
|
249
286
|
try:
|
|
@@ -550,59 +587,93 @@ python validator.py {target_basename}
|
|
|
550
587
|
with EvolutionCSV(self.config.csv_path) as csv:
|
|
551
588
|
csv.update_candidate_field(candidate.id, 'run-LLM', model)
|
|
552
589
|
|
|
590
|
+
# AIDEV-NOTE: Quality-triggered escalation system.
|
|
591
|
+
# Phase 1: Check syntax from cheap model output
|
|
592
|
+
# Phase 2: If syntax fails, escalate to big model with error context
|
|
593
|
+
# Phase 3: Validate, if fails escalate to big model with error context
|
|
594
|
+
# If escalation models also fail (API limits), mark failed-ai-retry.
|
|
595
|
+
# If escalation models produce code but it's still bad, mark failed-validation.
|
|
596
|
+
|
|
553
597
|
# Check syntax
|
|
554
598
|
if not self._check_syntax(target_file):
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
599
|
+
log("Syntax error from primary model, escalating to big model...")
|
|
600
|
+
# Get the syntax error details for context
|
|
601
|
+
syntax_result = subprocess.run(
|
|
602
|
+
[self.config.python_cmd, "-m", "py_compile", str(target_file)],
|
|
603
|
+
capture_output=True, text=True
|
|
604
|
+
)
|
|
605
|
+
syntax_error = syntax_result.stderr.strip()
|
|
560
606
|
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
valid, error_info = self._run_validator(candidate.id)
|
|
607
|
+
fix_prompt = self._build_fix_prompt(
|
|
608
|
+
candidate, target_file.name,
|
|
609
|
+
{'error_type': 'syntax', 'error': syntax_error}
|
|
610
|
+
)
|
|
611
|
+
success, fix_model = self._call_ai_escalated(fix_prompt, target_file)
|
|
567
612
|
|
|
568
|
-
if
|
|
569
|
-
|
|
570
|
-
|
|
613
|
+
if not success:
|
|
614
|
+
log_error("Escalation models failed to fix syntax error")
|
|
615
|
+
target_file.unlink(missing_ok=True)
|
|
616
|
+
with EvolutionCSV(self.config.csv_path) as csv:
|
|
617
|
+
csv.update_candidate_status(candidate.id, 'failed-ai-retry')
|
|
618
|
+
return 77
|
|
571
619
|
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
620
|
+
# Record escalation model
|
|
621
|
+
if fix_model:
|
|
622
|
+
with EvolutionCSV(self.config.csv_path) as csv:
|
|
623
|
+
current_llm = csv.get_candidate_info(candidate.id).get('run-LLM', '')
|
|
624
|
+
new_llm = f"{current_llm}+ESC:{fix_model}" if current_llm else f"ESC:{fix_model}"
|
|
625
|
+
csv.update_candidate_field(candidate.id, 'run-LLM', new_llm)
|
|
626
|
+
|
|
627
|
+
# Re-check syntax after escalation fix
|
|
628
|
+
if not self._check_syntax(target_file):
|
|
629
|
+
log_error("Escalation model also produced syntax error — idea too hard")
|
|
630
|
+
target_file.unlink(missing_ok=True)
|
|
631
|
+
with EvolutionCSV(self.config.csv_path) as csv:
|
|
632
|
+
csv.update_candidate_status(candidate.id, 'failed-validation')
|
|
633
|
+
return 1
|
|
634
|
+
|
|
635
|
+
# Run validator with escalation on failure
|
|
636
|
+
# AIDEV-NOTE: Validator catches structural errors before expensive full evaluation.
|
|
637
|
+
# First attempt uses the code as-is from primary model. On failure, escalate once.
|
|
638
|
+
valid, error_info = self._run_validator(candidate.id)
|
|
575
639
|
|
|
576
|
-
|
|
577
|
-
log(
|
|
640
|
+
if not valid:
|
|
641
|
+
log("Validation failed from primary model, escalating to big model...")
|
|
578
642
|
fix_prompt = self._build_fix_prompt(candidate, target_file.name, error_info)
|
|
579
|
-
success, fix_model = self.
|
|
643
|
+
success, fix_model = self._call_ai_escalated(fix_prompt, target_file)
|
|
580
644
|
|
|
581
645
|
if not success:
|
|
582
|
-
log_error("
|
|
583
|
-
|
|
646
|
+
log_error("Escalation models failed to fix validation error")
|
|
647
|
+
target_file.unlink(missing_ok=True)
|
|
648
|
+
with EvolutionCSV(self.config.csv_path) as csv:
|
|
649
|
+
csv.update_candidate_status(candidate.id, 'failed-ai-retry')
|
|
650
|
+
return 77
|
|
584
651
|
|
|
585
|
-
# Record
|
|
652
|
+
# Record escalation model
|
|
586
653
|
if fix_model:
|
|
587
654
|
with EvolutionCSV(self.config.csv_path) as csv:
|
|
588
655
|
current_llm = csv.get_candidate_info(candidate.id).get('run-LLM', '')
|
|
589
|
-
new_llm = f"{current_llm}+{fix_model}" if current_llm else fix_model
|
|
656
|
+
new_llm = f"{current_llm}+ESC:{fix_model}" if current_llm else f"ESC:{fix_model}"
|
|
590
657
|
csv.update_candidate_field(candidate.id, 'run-LLM', new_llm)
|
|
591
658
|
|
|
592
|
-
#
|
|
659
|
+
# Check syntax after escalation fix (escalation might break it)
|
|
593
660
|
if not self._check_syntax(target_file):
|
|
594
|
-
log_error("
|
|
595
|
-
|
|
661
|
+
log_error("Escalation fix introduced syntax error")
|
|
662
|
+
target_file.unlink(missing_ok=True)
|
|
663
|
+
with EvolutionCSV(self.config.csv_path) as csv:
|
|
664
|
+
csv.update_candidate_status(candidate.id, 'failed-validation')
|
|
665
|
+
return 1
|
|
596
666
|
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
667
|
+
# Re-validate after escalation fix
|
|
668
|
+
valid, error_info = self._run_validator(candidate.id)
|
|
669
|
+
if not valid:
|
|
670
|
+
log_error("Validation still fails after escalation — idea too hard")
|
|
671
|
+
with EvolutionCSV(self.config.csv_path) as csv:
|
|
672
|
+
csv.update_candidate_status(candidate.id, 'failed-validation')
|
|
673
|
+
if error_info:
|
|
674
|
+
error_summary = f"{error_info.get('error_type', 'unknown')}: {error_info.get('error', '')[:100]}"
|
|
675
|
+
csv.update_candidate_field(candidate.id, 'validation_error', error_summary)
|
|
676
|
+
return 1
|
|
606
677
|
|
|
607
678
|
# Run evaluator
|
|
608
679
|
log("Running evaluator...")
|
package/lib/llm_bandit.py
CHANGED
|
File without changes
|
package/lib/log.py
CHANGED
|
File without changes
|
package/lib/meta_learning.py
CHANGED
|
File without changes
|
package/lib/sandbox.sb
CHANGED
|
File without changes
|
package/lib/sandbox_wrapper.py
CHANGED
|
File without changes
|
package/package.json
CHANGED
package/templates/BRIEF.md
CHANGED
|
File without changes
|
package/templates/algorithm.py
CHANGED
|
File without changes
|
package/templates/config.yaml
CHANGED
|
@@ -86,47 +86,49 @@ parallel:
|
|
|
86
86
|
# Timeout in seconds when waiting for CSV locks
|
|
87
87
|
lock_timeout: 30
|
|
88
88
|
|
|
89
|
-
# LLM/AI CLI configuration
|
|
89
|
+
# LLM/AI CLI configuration — quality-triggered escalation system
|
|
90
|
+
# AIDEV-NOTE: Two tiers for coding:
|
|
91
|
+
# Primary (run): Cheap/open models for normal code generation
|
|
92
|
+
# Escalation (run_escalation): Big models activated ONLY on syntax/validation failure
|
|
93
|
+
# No fallback tier — if all primary models are down, fail fast so monitoring catches it.
|
|
90
94
|
llm_cli:
|
|
91
|
-
#
|
|
92
|
-
#
|
|
93
|
-
# You can repeat models for weighted selection (e.g., "gemini-pro gemini-pro sonnet" for 2:1 ratio)
|
|
95
|
+
# Models are selected randomly, repeat for weighted selection (e.g., "model model" = 2x weight)
|
|
96
|
+
# Commented out because defaults change over time; uncomment to override
|
|
94
97
|
|
|
95
|
-
#
|
|
96
|
-
#
|
|
97
|
-
|
|
98
|
-
#
|
|
99
|
-
#
|
|
100
|
-
|
|
98
|
+
# Coding: cheap/open models (flat-rate or low per-token cost)
|
|
99
|
+
#run: ollama-glm ollama-glm ollama-qwen ollama-qwen ollama-minimax ollama-minimax ollama-gemma ollama-gemma kimi-coder kimi-coder glm-zai qwen-coder minimax
|
|
100
|
+
|
|
101
|
+
# Coding escalation: big commercial models, only used when primary produces bad code
|
|
102
|
+
#run_escalation: sonnet codex-coding gemini-pro
|
|
103
|
+
|
|
104
|
+
# Ideation: strong thinking models for creative work
|
|
105
|
+
#ideate: opus-think ollama-glm ollama-glm gemini-pro ollama-qwen ollama-minimax ollama-gemma kimi-coder gpt codex-think glm-zai qwen-coder minimax qwen
|
|
106
|
+
|
|
107
|
+
# Ideation escalation: not currently used
|
|
108
|
+
#ideate_escalation:
|
|
101
109
|
|
|
102
110
|
# Available models:
|
|
103
|
-
# Claude (subscription-based, watch usage limits):
|
|
104
|
-
# - sonnet: Claude Sonnet via Claude CLI
|
|
105
|
-
# - sonnet-think: Claude Sonnet with extended thinking (ultrathink prompt)
|
|
106
|
-
# - opus: Claude Opus via Claude CLI
|
|
107
|
-
# - opus-think: Claude Opus with extended thinking (ultrathink prompt)
|
|
108
|
-
# - haiku: Claude Haiku via Claude CLI (cheap fallback)
|
|
109
111
|
#
|
|
110
|
-
#
|
|
111
|
-
#
|
|
112
|
-
# -
|
|
113
|
-
# -
|
|
114
|
-
# -
|
|
115
|
-
# -
|
|
116
|
-
#
|
|
117
|
-
# -
|
|
118
|
-
# -
|
|
119
|
-
# -
|
|
112
|
+
# --- Cheap/open (primary coding tier) ---
|
|
113
|
+
# Ollama cloud (flat-rate $20/mo subscription):
|
|
114
|
+
# - ollama-glm: GLM 5.1 via Ollama cloud
|
|
115
|
+
# - ollama-qwen: Qwen 3.6 via Ollama cloud
|
|
116
|
+
# - ollama-minimax: MiniMax M2.7 via Ollama cloud
|
|
117
|
+
# - ollama-gemma: Gemma 4 31B via Ollama cloud
|
|
118
|
+
#
|
|
119
|
+
# Low-cost APIs:
|
|
120
|
+
# - kimi-coder: Kimi for Coding via kimi CLI (subscription)
|
|
121
|
+
# - glm-zai: GLM 5 via Z.AI agentic mode
|
|
122
|
+
# - qwen-coder: Qwen 3 Coder via OpenRouter
|
|
123
|
+
# - minimax: MiniMax M2.7 via OpenRouter
|
|
124
|
+
# - qwen: Qwen 3.6 Plus via OpenRouter
|
|
120
125
|
#
|
|
121
|
-
#
|
|
122
|
-
# -
|
|
123
|
-
# -
|
|
124
|
-
# - gemini-
|
|
126
|
+
# --- Big commercial (escalation tier) ---
|
|
127
|
+
# - sonnet: Claude Sonnet via Claude CLI
|
|
128
|
+
# - codex-coding: GPT-5.4 medium reasoning via Codex CLI
|
|
129
|
+
# - gemini-pro: Gemini 3 Pro via Gemini CLI
|
|
125
130
|
#
|
|
126
|
-
#
|
|
127
|
-
# -
|
|
128
|
-
# -
|
|
129
|
-
# -
|
|
130
|
-
# - codex-oss-local: Local model via Codex + Ollama
|
|
131
|
-
# - cursor-sonnet: Claude Sonnet via Cursor Agent CLI
|
|
132
|
-
# - cursor-opus: Claude Opus via Cursor Agent CLI
|
|
131
|
+
# --- Ideation models ---
|
|
132
|
+
# - opus-think: Claude Opus with extended thinking
|
|
133
|
+
# - codex-think: GPT-5.5 high reasoning effort
|
|
134
|
+
# - gpt: GPT-5.5 via Codex CLI
|
package/templates/evaluator.py
CHANGED
|
File without changes
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|