claude-evolve 1.11.12 → 1.11.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/claude-evolve-check +115 -46
- package/lib/config.sh +5 -4
- package/package.json +1 -1
package/bin/claude-evolve-check
CHANGED
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
# claude-evolve-check - Health check for AI model configurations
|
|
3
3
|
# Tests all configured AI models to verify they're working before starting evolution runs
|
|
4
4
|
#
|
|
5
|
-
# AIDEV-NOTE: Claude CLI
|
|
6
|
-
# We
|
|
5
|
+
# AIDEV-NOTE: Claude CLI has issues with timeout command and subshells.
|
|
6
|
+
# We test each model by running it directly and checking the result file.
|
|
7
7
|
|
|
8
8
|
set -e
|
|
9
9
|
|
|
@@ -22,15 +22,16 @@ LIB_DIR="$(dirname "$SCRIPT_DIR")/lib"
|
|
|
22
22
|
source "$LIB_DIR/config.sh"
|
|
23
23
|
load_config
|
|
24
24
|
|
|
25
|
-
# Test timeout in seconds
|
|
25
|
+
# Test timeout in seconds
|
|
26
26
|
TEST_TIMEOUT=30
|
|
27
27
|
|
|
28
28
|
# Simple test prompt
|
|
29
29
|
TEST_PROMPT="Say hello in exactly 3 words."
|
|
30
30
|
|
|
31
|
-
# Temp
|
|
31
|
+
# Temp files
TEMP_OUTPUT=$(mktemp)
TEMP_PID=$(mktemp)
# Single-quoted on purpose: the paths are expanded (and quoted) when the trap
# fires, so a temp path containing spaces or glob characters is removed intact
# instead of being word-split at definition time.
trap 'rm -f -- "$TEMP_OUTPUT" "$TEMP_PID"' EXIT
|
|
34
35
|
|
|
35
36
|
echo -e "${CYAN}🔍 Claude Evolve - AI Model Health Check${NC}"
|
|
36
37
|
echo "============================================"
|
|
@@ -39,12 +40,11 @@ echo
|
|
|
39
40
|
# Collect all unique models into a simple list

# Print the given model names on one line, space-separated, dropping repeats
# while preserving first-seen order.
# AIDEV-NOTE: names are compared as whole, literal list entries. `grep -w` is
# not suitable here: '-' and '.' are not word characters, so "sonnet" would
# match inside "sonnet-think" (and "glm-5" inside "glm-5-zai") and be dropped,
# and '.' in a name such as "kimi-k2.5" would act as a regex wildcard.
# Arguments: model names, one per argument
# Outputs:   the de-duplicated list on stdout (an empty line if no arguments)
collect_unique_models() {
  local seen=" "
  local name
  for name in "$@"; do
    case "$seen" in
      *" $name "*) ;;                # already present as a whole entry
      *) seen="$seen$name " ;;
    esac
  done
  # Strip the single padding space at each end.
  seen="${seen# }"
  seen="${seen% }"
  printf '%s\n' "$seen"
}

# Intentionally unquoted: each variable holds a space-separated list of models.
# shellcheck disable=SC2086
ALL_MODELS=$(collect_unique_models $LLM_RUN $LLM_RUN_FALLBACK $LLM_IDEATE $LLM_IDEATE_FALLBACK)
|
|
48
48
|
|
|
49
49
|
echo "Models to test: $ALL_MODELS"
|
|
50
50
|
echo
|
|
@@ -55,78 +55,147 @@ FAILED=0
|
|
|
55
55
|
FAILED_MODELS=""
|
|
56
56
|
TIMEOUT_MODELS=""
|
|
57
57
|
|
|
58
|
-
# Test
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
58
|
+
# Test a single model by writing a tiny test script and running it.
#
# AIDEV-NOTE: each generated script `exec`s the CLI, so the background PID that
# is polled and killed below is the CLI process itself, not a wrapper shell.
# The function is written to behave the same whether or not the caller has
# errexit (`set -e`) enabled.
#
# Arguments:
#   $1 - model alias (one of the names handled by the case statement)
#   $2 - prompt text, passed to the generated script as its first argument
#   $3 - file that receives the combined stdout/stderr of the run
#   $4 - maximum time to wait, counted in one-second polls
# Returns:
#   the CLI's exit status; 124 if it was killed after $4 polls;
#   1 for an unknown model or if the temp script could not be created
test_model() {
  local model="$1"
  local prompt="$2"
  local outfile="$3"
  local max_wait="$4"

  # Write a self-contained test script.
  # Declared separately from the assignment so a mktemp failure is not masked
  # by the exit status of `local`.
  local test_script
  test_script=$(mktemp) || {
    echo "Failed to create temp script for model: $model" > "$outfile"
    return 1
  }

  case "$model" in
    opus|sonnet|haiku)
      # Unquoted heredoc: $model is expanded now, \$1 is left for the script.
      cat > "$test_script" << SCRIPT
#!/usr/bin/env bash
exec claude --dangerously-skip-permissions --mcp-config '' --model $model -p "\$1"
SCRIPT
      ;;
    opus-think)
      cat > "$test_script" << 'SCRIPT'
#!/usr/bin/env bash
exec claude --dangerously-skip-permissions --mcp-config '' --model opus -p "ultrathink

$1"
SCRIPT
      ;;
    sonnet-think)
      cat > "$test_script" << 'SCRIPT'
#!/usr/bin/env bash
exec claude --dangerously-skip-permissions --mcp-config '' --model sonnet -p "ultrathink

$1"
SCRIPT
      ;;
    gemini-pro)
      cat > "$test_script" << 'SCRIPT'
#!/usr/bin/env bash
exec gemini -y -m gemini-3-pro-preview -p "$1"
SCRIPT
      ;;
    gemini-flash|gemini-3-flash)
      cat > "$test_script" << 'SCRIPT'
#!/usr/bin/env bash
exec gemini -y -m gemini-2.5-flash -p "$1"
SCRIPT
      ;;
    kimi-coder)
      cat > "$test_script" << 'SCRIPT'
#!/usr/bin/env bash
exec kimi --print -y -m kimi-for-coding -c "$1"
SCRIPT
      ;;
    kimi-k2.5)
      cat > "$test_script" << 'SCRIPT'
#!/usr/bin/env bash
exec opencode -m openrouter/moonshotai/kimi-k2.5 run "$1"
SCRIPT
      ;;
    glm-5)
      cat > "$test_script" << 'SCRIPT'
#!/usr/bin/env bash
exec opencode -m openrouter/z-ai/glm-5 run "$1"
SCRIPT
      ;;
    glm-5-zai)
      cat > "$test_script" << 'SCRIPT'
#!/usr/bin/env bash
exec opencode -m zai-coding-plan/glm-5 run "$1"
SCRIPT
      ;;
    glm-zai)
      cat > "$test_script" << 'SCRIPT'
#!/usr/bin/env bash
exec opencode -m zai-coding-plan/glm-4.7 run "$1"
SCRIPT
      ;;
    codex-oss-local)
      cat > "$test_script" << 'SCRIPT'
#!/usr/bin/env bash
exec codex exec --dangerously-bypass-approvals-and-sandbox --skip-git-repo-check --oss --local-provider=ollama "$1"
SCRIPT
      ;;
    gpt5|gpt5high)
      cat > "$test_script" << 'SCRIPT'
#!/usr/bin/env bash
exec codex exec --dangerously-bypass-approvals-and-sandbox "$1"
SCRIPT
      ;;
    qwen)
      cat > "$test_script" << 'SCRIPT'
#!/usr/bin/env bash
exec opencode -m openrouter/qwen/qwen3.5-plus-02-15 run "$1"
SCRIPT
      ;;
    *)
      echo "Unknown model: $model" > "$outfile"
      rm -f "$test_script"
      return 1
      ;;
  esac
  chmod +x "$test_script"

  # Run the test script in background, capture output to file
  "$test_script" "$prompt" > "$outfile" 2>&1 &
  local pid=$!

  # Poll for completion
  local elapsed=0
  local rc=0
  while kill -0 "$pid" 2>/dev/null; do
    if [[ $elapsed -ge $max_wait ]]; then
      # Ask politely first, then force; the process may already be gone, so
      # failures of kill/wait are expected and deliberately ignored.
      kill "$pid" 2>/dev/null || true
      sleep 1
      kill -9 "$pid" 2>/dev/null || true
      wait "$pid" 2>/dev/null || true
      rm -f "$test_script"
      return 124
    fi
    sleep 1
    # Not ((elapsed++)): that returns status 1 when elapsed is 0, which would
    # abort the script under errexit.
    elapsed=$((elapsed + 1))
  done

  # Capture the child's status without letting a non-zero value trip errexit.
  wait "$pid" || rc=$?
  rm -f "$test_script"
  return $rc
}
|
|
182
|
+
|
|
183
|
+
# Test each model
|
|
184
|
+
for model in $ALL_MODELS; do
|
|
185
|
+
echo -n "Testing $model... "
|
|
186
|
+
|
|
187
|
+
START_TIME=$(date +%s)
|
|
188
|
+
|
|
189
|
+
set +e
|
|
190
|
+
test_model "$model" "$TEST_PROMPT" "$TEMP_OUTPUT" "$TEST_TIMEOUT"
|
|
191
|
+
EXIT_CODE=$?
|
|
123
192
|
set -e
|
|
124
193
|
|
|
125
194
|
END_TIME=$(date +%s)
|
|
126
195
|
DURATION=$((END_TIME - START_TIME))
|
|
127
196
|
|
|
128
|
-
# Read output
|
|
129
|
-
OUTPUT=$(cat "$TEMP_OUTPUT")
|
|
197
|
+
# Read output
|
|
198
|
+
OUTPUT=$(cat "$TEMP_OUTPUT" 2>/dev/null || true)
|
|
130
199
|
OUTPUT_LEN=${#OUTPUT}
|
|
131
200
|
|
|
132
201
|
# Check result
|
|
@@ -134,7 +203,7 @@ $TEST_PROMPT" > "$TEMP_OUTPUT" 2>&1
|
|
|
134
203
|
echo -e "${GREEN}✓ OK${NC} (${DURATION}s, ${OUTPUT_LEN} chars)"
|
|
135
204
|
((PASSED++))
|
|
136
205
|
elif [[ $EXIT_CODE -eq 124 ]]; then
|
|
137
|
-
echo -e "${RED}✗ TIMEOUT${NC} (${TEST_TIMEOUT}s
|
|
206
|
+
echo -e "${RED}✗ TIMEOUT${NC} (${TEST_TIMEOUT}s)"
|
|
138
207
|
TIMEOUT_MODELS="$TIMEOUT_MODELS $model"
|
|
139
208
|
((FAILED++))
|
|
140
209
|
else
|
package/lib/config.sh
CHANGED
|
@@ -61,18 +61,19 @@ DEFAULT_WORKER_MAX_CANDIDATES=3
|
|
|
61
61
|
# Primary: Strong models used in normal operation
|
|
62
62
|
# Fallback: Cheap/backup models used only when primary tier exhausted
|
|
63
63
|
#
|
|
64
|
-
# Run:
|
|
64
|
+
# Run: Primary models for code generation
|
|
65
|
+
# Sonnet 4.6: 79.6% SWE-bench, $3/$15 per M tokens
|
|
65
66
|
# GLM-5: 744B MoE, $0.80/M tokens, 77.8% SWE-bench
|
|
66
67
|
# Qwen: hybrid linear attention + sparse MoE, strong reasoning
|
|
67
|
-
DEFAULT_LLM_RUN="glm-5 glm-5 qwen kimi-k2.5
|
|
68
|
-
DEFAULT_LLM_RUN_FALLBACK="gemini-3-flash codex-oss-local
|
|
68
|
+
DEFAULT_LLM_RUN="sonnet glm-5 glm-5 qwen kimi-k2.5"
|
|
69
|
+
DEFAULT_LLM_RUN_FALLBACK="gemini-3-flash codex-oss-local"
|
|
69
70
|
#
|
|
70
71
|
# Ideate: Only agentic models that can edit files reliably
|
|
71
72
|
# AIDEV-NOTE: Ideation REQUIRES file editing - non-agentic models (opencode run, codex) return text
|
|
72
73
|
# but don't actually edit files. Only use claude/gemini CLI, cursor-agent, or zai-coding-plan models.
|
|
73
74
|
# OpenRouter models (via opencode) are chat-only and CANNOT edit files for ideation.
|
|
74
75
|
DEFAULT_LLM_IDEATE="opus-think sonnet-think glm-5-zai gemini-pro kimi-coder"
|
|
75
|
-
DEFAULT_LLM_IDEATE_FALLBACK="sonnet glm-zai
|
|
76
|
+
DEFAULT_LLM_IDEATE_FALLBACK="sonnet glm-5-zai"
|
|
76
77
|
|
|
77
78
|
# Load configuration from a YAML file and update variables
|
|
78
79
|
_load_yaml_config() {
|