claude-evolve 1.8.12 → 1.8.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/claude-evolve-ideate +1 -1
- package/bin/claude-evolve-run +87 -68
- package/bin/claude-evolve-worker +28 -6
- package/lib/memory_limit_wrapper.py +152 -64
- package/package.json +1 -1
package/bin/claude-evolve-ideate
CHANGED
|
@@ -1641,7 +1641,7 @@ echo "[INFO] Starting ideation for generation $CURRENT_GENERATION"
|
|
|
1641
1641
|
# Main execution with retry logic and exponential backoff
|
|
1642
1642
|
retry_count=0
|
|
1643
1643
|
wait_seconds=300 # Start with 5 minutes
|
|
1644
|
-
max_wait_seconds=
|
|
1644
|
+
max_wait_seconds=300 # Cap at 5 minutes
|
|
1645
1645
|
|
|
1646
1646
|
while true; do
|
|
1647
1647
|
if [[ $use_strategies == true ]]; then
|
package/bin/claude-evolve-run
CHANGED
|
@@ -275,29 +275,34 @@ count_pending_candidates() {
|
|
|
275
275
|
# Function to get CSV stats
|
|
276
276
|
get_csv_stats() {
|
|
277
277
|
local csv_path="${1:-$FULL_CSV_PATH}"
|
|
278
|
-
|
|
278
|
+
|
|
279
279
|
if [[ ! -f "$csv_path" ]]; then
|
|
280
280
|
echo "[ERROR] CSV not found at: $csv_path" >&2
|
|
281
281
|
echo "0 0 0"
|
|
282
282
|
return
|
|
283
283
|
fi
|
|
284
|
-
|
|
284
|
+
|
|
285
285
|
local total_rows complete_count pending_count
|
|
286
286
|
total_rows=$(wc -l < "$csv_path" | tr -d '[:space:]')
|
|
287
287
|
complete_count=$(grep ',complete' "$csv_path" 2>/dev/null | wc -l | tr -d '[:space:]')
|
|
288
|
-
|
|
288
|
+
|
|
289
289
|
# Count pending using UNIFIED CSV logic
|
|
290
290
|
pending_count=$("$PYTHON_CMD" "$SCRIPT_DIR/../lib/evolution_csv.py" "$csv_path" count)
|
|
291
|
-
|
|
291
|
+
|
|
292
292
|
echo "$total_rows $complete_count $pending_count"
|
|
293
293
|
}
|
|
294
294
|
|
|
295
|
-
|
|
296
|
-
|
|
295
|
+
# Function to perform full CSV cleanup (duplicates, stuck statuses, missing files, etc.)
|
|
296
|
+
cleanup_csv_full() {
|
|
297
|
+
if [[ ! -f "$FULL_CSV_PATH" ]]; then
|
|
298
|
+
echo "[DISPATCHER] No CSV file to clean up" >&2
|
|
299
|
+
return 0
|
|
300
|
+
fi
|
|
301
|
+
|
|
302
|
+
echo "[DISPATCHER] Performing full CSV cleanup..." >&2
|
|
297
303
|
|
|
298
|
-
#
|
|
299
|
-
|
|
300
|
-
echo "[DISPATCHER] Checking for duplicate candidates..."
|
|
304
|
+
# Remove duplicate candidates
|
|
305
|
+
echo "[DISPATCHER] Checking for duplicate candidates..." >&2
|
|
301
306
|
"$PYTHON_CMD" -c "
|
|
302
307
|
import sys
|
|
303
308
|
sys.path.insert(0, '$SCRIPT_DIR/..')
|
|
@@ -312,53 +317,49 @@ except Exception as e:
|
|
|
312
317
|
print(f'[ERROR] Failed to remove duplicates: {e}', file=sys.stderr)
|
|
313
318
|
" 2>&1 || true
|
|
314
319
|
|
|
315
|
-
|
|
320
|
+
# Reset stuck 'running' candidates
|
|
321
|
+
echo "[DISPATCHER] Resetting any stuck 'running' candidates to 'pending'..." >&2
|
|
316
322
|
if "$SCRIPT_DIR/claude-evolve-edit" running pending >/dev/null 2>&1; then
|
|
317
|
-
echo "[DISPATCHER] Successfully reset stuck candidates"
|
|
323
|
+
echo "[DISPATCHER] Successfully reset stuck candidates" >&2
|
|
318
324
|
else
|
|
319
|
-
echo "[DISPATCHER] No stuck candidates found or edit command not available"
|
|
325
|
+
echo "[DISPATCHER] No stuck candidates found or edit command not available" >&2
|
|
320
326
|
fi
|
|
321
|
-
fi
|
|
322
327
|
|
|
323
|
-
#
|
|
324
|
-
|
|
325
|
-
echo "[DISPATCHER] Validating CSV and cleaning up..."
|
|
326
|
-
|
|
327
|
-
# First check for and clean up duplicates
|
|
328
|
-
echo "[DISPATCHER] Checking for duplicate entries..."
|
|
328
|
+
# Clean up duplicates with the dedicated cleanup script
|
|
329
|
+
echo "[DISPATCHER] Checking for duplicate entries..." >&2
|
|
329
330
|
duplicate_check_output=$("$PYTHON_CMD" "$SCRIPT_DIR/claude-evolve-cleanup-duplicates" "$FULL_CSV_PATH" 2>&1)
|
|
330
331
|
if echo "$duplicate_check_output" | grep -q "Found.*duplicate"; then
|
|
331
|
-
echo "[DISPATCHER] WARNING: Duplicate entries detected in CSV!"
|
|
332
|
-
echo "$duplicate_check_output"
|
|
333
|
-
echo "[DISPATCHER] Automatically cleaning up duplicates..."
|
|
332
|
+
echo "[DISPATCHER] WARNING: Duplicate entries detected in CSV!" >&2
|
|
333
|
+
echo "$duplicate_check_output" >&2
|
|
334
|
+
echo "[DISPATCHER] Automatically cleaning up duplicates..." >&2
|
|
334
335
|
if "$PYTHON_CMD" "$SCRIPT_DIR/claude-evolve-cleanup-duplicates" "$FULL_CSV_PATH" --fix; then
|
|
335
|
-
echo "[DISPATCHER] Duplicates cleaned up successfully"
|
|
336
|
+
echo "[DISPATCHER] Duplicates cleaned up successfully" >&2
|
|
336
337
|
else
|
|
337
338
|
echo "[ERROR] Failed to clean up duplicates" >&2
|
|
338
|
-
|
|
339
|
+
return 1
|
|
339
340
|
fi
|
|
340
341
|
else
|
|
341
|
-
echo "[DISPATCHER] No duplicates found"
|
|
342
|
+
echo "[DISPATCHER] No duplicates found" >&2
|
|
342
343
|
fi
|
|
343
|
-
|
|
344
|
-
#
|
|
345
|
-
echo "[DISPATCHER] Checking for invalid entries..."
|
|
344
|
+
|
|
345
|
+
# Clean up invalid entries
|
|
346
|
+
echo "[DISPATCHER] Checking for invalid entries..." >&2
|
|
346
347
|
invalid_check_output=$("$PYTHON_CMD" "$SCRIPT_DIR/claude-evolve-clean-invalid" "$FULL_CSV_PATH" --dry-run 2>&1)
|
|
347
348
|
if echo "$invalid_check_output" | grep -q "Found.*invalid"; then
|
|
348
|
-
echo "[DISPATCHER] WARNING: Invalid entries detected in CSV!"
|
|
349
|
-
echo "$invalid_check_output"
|
|
350
|
-
echo "[DISPATCHER] Automatically cleaning up invalid entries..."
|
|
349
|
+
echo "[DISPATCHER] WARNING: Invalid entries detected in CSV!" >&2
|
|
350
|
+
echo "$invalid_check_output" >&2
|
|
351
|
+
echo "[DISPATCHER] Automatically cleaning up invalid entries..." >&2
|
|
351
352
|
if "$PYTHON_CMD" "$SCRIPT_DIR/claude-evolve-clean-invalid" "$FULL_CSV_PATH"; then
|
|
352
|
-
echo "[DISPATCHER] Invalid entries cleaned up successfully"
|
|
353
|
+
echo "[DISPATCHER] Invalid entries cleaned up successfully" >&2
|
|
353
354
|
else
|
|
354
355
|
echo "[ERROR] Failed to clean up invalid entries" >&2
|
|
355
|
-
|
|
356
|
+
return 1
|
|
356
357
|
fi
|
|
357
358
|
else
|
|
358
|
-
echo "[DISPATCHER] No invalid entries found"
|
|
359
|
+
echo "[DISPATCHER] No invalid entries found" >&2
|
|
359
360
|
fi
|
|
360
|
-
|
|
361
|
-
#
|
|
361
|
+
|
|
362
|
+
# Clean stuck statuses and missing files
|
|
362
363
|
if ! "$PYTHON_CMD" -c "
|
|
363
364
|
import csv
|
|
364
365
|
import sys
|
|
@@ -366,90 +367,88 @@ import os
|
|
|
366
367
|
from pathlib import Path
|
|
367
368
|
|
|
368
369
|
csv_file = '$FULL_CSV_PATH'
|
|
369
|
-
full_output_dir = '$FULL_OUTPUT_DIR'
|
|
370
|
-
script_dir = '$SCRIPT_DIR'
|
|
370
|
+
full_output_dir = '$FULL_OUTPUT_DIR'
|
|
371
|
+
script_dir = '$SCRIPT_DIR'
|
|
371
372
|
|
|
372
373
|
try:
|
|
373
|
-
# Read CSV - let Python's csv module handle all the complexity
|
|
374
374
|
with open(csv_file, 'r') as f:
|
|
375
375
|
reader = csv.reader(f)
|
|
376
376
|
rows = list(reader)
|
|
377
|
-
|
|
377
|
+
|
|
378
378
|
if not rows:
|
|
379
379
|
print('[ERROR] CSV is empty')
|
|
380
380
|
sys.exit(1)
|
|
381
|
-
|
|
382
|
-
# Basic sanity checks
|
|
383
|
-
header = rows[0]
|
|
384
|
-
num_fields = len(header)
|
|
385
|
-
|
|
381
|
+
|
|
386
382
|
if len(rows) == 1:
|
|
387
383
|
print('[INFO] CSV has no data rows (only header)')
|
|
388
|
-
|
|
384
|
+
|
|
389
385
|
changed_count = 0
|
|
390
|
-
|
|
386
|
+
|
|
391
387
|
# Clean up any stuck 'running' statuses
|
|
392
388
|
for i in range(1, len(rows)):
|
|
393
389
|
if len(rows[i]) > 4 and rows[i][4] == 'running':
|
|
394
390
|
rows[i][4] = ''
|
|
395
391
|
changed_count += 1
|
|
396
392
|
|
|
397
|
-
# Reset failed-parent-missing to pending
|
|
393
|
+
# Reset failed-parent-missing to pending - give them another chance
|
|
398
394
|
for i in range(1, len(rows)):
|
|
399
395
|
if len(rows[i]) > 4 and rows[i][4] == 'failed-parent-missing':
|
|
400
396
|
rows[i][4] = 'pending'
|
|
401
397
|
changed_count += 1
|
|
402
|
-
|
|
398
|
+
|
|
403
399
|
# Check for missing Python files for completed/failed candidates
|
|
404
400
|
for i in range(1, len(rows)):
|
|
405
401
|
if len(rows[i]) > 4:
|
|
406
402
|
candidate_id = rows[i][0]
|
|
407
403
|
status = rows[i][4]
|
|
408
|
-
|
|
409
|
-
# Only check if status implies a file should exist
|
|
404
|
+
|
|
410
405
|
if status in ['complete', 'failed', 'failed-ai-retry', 'failed-retry1', 'failed-retry2', 'failed-retry3']:
|
|
411
406
|
expected_file = Path(full_output_dir) / f'evolution_{candidate_id}.py'
|
|
412
407
|
if not expected_file.is_file():
|
|
413
408
|
print(f'[INFO] Detected missing file for {candidate_id} (status: {status}). Resetting to pending.')
|
|
414
|
-
rows[i][4] = 'pending'
|
|
415
|
-
|
|
416
|
-
if len(rows[i]) >
|
|
417
|
-
if len(rows[i]) > 5: rows[i][5] = '' # LLM used for run
|
|
409
|
+
rows[i][4] = 'pending'
|
|
410
|
+
if len(rows[i]) > 3: rows[i][3] = ''
|
|
411
|
+
if len(rows[i]) > 5: rows[i][5] = ''
|
|
418
412
|
changed_count += 1
|
|
419
|
-
|
|
413
|
+
|
|
420
414
|
if changed_count > 0:
|
|
421
|
-
# Write back
|
|
422
415
|
with open(csv_file + '.tmp', 'w', newline='') as f:
|
|
423
416
|
writer = csv.writer(f)
|
|
424
417
|
writer.writerows(rows)
|
|
425
418
|
os.rename(csv_file + '.tmp', csv_file)
|
|
426
419
|
print(f'[INFO] Reset {changed_count} candidates (stuck running or missing files) to pending')
|
|
427
|
-
|
|
428
|
-
# Count pending candidates using UNIFIED logic
|
|
420
|
+
|
|
429
421
|
sys.path.append(script_dir + '/..')
|
|
430
422
|
from lib.evolution_csv import EvolutionCSV
|
|
431
|
-
|
|
423
|
+
|
|
432
424
|
with EvolutionCSV(csv_file) as csv_ops:
|
|
433
|
-
# Auto-fix any corrupted status fields before counting
|
|
434
425
|
fixed = csv_ops.cleanup_corrupted_status_fields()
|
|
435
426
|
if fixed > 0:
|
|
436
427
|
print(f'[INFO] Auto-fixed {fixed} corrupted status field(s)', file=sys.stderr)
|
|
437
428
|
pending = csv_ops.count_pending_candidates()
|
|
438
429
|
|
|
439
430
|
print(f'[INFO] CSV loaded: {len(rows)-1} total candidates, {pending} pending')
|
|
440
|
-
|
|
431
|
+
|
|
441
432
|
except csv.Error as e:
|
|
442
433
|
print(f'[ERROR] CSV parsing error: {e}')
|
|
443
|
-
print('[ERROR] The CSV file appears to be malformed')
|
|
444
434
|
sys.exit(1)
|
|
445
435
|
except Exception as e:
|
|
446
436
|
print(f'[ERROR] Failed to read CSV: {e}')
|
|
447
437
|
sys.exit(1)
|
|
448
438
|
" 2>&1; then
|
|
449
|
-
echo "[ERROR] CSV validation failed
|
|
450
|
-
|
|
439
|
+
echo "[ERROR] CSV validation failed during cleanup" >&2
|
|
440
|
+
return 1
|
|
451
441
|
fi
|
|
452
|
-
|
|
442
|
+
|
|
443
|
+
echo "[DISPATCHER] Full CSV cleanup complete" >&2
|
|
444
|
+
return 0
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
echo "[DISPATCHER] Starting unified evolution engine"
|
|
448
|
+
echo "[DISPATCHER] Configuration: max_workers=$MAX_WORKERS, timeout=${timeout_seconds:-none}"
|
|
449
|
+
|
|
450
|
+
# Perform full CSV cleanup at startup
|
|
451
|
+
cleanup_csv_full || exit 1
|
|
453
452
|
|
|
454
453
|
# Automatic cleanup detection - check for unchanged algorithms and warn user
|
|
455
454
|
echo "[DISPATCHER] Checking for duplicate/unchanged algorithms..."
|
|
@@ -585,8 +584,28 @@ while true; do
|
|
|
585
584
|
|
|
586
585
|
# Check if API limit was reached
|
|
587
586
|
if [[ "$api_limit_reached" == "true" ]]; then
|
|
588
|
-
echo "[DISPATCHER]
|
|
589
|
-
|
|
587
|
+
echo "[DISPATCHER] All AI models hit usage limits" >&2
|
|
588
|
+
echo "[DISPATCHER] Waiting 5 minutes before restarting the run process..." >&2
|
|
589
|
+
|
|
590
|
+
# Wait 5 minutes with countdown
|
|
591
|
+
remaining=300
|
|
592
|
+
while [[ $remaining -gt 0 ]]; do
|
|
593
|
+
if [[ $((remaining % 60)) -eq 0 ]]; then
|
|
594
|
+
echo "[DISPATCHER] Restarting in $((remaining / 60)) minutes..." >&2
|
|
595
|
+
fi
|
|
596
|
+
sleep 60
|
|
597
|
+
remaining=$((remaining - 60))
|
|
598
|
+
done
|
|
599
|
+
|
|
600
|
+
echo "[DISPATCHER] Restarting run process to clear stuck states and retry..." >&2
|
|
601
|
+
# Perform full CSV cleanup to reset stuck states, just like at startup
|
|
602
|
+
cleanup_csv_full || {
|
|
603
|
+
echo "[ERROR] CSV cleanup failed after API limit wait" >&2
|
|
604
|
+
# Continue anyway - better to try than to stop completely
|
|
605
|
+
}
|
|
606
|
+
# Clear the flag and continue - this restarts the main loop
|
|
607
|
+
api_limit_reached=false
|
|
608
|
+
continue
|
|
590
609
|
fi
|
|
591
610
|
|
|
592
611
|
# Periodic cleanup of stuck candidates (every 5 iterations, ~25 seconds)
|
package/bin/claude-evolve-worker
CHANGED
|
@@ -104,7 +104,7 @@ call_ai_for_evolution() {
|
|
|
104
104
|
local retry_count=0
|
|
105
105
|
local max_retries=3
|
|
106
106
|
local wait_seconds=300 # Start with 5 minutes
|
|
107
|
-
local max_wait_seconds=
|
|
107
|
+
local max_wait_seconds=300 # Cap at 5 minutes
|
|
108
108
|
|
|
109
109
|
while true; do
|
|
110
110
|
# Capture file state before AI call
|
|
@@ -407,19 +407,41 @@ with EvolutionCSV('$FULL_CSV_PATH') as csv:
|
|
|
407
407
|
eval_arg=""
|
|
408
408
|
fi
|
|
409
409
|
local eval_cmd=("$PYTHON_CMD" "$FULL_EVALUATOR_PATH" "$eval_arg")
|
|
410
|
-
|
|
410
|
+
|
|
411
411
|
# Add memory limiting if configured
|
|
412
|
+
# CRITICAL: Use multiple layers of protection (ulimit + Python wrapper + monitoring)
|
|
413
|
+
local memory_protection=""
|
|
412
414
|
if [[ -n "$MEMORY_LIMIT_MB" ]] && [[ "$MEMORY_LIMIT_MB" -gt 0 ]]; then
|
|
415
|
+
# Layer 1: ulimit for hard memory limit (kernel-enforced)
|
|
416
|
+
# IMPORTANT: Use -m (RSS) not -v (virtual memory) because:
|
|
417
|
+
# - Neural networks use mmap() which bypasses RLIMIT_AS (-v)
|
|
418
|
+
# - RSS limit is more reliable for actual memory consumption
|
|
419
|
+
# Convert MB to KB for ulimit
|
|
420
|
+
local memory_limit_kb=$((MEMORY_LIMIT_MB * 1024))
|
|
421
|
+
|
|
422
|
+
# Try -m first (RSS limit), fall back to -v if not supported
|
|
423
|
+
if ulimit -m $memory_limit_kb 2>/dev/null; then
|
|
424
|
+
memory_protection="ulimit -m $memory_limit_kb 2>/dev/null; "
|
|
425
|
+
echo "[MEMORY] Layer 1: ulimit -m ${memory_limit_kb}KB (RSS limit - catches neural networks)" >&2
|
|
426
|
+
else
|
|
427
|
+
memory_protection="ulimit -v $memory_limit_kb 2>/dev/null; "
|
|
428
|
+
echo "[MEMORY] Layer 1: ulimit -v ${memory_limit_kb}KB (fallback - may not catch neural networks)" >&2
|
|
429
|
+
fi
|
|
430
|
+
|
|
431
|
+
# Layer 2: Python wrapper with PROCESS TREE monitoring (backup protection)
|
|
413
432
|
eval_cmd=("$PYTHON_CMD" "$SCRIPT_DIR/../lib/memory_limit_wrapper.py" "$MEMORY_LIMIT_MB" "${eval_cmd[@]}")
|
|
433
|
+
|
|
434
|
+
echo "[MEMORY] Layer 2: Python process tree monitoring (kills entire subprocess tree)" >&2
|
|
414
435
|
fi
|
|
415
|
-
|
|
436
|
+
|
|
416
437
|
# Add timeout if configured
|
|
417
438
|
[[ -n "$timeout_seconds" ]] && eval_cmd=(timeout "$timeout_seconds" "${eval_cmd[@]}")
|
|
418
|
-
|
|
419
|
-
# Run evaluation with tee to both display and capture output
|
|
439
|
+
|
|
440
|
+
# Run evaluation with memory protection, tee to both display and capture output
|
|
420
441
|
# Use stdbuf to disable buffering for real-time output
|
|
421
442
|
# IMPORTANT: Use PIPESTATUS to get the exit code of the evaluation command, not tee
|
|
422
|
-
|
|
443
|
+
# The subshell ensures ulimit is applied before the command runs
|
|
444
|
+
stdbuf -o0 -e0 bash -c "${memory_protection}$(printf '%q ' "${eval_cmd[@]}")" 2>&1 | tee "$eval_output_file" >&2
|
|
423
445
|
local eval_exit_code=${PIPESTATUS[0]} # Get exit code of first command in pipe
|
|
424
446
|
|
|
425
447
|
if [[ $eval_exit_code -eq 0 ]]; then
|
|
@@ -4,6 +4,16 @@ Memory-limited execution wrapper for claude-evolve evaluations.
|
|
|
4
4
|
|
|
5
5
|
This script runs a command with memory limits to prevent runaway algorithms
|
|
6
6
|
from consuming all system memory and crashing the machine.
|
|
7
|
+
|
|
8
|
+
CRITICAL: Multi-layer protection approach (both must work together):
|
|
9
|
+
1. ulimit -m (RSS limit) set by calling shell script - kernel-enforced, catches neural networks
|
|
10
|
+
2. This Python wrapper monitors ENTIRE PROCESS TREE every 0.1s and kills if limit exceeded
|
|
11
|
+
|
|
12
|
+
AIDEV-NOTE: Previous bugs fixed:
|
|
13
|
+
- ulimit -v (virtual memory) doesn't catch neural networks that use mmap()
|
|
14
|
+
- Was only monitoring direct child, not entire process tree (missed grandchildren)
|
|
15
|
+
- Monitoring interval was 0.5s - too slow for fast memory allocations
|
|
16
|
+
- Resource limit failures were silently ignored instead of failing fast
|
|
7
17
|
"""
|
|
8
18
|
import sys
|
|
9
19
|
import os
|
|
@@ -11,119 +21,197 @@ import subprocess
|
|
|
11
21
|
import signal
|
|
12
22
|
import time
|
|
13
23
|
import resource
|
|
14
|
-
from typing import Optional
|
|
24
|
+
from typing import Optional, Tuple
|
|
15
25
|
|
|
16
|
-
def
|
|
17
|
-
"""
|
|
26
|
+
def verify_memory_limit_set(limit_mb: int) -> Tuple[bool, str]:
|
|
27
|
+
"""Verify that memory limits are actually enforced."""
|
|
28
|
+
try:
|
|
29
|
+
limit_bytes = limit_mb * 1024 * 1024
|
|
30
|
+
|
|
31
|
+
# Check RLIMIT_AS (virtual memory)
|
|
32
|
+
soft_as, hard_as = resource.getrlimit(resource.RLIMIT_AS)
|
|
33
|
+
if soft_as != resource.RLIM_INFINITY and soft_as <= limit_bytes * 1.1:
|
|
34
|
+
return True, f"RLIMIT_AS set to {soft_as / (1024*1024):.0f}MB"
|
|
35
|
+
|
|
36
|
+
# Check RLIMIT_DATA (data segment)
|
|
37
|
+
try:
|
|
38
|
+
soft_data, hard_data = resource.getrlimit(resource.RLIMIT_DATA)
|
|
39
|
+
if soft_data != resource.RLIM_INFINITY and soft_data <= limit_bytes * 1.1:
|
|
40
|
+
return True, f"RLIMIT_DATA set to {soft_data / (1024*1024):.0f}MB"
|
|
41
|
+
except (OSError, ValueError):
|
|
42
|
+
pass
|
|
43
|
+
|
|
44
|
+
return False, "No hard memory limits detected"
|
|
45
|
+
except Exception as e:
|
|
46
|
+
return False, f"Error checking limits: {e}"
|
|
47
|
+
|
|
48
|
+
def set_memory_limit(limit_mb: int) -> bool:
|
|
49
|
+
"""
|
|
50
|
+
Set memory limit in MB using resource module.
|
|
51
|
+
Returns True if successful, False otherwise.
|
|
52
|
+
"""
|
|
18
53
|
try:
|
|
19
54
|
# Convert MB to bytes
|
|
20
55
|
limit_bytes = limit_mb * 1024 * 1024
|
|
21
|
-
|
|
56
|
+
|
|
22
57
|
# Set virtual memory limit (address space)
|
|
23
58
|
# On macOS this is the most reliable way to limit memory
|
|
24
59
|
resource.setrlimit(resource.RLIMIT_AS, (limit_bytes, limit_bytes))
|
|
25
|
-
|
|
60
|
+
|
|
26
61
|
# Also try to set data segment limit if available
|
|
27
62
|
try:
|
|
28
63
|
resource.setrlimit(resource.RLIMIT_DATA, (limit_bytes, limit_bytes))
|
|
29
64
|
except (OSError, ValueError):
|
|
30
65
|
# Not available on all systems
|
|
31
66
|
pass
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
67
|
+
|
|
68
|
+
# Verify it was actually set
|
|
69
|
+
is_set, msg = verify_memory_limit_set(limit_mb)
|
|
70
|
+
if is_set:
|
|
71
|
+
print(f"[MEMORY] ✓ Hard limit enforced: {msg}", file=sys.stderr)
|
|
72
|
+
return True
|
|
73
|
+
else:
|
|
74
|
+
print(f"[MEMORY] ✗ Hard limit NOT enforced: {msg}", file=sys.stderr)
|
|
75
|
+
return False
|
|
76
|
+
|
|
35
77
|
except (OSError, ValueError) as e:
|
|
36
|
-
print(f"[MEMORY]
|
|
78
|
+
print(f"[MEMORY] ✗ Could not set memory limit: {e}", file=sys.stderr)
|
|
79
|
+
return False
|
|
80
|
+
|
|
81
|
+
def get_process_tree_memory_native(pid: int) -> float:
|
|
82
|
+
"""Get total memory usage of entire process group using native ps command."""
|
|
83
|
+
try:
|
|
84
|
+
# Get the process group ID
|
|
85
|
+
pgid = os.getpgid(pid)
|
|
86
|
+
|
|
87
|
+
# Get all processes in the process group
|
|
88
|
+
ps_result = subprocess.run(
|
|
89
|
+
["ps", "-o", "rss=", "-g", str(pgid)],
|
|
90
|
+
capture_output=True,
|
|
91
|
+
text=True,
|
|
92
|
+
timeout=1
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
if ps_result.returncode != 0:
|
|
96
|
+
return 0.0
|
|
97
|
+
|
|
98
|
+
# Sum all RSS values from the process group
|
|
99
|
+
total_rss_kb = 0
|
|
100
|
+
for line in ps_result.stdout.strip().split('\n'):
|
|
101
|
+
line = line.strip()
|
|
102
|
+
if line:
|
|
103
|
+
try:
|
|
104
|
+
total_rss_kb += int(line)
|
|
105
|
+
except ValueError:
|
|
106
|
+
continue
|
|
107
|
+
|
|
108
|
+
# Convert KB to MB
|
|
109
|
+
return total_rss_kb / 1024.0
|
|
110
|
+
except Exception:
|
|
111
|
+
return 0.0
|
|
37
112
|
|
|
38
113
|
def monitor_memory_usage_native(process: subprocess.Popen, limit_mb: int) -> Optional[str]:
|
|
39
|
-
"""Monitor
|
|
40
|
-
|
|
41
|
-
|
|
114
|
+
"""Monitor ENTIRE PROCESS TREE memory usage using native tools and kill if it exceeds limits."""
|
|
115
|
+
print(f"[MEMORY] Monitoring process tree from root PID {process.pid} (limit: {limit_mb}MB)", file=sys.stderr)
|
|
116
|
+
|
|
42
117
|
while process.poll() is None:
|
|
43
118
|
try:
|
|
44
|
-
#
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
memory_kb = int(ps_result.stdout.strip())
|
|
55
|
-
memory_mb = memory_kb / 1024
|
|
56
|
-
|
|
57
|
-
# print(f"[MEMORY] PID {process.pid} using {memory_mb:.1f}MB (limit: {limit_mb}MB)", file=sys.stderr)
|
|
58
|
-
|
|
59
|
-
if memory_mb > limit_mb:
|
|
60
|
-
print(f"[MEMORY] Process exceeded {limit_mb}MB limit (using {memory_mb:.1f}MB), terminating", file=sys.stderr)
|
|
61
|
-
# Kill the entire process group - fix race condition
|
|
62
|
-
try:
|
|
63
|
-
pgid = os.getpgid(process.pid)
|
|
64
|
-
os.killpg(pgid, signal.SIGTERM)
|
|
65
|
-
except ProcessLookupError:
|
|
66
|
-
return f"Memory limit exceeded: {memory_mb:.1f}MB > {limit_mb}MB"
|
|
67
|
-
|
|
68
|
-
time.sleep(2) # Give it time to cleanup
|
|
69
|
-
|
|
70
|
-
try:
|
|
71
|
-
if process.poll() is None:
|
|
72
|
-
pgid = os.getpgid(process.pid)
|
|
73
|
-
os.killpg(pgid, signal.SIGKILL)
|
|
74
|
-
except ProcessLookupError:
|
|
75
|
-
pass
|
|
119
|
+
# Get total memory for entire process tree
|
|
120
|
+
memory_mb = get_process_tree_memory_native(process.pid)
|
|
121
|
+
|
|
122
|
+
if memory_mb > limit_mb:
|
|
123
|
+
print(f"[MEMORY] Process tree exceeded {limit_mb}MB limit (using {memory_mb:.1f}MB), terminating entire tree", file=sys.stderr)
|
|
124
|
+
# Kill the entire process group
|
|
125
|
+
try:
|
|
126
|
+
pgid = os.getpgid(process.pid)
|
|
127
|
+
os.killpg(pgid, signal.SIGTERM)
|
|
128
|
+
except ProcessLookupError:
|
|
76
129
|
return f"Memory limit exceeded: {memory_mb:.1f}MB > {limit_mb}MB"
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
130
|
+
|
|
131
|
+
time.sleep(2) # Give it time to cleanup
|
|
132
|
+
|
|
133
|
+
try:
|
|
134
|
+
if process.poll() is None:
|
|
135
|
+
pgid = os.getpgid(process.pid)
|
|
136
|
+
os.killpg(pgid, signal.SIGKILL)
|
|
137
|
+
except ProcessLookupError:
|
|
138
|
+
pass
|
|
139
|
+
return f"Memory limit exceeded: {memory_mb:.1f}MB > {limit_mb}MB"
|
|
140
|
+
|
|
141
|
+
time.sleep(0.1) # Check every 100ms for faster response
|
|
142
|
+
|
|
80
143
|
except (subprocess.TimeoutExpired, ValueError, ProcessLookupError):
|
|
81
144
|
# Process might have terminated or ps command failed
|
|
82
|
-
time.sleep(0.
|
|
145
|
+
time.sleep(0.1)
|
|
83
146
|
continue
|
|
84
|
-
|
|
85
|
-
# print(f"[MEMORY] Monitoring stopped for PID {process.pid}", file=sys.stderr)
|
|
147
|
+
|
|
86
148
|
return None
|
|
87
149
|
|
|
150
|
+
def get_process_tree_memory_psutil(ps_process) -> float:
|
|
151
|
+
"""Get total memory usage of entire process tree using psutil."""
|
|
152
|
+
try:
|
|
153
|
+
import psutil
|
|
154
|
+
total_mb = 0.0
|
|
155
|
+
|
|
156
|
+
# Get memory of root process
|
|
157
|
+
try:
|
|
158
|
+
total_mb += ps_process.memory_info().rss / (1024 * 1024)
|
|
159
|
+
except (psutil.NoSuchProcess, psutil.AccessDenied):
|
|
160
|
+
return 0.0
|
|
161
|
+
|
|
162
|
+
# Get memory of all children (recursive)
|
|
163
|
+
try:
|
|
164
|
+
for child in ps_process.children(recursive=True):
|
|
165
|
+
try:
|
|
166
|
+
total_mb += child.memory_info().rss / (1024 * 1024)
|
|
167
|
+
except (psutil.NoSuchProcess, psutil.AccessDenied):
|
|
168
|
+
continue
|
|
169
|
+
except (psutil.NoSuchProcess, psutil.AccessDenied):
|
|
170
|
+
pass
|
|
171
|
+
|
|
172
|
+
return total_mb
|
|
173
|
+
except ImportError:
|
|
174
|
+
return 0.0
|
|
175
|
+
|
|
88
176
|
def monitor_memory_usage(process: subprocess.Popen, limit_mb: int) -> Optional[str]:
|
|
89
|
-
"""Monitor
|
|
177
|
+
"""Monitor ENTIRE PROCESS TREE memory usage and kill if it exceeds limits."""
|
|
90
178
|
try:
|
|
91
179
|
import psutil
|
|
92
180
|
ps_process = psutil.Process(process.pid)
|
|
93
|
-
|
|
181
|
+
print(f"[MEMORY] Monitoring process tree from root PID {process.pid} (limit: {limit_mb}MB, using psutil)", file=sys.stderr)
|
|
182
|
+
|
|
94
183
|
while process.poll() is None:
|
|
95
184
|
try:
|
|
96
|
-
# Get memory
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
185
|
+
# Get total memory for entire process tree
|
|
186
|
+
memory_mb = get_process_tree_memory_psutil(ps_process)
|
|
187
|
+
|
|
100
188
|
if memory_mb > limit_mb:
|
|
101
|
-
print(f"[MEMORY] Process exceeded {limit_mb}MB limit (using {memory_mb:.1f}MB), terminating", file=sys.stderr)
|
|
102
|
-
# Kill the entire process group
|
|
189
|
+
print(f"[MEMORY] Process tree exceeded {limit_mb}MB limit (using {memory_mb:.1f}MB), terminating entire tree", file=sys.stderr)
|
|
190
|
+
# Kill the entire process group
|
|
103
191
|
try:
|
|
104
192
|
pgid = os.getpgid(process.pid)
|
|
105
193
|
os.killpg(pgid, signal.SIGTERM)
|
|
106
194
|
except ProcessLookupError:
|
|
107
195
|
return f"Memory limit exceeded: {memory_mb:.1f}MB > {limit_mb}MB"
|
|
108
|
-
|
|
196
|
+
|
|
109
197
|
time.sleep(2) # Give it time to cleanup
|
|
110
|
-
|
|
198
|
+
|
|
111
199
|
try:
|
|
112
200
|
if process.poll() is None:
|
|
113
|
-
pgid = os.getpgid(process.pid)
|
|
201
|
+
pgid = os.getpgid(process.pid)
|
|
114
202
|
os.killpg(pgid, signal.SIGKILL)
|
|
115
203
|
except ProcessLookupError:
|
|
116
204
|
pass
|
|
117
205
|
return f"Memory limit exceeded: {memory_mb:.1f}MB > {limit_mb}MB"
|
|
118
|
-
|
|
119
|
-
time.sleep(0.
|
|
206
|
+
|
|
207
|
+
time.sleep(0.1) # Check every 100ms for faster response
|
|
120
208
|
except (psutil.NoSuchProcess, psutil.AccessDenied):
|
|
121
209
|
# Process already terminated
|
|
122
210
|
break
|
|
123
211
|
except ImportError:
|
|
124
212
|
# psutil not available, use native monitoring
|
|
125
213
|
return monitor_memory_usage_native(process, limit_mb)
|
|
126
|
-
|
|
214
|
+
|
|
127
215
|
return None
|
|
128
216
|
|
|
129
217
|
def validate_memory_limit(limit_mb: int) -> bool:
|