claude-evolve 1.9.8 → 1.9.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Binary file (not shown)

package/lib/evolve_worker.py (CHANGED)

@@ -50,6 +50,7 @@ class Config:
     memory_limit_mb: int = 0
     timeout_seconds: int = 600
     max_candidates: int = 5
+    max_validation_retries: int = 3  # Max attempts to fix validation errors (if validator.py exists)
     # Retry configuration with exponential backoff
     max_rounds: int = 10
     initial_wait: int = 60
@@ -202,6 +203,136 @@ CRITICAL: If you do not know how to implement what was asked for, or if the requ
         except Exception:
             return False
 
+    def _find_validator(self) -> Optional[Path]:
+        """
+        Auto-detect validator.py in the evolution directory.
+        No config required - if validator.py exists, we use it.
+        """
+        validator_path = Path(self.config.evolution_dir) / "validator.py"
+        if validator_path.exists():
+            return validator_path
+        return None
+
+    def _run_validator(self, candidate_id: str) -> Tuple[bool, Dict[str, Any]]:
+        """
+        Run the validator (fast smoke test) before full evaluation.
+
+        AIDEV-NOTE: Auto-detects validator.py in evolution directory.
+        Returns exit code 0 on success, non-zero on failure.
+        Resilient to any output format - handles JSON, plain text, or nothing.
+
+        Returns:
+            Tuple of (success, error_info_dict)
+            - success: True if validation passed
+            - error_info: Dict with whatever info we could extract from output
+        """
+        validator_path = self._find_validator()
+        if not validator_path:
+            return True, {}  # No validator found, skip
+
+        cmd = [self.config.python_cmd, str(validator_path), candidate_id]
+        log(f"Running validator: {' '.join(cmd)}")
+
+        try:
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                timeout=30,  # Validator should be fast (~3 seconds)
+                cwd=self.config.evolution_dir
+            )
+
+            # Combine stdout and stderr for full context
+            stdout = result.stdout.strip() if result.stdout else ""
+            stderr = result.stderr.strip() if result.stderr else ""
+            combined_output = f"{stdout}\n{stderr}".strip()
+
+            # Try to extract structured info, but be resilient to any format
+            error_info = {'raw_output': combined_output}
+
+            # Try to parse JSON from stdout (validator may output JSON)
+            if stdout.startswith('{'):
+                try:
+                    parsed = json.loads(stdout)
+                    if isinstance(parsed, dict):
+                        error_info.update(parsed)
+                except json.JSONDecodeError:
+                    pass  # Not valid JSON, that's fine
+
+            # If no structured error, use the raw output
+            if 'error' not in error_info and combined_output:
+                error_info['error'] = combined_output
+
+            if result.returncode == 0:
+                log("Validation passed")
+                return True, error_info
+            else:
+                error_type = error_info.get('error_type', 'validation_failed')
+                log_warn(f"Validation failed: {error_type}")
+                return False, error_info
+
+        except subprocess.TimeoutExpired:
+            log_error("Validator timed out")
+            return False, {'error': 'Validator timed out after 30 seconds', 'error_type': 'timeout'}
+        except Exception as e:
+            log_error(f"Validator error: {e}")
+            return False, {'error': str(e), 'error_type': 'exception'}
+
+    def _build_fix_prompt(self, candidate: Candidate, target_basename: str, error_info: Dict[str, Any]) -> str:
+        """
+        Build AI prompt to fix validation errors.
+
+        AIDEV-NOTE: Resilient to any error_info structure - uses whatever is available.
+        """
+        prompt = f"""{get_git_protection_warning()}
+
+The code in {target_basename} failed validation. Please fix the errors and try again.
+
+## Validator Output
+
+"""
+        # Include whatever structured fields we have
+        if error_info.get('error_type'):
+            prompt += f"**Error Type:** {error_info['error_type']}\n\n"
+
+        if error_info.get('error'):
+            prompt += f"**Error:**\n{error_info['error']}\n\n"
+
+        if error_info.get('suggestion'):
+            prompt += f"**Suggested Fix:**\n{error_info['suggestion']}\n\n"
+
+        if error_info.get('traceback'):
+            tb = error_info['traceback']
+            # Truncate if too long
+            if len(tb) > 1500:
+                tb = "..." + tb[-1500:]
+            prompt += f"**Traceback:**\n```\n{tb}\n```\n\n"
+
+        # If we only have raw output (no structured fields), show that
+        if not any(error_info.get(k) for k in ('error', 'error_type', 'suggestion', 'traceback')):
+            raw = error_info.get('raw_output', 'No output captured')
+            # Truncate if needed
+            if len(raw) > 2000:
+                raw = raw[:2000] + "\n... (truncated)"
+            prompt += f"```\n{raw}\n```\n\n"
+
+        prompt += f"""## Instructions
+
+1. Read the file {target_basename} to understand the current code
+2. Identify the issue based on the validator output above
+3. Fix the code to resolve the validation error
+4. The fix should still implement: {candidate.description}
+
+**CRITICAL:** Make sure to actually fix the error. Do not just add comments or make cosmetic changes.
+
+To help debug, you can run the validator yourself:
+```
+python validator.py {target_basename}
+```
+"""
+
+        return prompt
+
     def _run_evaluator(self, candidate_id: str, is_baseline: bool) -> Tuple[Optional[float], Dict[str, Any]]:
         """
         Run the evaluator.
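The contract implied by `_run_validator` above is small: the worker runs `python validator.py <candidate_id>` from the evolution directory with a 30-second timeout, treats exit code 0 as success, and optionally folds a JSON object printed to stdout (keys such as error, error_type, suggestion, traceback) into the fix prompt. Below is a minimal sketch of a validator that satisfies that contract; the per-candidate filename pattern and the run() entry-point check are illustrative assumptions, not claude-evolve APIs.

```python
#!/usr/bin/env python3
"""Minimal validator.py sketch matching the contract in _run_validator above.

Invoked as `python validator.py <candidate_id>` from the evolution directory;
must finish well inside the worker's 30-second timeout, exit 0 on success and
non-zero on failure. JSON on stdout is optional; if present, keys like
'error', 'error_type', 'suggestion' and 'traceback' feed the fix prompt.
"""
import importlib.util
import json
import sys
import traceback


def main() -> int:
    candidate_id = sys.argv[1] if len(sys.argv) > 1 else ""
    # Assumed layout for this sketch: one file per candidate, evolution_<id>.py
    candidate_file = f"evolution_{candidate_id}.py"
    try:
        spec = importlib.util.spec_from_file_location("candidate", candidate_file)
        if spec is None or spec.loader is None:
            raise FileNotFoundError(f"{candidate_file} not found or not importable")
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)  # import-time smoke test only
        if not hasattr(module, "run"):  # assumed entry point for this sketch
            print(json.dumps({
                "error": f"{candidate_file} defines no run() function",
                "error_type": "missing_entry_point",
                "suggestion": "Define a top-level run() function.",
            }))
            return 1
        return 0
    except Exception as exc:
        print(json.dumps({
            "error": str(exc),
            "error_type": type(exc).__name__,
            "traceback": traceback.format_exc(),
        }))
        return 1


if __name__ == "__main__":
    sys.exit(main())
```

With a validator like this in place, a candidate that fails to import comes back with an error_type and traceback, and the worker asks the AI for up to max_validation_retries fixes before marking the row failed-validation (see the next hunk).
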
@@ -347,6 +478,52 @@ CRITICAL: If you do not know how to implement what was asked for, or if the requ
                 csv.update_candidate_status(candidate.id, 'pending')
             return 0  # Will retry
 
+        # Run validator with retry loop
+        # AIDEV-NOTE: Validator catches structural errors before expensive full evaluation.
+        # If validation fails, we give the AI feedback and ask it to fix the code.
+        validation_passed = False
+        for validation_attempt in range(self.config.max_validation_retries + 1):
+            valid, error_info = self._run_validator(candidate.id)
+
+            if valid:
+                validation_passed = True
+                break
+
+            if validation_attempt >= self.config.max_validation_retries:
+                log_error(f"Validation failed after {self.config.max_validation_retries} fix attempts")
+                break
+
+            # Ask AI to fix the validation error
+            log(f"Validation failed (attempt {validation_attempt + 1}), asking AI to fix...")
+            fix_prompt = self._build_fix_prompt(candidate, target_file.name, error_info)
+            success, fix_model = self._call_ai_with_backoff(fix_prompt, target_file)
+
+            if not success:
+                log_error("AI failed to fix validation error")
+                break
+
+            # Record that we used an additional model call for fixing
+            if fix_model:
+                with EvolutionCSV(self.config.csv_path) as csv:
+                    current_llm = csv.get_candidate_info(candidate.id).get('run-LLM', '')
+                    new_llm = f"{current_llm}+{fix_model}" if current_llm else fix_model
+                    csv.update_candidate_field(candidate.id, 'run-LLM', new_llm)
+
+            # Re-check syntax after fix
+            if not self._check_syntax(target_file):
+                log_error("Fix introduced syntax error")
+                # Don't break - try again if we have retries left
+
+        if not validation_passed:
+            # Validation failed after all retries
+            with EvolutionCSV(self.config.csv_path) as csv:
+                csv.update_candidate_status(candidate.id, 'failed-validation')
+                # Store the last error for debugging
+                if error_info:
+                    error_summary = f"{error_info.get('error_type', 'unknown')}: {error_info.get('error', '')[:100]}"
+                    csv.update_candidate_field(candidate.id, 'validation_error', error_summary)
+            return 1
+
         # Run evaluator
         log("Running evaluator...")
         score, json_data = self._run_evaluator(candidate.id, is_baseline)
@@ -470,6 +647,7 @@ def load_config_from_yaml(config_path: Optional[str] = None) -> Config:
         memory_limit_mb=data.get('memory_limit_mb', 0),
         timeout_seconds=data.get('timeout_seconds', 600),
         max_candidates=data.get('worker_max_candidates', 5),
+        max_validation_retries=data.get('max_validation_retries', 3),
         max_rounds=ideation.get('max_rounds', 10),
         initial_wait=ideation.get('initial_wait', 60),
         max_wait=ideation.get('max_wait', 600)
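As the last hunk shows, the new knob is read from the top level of the evolution config, not from the ideation section. A hypothetical config excerpt, using only key names visible in load_config_from_yaml above (the surrounding file layout and the ideation section name are assumptions inferred from the ideation.get(...) calls):

```yaml
# Hypothetical excerpt of the evolution config read by load_config_from_yaml
memory_limit_mb: 0
timeout_seconds: 600
worker_max_candidates: 5
max_validation_retries: 3   # new in 1.9.9; 0 runs the validator once with no AI fix retries

ideation:                   # section name assumed from the ideation.get(...) calls
  max_rounds: 10
  initial_wait: 60
  max_wait: 600
```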