npm - wegho-agentes - Versions diffs - 7.0.3 → 7.0.6 - Mend

wegho-agentes 7.0.3 → 7.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1483) hide show

package/skills/loki-mode/benchmarks/run-benchmarks.sh DELETED Viewed

@@ -1,1948 +0,0 @@
-#!/bin/bash
-#===============================================================================
-# Loki Mode Benchmark Runner
-# Run HumanEval and SWE-bench benchmarks to validate multi-agent performance
-#
-# Usage:
-#   ./benchmarks/run-benchmarks.sh [benchmark] [options]
-#   ./benchmarks/run-benchmarks.sh humaneval              # Setup only
-#   ./benchmarks/run-benchmarks.sh humaneval --execute    # Direct Claude (baseline)
-#   ./benchmarks/run-benchmarks.sh humaneval --execute --loki  # Multi-agent Loki Mode
-#   ./benchmarks/run-benchmarks.sh humaneval --execute --limit 10  # First 10 problems
-#   ./benchmarks/run-benchmarks.sh swebench --execute     # Run SWE-bench
-#   ./benchmarks/run-benchmarks.sh all --execute          # Run all benchmarks
-#
-# Options:
-#   --execute       Actually run problems through Claude (vs just setup)
-#   --loki          Use Loki Mode multi-agent system (Architect->Engineer->QA->Reviewer)
-#   --limit N       Only run first N problems (useful for testing)
-#   --parallel N    Run N problems in parallel (default: 1)
-#   --model MODEL   Claude model to use (default: sonnet)
-#   --timeout N     Timeout per problem in seconds (default: 120)
-#   --retries N     Max RARV retry attempts for --loki mode (default: 3)
-#
-# Prerequisites:
-#   - Python 3.8+
-#   - Claude Code CLI
-#   - Git
-#
-# Results are saved to:
-#   ./benchmarks/results/YYYY-MM-DD-HH-MM-SS/
-#===============================================================================
-set -uo pipefail
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
-RESULTS_DIR="$SCRIPT_DIR/results/$(date +%Y-%m-%d-%H-%M-%S)"
-# Configuration
-EXECUTE_MODE=false
-LOKI_MODE=false  # Use multi-agent Loki Mode vs direct Claude
-PROBLEM_LIMIT=0  # 0 = all problems
-PARALLEL_COUNT=1
-CLAUDE_MODEL="sonnet"
-PROBLEM_TIMEOUT=120
-MAX_RETRIES=3    # RARV retry attempts
-# Colors
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-CYAN='\033[0;36m'
-BLUE='\033[0;34m'
-MAGENTA='\033[0;35m'
-NC='\033[0m'
-log_info() { echo -e "${CYAN}[INFO]${NC} $1"; }
-log_success() { echo -e "${GREEN}[PASS]${NC} $1"; }
-log_warning() { echo -e "${YELLOW}[WARN]${NC} $1"; }
-log_error() { echo -e "${RED}[FAIL]${NC} $1"; }
-log_progress() { echo -e "${BLUE}[PROG]${NC} $1"; }
-#===============================================================================
-# Argument Parsing
-#===============================================================================
-parse_args() {
-    local positional=()
-    while [[ $# -gt 0 ]]; do
-        case $1 in
-            --execute)
-                EXECUTE_MODE=true
-                shift
-                ;;
-            --loki)
-                LOKI_MODE=true
-                shift
-                ;;
-            --limit)
-                PROBLEM_LIMIT="$2"
-                shift 2
-                ;;
-            --parallel)
-                PARALLEL_COUNT="$2"
-                shift 2
-                ;;
-            --model)
-                CLAUDE_MODEL="$2"
-                shift 2
-                ;;
-            --timeout)
-                PROBLEM_TIMEOUT="$2"
-                shift 2
-                ;;
-            --retries)
-                MAX_RETRIES="$2"
-                shift 2
-                ;;
-            -*)
-                log_error "Unknown option: $1"
-                exit 1
-                ;;
-            *)
-                positional+=("$1")
-                shift
-                ;;
-        esac
-    done
-    # Restore positional parameters
-    set -- "${positional[@]}"
-    BENCHMARK="${1:-all}"
-}
-#===============================================================================
-# Setup
-#===============================================================================
-# Create the working directories, verify prerequisites and activate the
-# benchmark virtual environment.
-# Globals read: RESULTS_DIR, SCRIPT_DIR
-# Exits 1 when a directory cannot be created, a prerequisite is missing, or the
-# virtual environment cannot be created/activated. A failed pip install is
-# only a warning so that offline runs keep working as before.
-setup_environment() {
-    log_info "Setting up benchmark environment..."
-    # The script does not run with `set -e`, so check each step explicitly
-    # instead of reporting "Environment ready" after a silent failure.
-    if ! mkdir -p "$RESULTS_DIR" "$SCRIPT_DIR/datasets" "$SCRIPT_DIR/workspaces"; then
-        log_error "Could not create benchmark directories"
-        exit 1
-    fi
-    # Check prerequisites
-    if ! command -v python3 &> /dev/null; then
-        log_error "Python 3 is required"
-        exit 1
-    fi
-    if ! command -v claude &> /dev/null; then
-        log_error "Claude Code CLI is required"
-        exit 1
-    fi
-    # Install benchmark dependencies if needed
-    if [ ! -d "$SCRIPT_DIR/venv" ]; then
-        log_info "Creating virtual environment..."
-        if ! python3 -m venv "$SCRIPT_DIR/venv"; then
-            log_error "Could not create virtual environment in $SCRIPT_DIR/venv"
-            exit 1
-        fi
-    fi
-    if [ ! -f "$SCRIPT_DIR/venv/bin/activate" ]; then
-        log_error "Virtual environment is incomplete: $SCRIPT_DIR/venv/bin/activate not found"
-        exit 1
-    fi
-    # shellcheck disable=SC1091 -- generated at runtime by `python3 -m venv`
-    source "$SCRIPT_DIR/venv/bin/activate"
-    if ! pip install -q requests tqdm; then
-        log_warning "Could not install requests/tqdm; continuing with the packages already present"
-    fi
-    log_success "Environment ready"
-}
-#===============================================================================
-# HumanEval Benchmark
-#===============================================================================
-# Download and decompress the HumanEval dataset unless it is already present.
-# Globals read: SCRIPT_DIR
-# Exits 1 when the download or decompression fails.
-download_humaneval() {
-    local dataset_file="$SCRIPT_DIR/datasets/humaneval.jsonl"
-    local partial_file="$dataset_file.partial"
-    # -s (exists and non-empty): an empty file is the leftover of a failed
-    # download and must not be mistaken for a usable dataset.
-    if [ -s "$dataset_file" ]; then
-        log_info "HumanEval dataset already downloaded"
-        return
-    fi
-    log_info "Downloading HumanEval dataset..."
-    # Write to a temporary name and rename only on success so an interrupted
-    # download never leaves a truncated dataset behind. curl -f turns HTTP
-    # errors into a non-zero status; the script-level pipefail propagates it,
-    # and the -s test catches an empty result either way.
-    if ! curl -fsSL "https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz" | \
-        gunzip > "$partial_file" || [ ! -s "$partial_file" ]; then
-        rm -f -- "$partial_file"
-        log_error "Failed to download HumanEval dataset"
-        exit 1
-    fi
-    if ! mv -- "$partial_file" "$dataset_file"; then
-        log_error "Could not move dataset into place: $dataset_file"
-        exit 1
-    fi
-    log_success "HumanEval dataset downloaded (164 problems)"
-}
-# Entry point for the HumanEval benchmark: fetch the dataset, then dispatch on
-# EXECUTE_MODE / LOKI_MODE to setup-only, direct or multi-agent execution.
-run_humaneval() {
-    log_info "Running HumanEval benchmark..."
-    download_humaneval
-    # Without --execute only the infrastructure check runs.
-    if [ "$EXECUTE_MODE" != true ]; then
-        run_humaneval_setup
-        return
-    fi
-    case "$LOKI_MODE" in
-        true) run_humaneval_loki ;;
-        *)    run_humaneval_execute ;;
-    esac
-}
-# Setup-only mode: load the HumanEval dataset and write a small
-# "INFRASTRUCTURE_READY" results file without calling Claude.
-# Globals read: SCRIPT_DIR, RESULTS_DIR
-# Returns 1 when the embedded Python step fails.
-run_humaneval_setup() {
-    local results_file="$RESULTS_DIR/humaneval-results.json"
-    # The Python snippet reads both directories from the environment, so pass
-    # them explicitly instead of relying on an export made elsewhere.
-    if ! SCRIPT_DIR="$SCRIPT_DIR" RESULTS_DIR="$RESULTS_DIR" python3 << 'HUMANEVAL_SETUP'
-import json
-import os
-from datetime import datetime
-SCRIPT_DIR = os.environ.get('SCRIPT_DIR', '.')
-RESULTS_DIR = os.environ.get('RESULTS_DIR', './results')
-dataset_file = f"{SCRIPT_DIR}/datasets/humaneval.jsonl"
-results_file = f"{RESULTS_DIR}/humaneval-results.json"
-problems = []
-with open(dataset_file, 'r') as f:
-    for line in f:
-        problems.append(json.loads(line))
-print(f"Loaded {len(problems)} HumanEval problems")
-results = {
-    "benchmark": "HumanEval",
-    "version": "1.0",
-    "timestamp": datetime.now().isoformat(),
-    "total_problems": len(problems),
-    "status": "INFRASTRUCTURE_READY",
-    "note": "Run with --execute to run actual tests.",
-    "sample_problems": [p["task_id"] for p in problems[:5]]
-}
-with open(results_file, 'w') as f:
-    json.dump(results, f, indent=2)
-print(f"Results saved to {results_file}")
-print("\nTo run actual benchmarks:")
-print("  ./benchmarks/run-benchmarks.sh humaneval --execute")
-print("  ./benchmarks/run-benchmarks.sh humaneval --execute --limit 10")
-HUMANEVAL_SETUP
-    then
-        # Do not claim readiness when the dataset could not be read or the
-        # results file could not be written.
-        log_error "HumanEval benchmark setup failed"
-        return 1
-    fi
-    log_success "HumanEval benchmark infrastructure ready"
-    log_info "Results: $results_file"
-}
-run_humaneval_execute() {
-    local dataset_file="$SCRIPT_DIR/datasets/humaneval.jsonl"
-    local results_file="$RESULTS_DIR/humaneval-results.json"
-    local solutions_dir="$RESULTS_DIR/humaneval-solutions"
-    mkdir -p "$solutions_dir"
-    log_info "Executing HumanEval benchmark with Claude..."
-    log_info "Model: $CLAUDE_MODEL | Timeout: ${PROBLEM_TIMEOUT}s | Limit: ${PROBLEM_LIMIT:-all}"
-    # Export variables for Python
-    export PROBLEM_LIMIT PROBLEM_TIMEOUT CLAUDE_MODEL
-    python3 << 'HUMANEVAL_EXECUTE'
-import json
-import subprocess
-import os
-import sys
-import time
-import tempfile
-import traceback
-from datetime import datetime
-from concurrent.futures import ThreadPoolExecutor, as_completed
-SCRIPT_DIR = os.environ.get('SCRIPT_DIR', '.')
-RESULTS_DIR = os.environ.get('RESULTS_DIR', './results')
-PROBLEM_LIMIT = int(os.environ.get('PROBLEM_LIMIT', '0'))
-PROBLEM_TIMEOUT = int(os.environ.get('PROBLEM_TIMEOUT', '120'))
-CLAUDE_MODEL = os.environ.get('CLAUDE_MODEL', 'sonnet')
-dataset_file = f"{SCRIPT_DIR}/datasets/humaneval.jsonl"
-results_file = f"{RESULTS_DIR}/humaneval-results.json"
-solutions_dir = f"{RESULTS_DIR}/humaneval-solutions"
-# Load problems
-problems = []
-with open(dataset_file, 'r') as f:
-    for line in f:
-        problems.append(json.loads(line))
-if PROBLEM_LIMIT > 0:
-    problems = problems[:PROBLEM_LIMIT]
-print(f"\n{'='*60}")
-print(f"  HumanEval Benchmark Execution")
-print(f"  Problems: {len(problems)} | Model: {CLAUDE_MODEL}")
-print(f"{'='*60}\n")
-def solve_problem(problem):
-    """Send a HumanEval problem to Claude and get solution."""
-    task_id = problem["task_id"]
-    prompt = problem["prompt"]
-    entry_point = problem["entry_point"]
-    test = problem["test"]
-    canonical = problem.get("canonical_solution", "")
-    # Create prompt for Claude - ask for COMPLETE function to avoid indentation issues
-    claude_prompt = f'''You are solving a HumanEval coding problem. Complete the Python function below.
-{prompt}
-INSTRUCTIONS:
-1. Output the COMPLETE function including the signature and docstring shown above
-2. Fill in the implementation after the docstring
-3. Use proper 4-space indentation for the function body
-4. Output ONLY the Python code - no markdown, no explanation, no ```python blocks
-5. The function must be syntactically valid Python
-Output the complete function now:'''
-    try:
-        # Call Claude
-        result = subprocess.run(
-            ['claude', '-p', claude_prompt, '--model', CLAUDE_MODEL],
-            capture_output=True,
-            text=True,
-            timeout=PROBLEM_TIMEOUT
-        )
-        solution = result.stdout.strip()
-        # Clean up solution - remove markdown code blocks if present
-        if solution.startswith("```python"):
-            solution = solution[9:]
-        if solution.startswith("```"):
-            solution = solution[3:]
-        if solution.endswith("```"):
-            solution = solution[:-3]
-        solution = solution.strip()
-        # Verify solution contains the function definition
-        if f"def {entry_point}" not in solution:
-            # Claude didn't include function signature, prepend it
-            # Indent the body properly
-            lines = solution.split('\n')
-            indented_lines = ['    ' + line if line.strip() and not line.startswith('    ') else line for line in lines]
-            solution = prompt + '\n'.join(indented_lines)
-        return {
-            "task_id": task_id,
-            "solution": solution,
-            "solution_body": solution,
-            "error": None
-        }
-    except subprocess.TimeoutExpired:
-        return {
-            "task_id": task_id,
-            "solution": None,
-            "solution_body": None,
-            "error": "TIMEOUT"
-        }
-    except Exception as e:
-        return {
-            "task_id": task_id,
-            "solution": None,
-            "solution_body": None,
-            "error": str(e)
-        }
-def test_solution(problem, solution):
-    """Execute the solution against HumanEval test cases."""
-    task_id = problem["task_id"]
-    test = problem["test"]
-    entry_point = problem["entry_point"]
-    if solution is None:
-        return {"task_id": task_id, "passed": False, "error": "No solution"}
-    # Create test file
-    test_code = f'''
-{solution}
-{test}
-# Run the check function
-check({entry_point})
-print("PASSED")
-'''
-    try:
-        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
-            f.write(test_code)
-            test_file = f.name
-        result = subprocess.run(
-            ['python3', test_file],
-            capture_output=True,
-            text=True,
-            timeout=30
-        )
-        os.unlink(test_file)
-        passed = "PASSED" in result.stdout
-        return {
-            "task_id": task_id,
-            "passed": passed,
-            "stdout": result.stdout[:500],
-            "stderr": result.stderr[:500] if not passed else "",
-            "error": None
-        }
-    except subprocess.TimeoutExpired:
-        return {"task_id": task_id, "passed": False, "error": "TEST_TIMEOUT"}
-    except Exception as e:
-        return {"task_id": task_id, "passed": False, "error": str(e)}
-# Run benchmark
-# Results document; rewritten to results_file after every problem so a partial
-# run still leaves usable output.
-results = {
-    "benchmark": "HumanEval",
-    "version": "1.0",
-    "timestamp": datetime.now().isoformat(),
-    "model": CLAUDE_MODEL,
-    "timeout_per_problem": PROBLEM_TIMEOUT,
-    "total_problems": len(problems),
-    "status": "RUNNING",
-    "problems": []
-}
-# Counters: passed/failed come from test outcomes, error counts problems for
-# which no solution could be obtained.
-passed_count = 0
-failed_count = 0
-error_count = 0
-start_time = time.time()
-# Problems are processed one at a time.
-# NOTE(review): ThreadPoolExecutor is imported but not used in this loop, so the
-# --parallel option appears to have no effect here - confirm.
-for i, problem in enumerate(problems):
-    task_id = problem["task_id"]
-    # Part after the "/" in the task id; used as the solution file name.
-    task_num = task_id.split("/")[1]
-    print(f"[{i+1}/{len(problems)}] {task_id}...", end=" ", flush=True)
-    # Get solution from Claude
-    solution_result = solve_problem(problem)
-    if solution_result["error"]:
-        print(f"\033[0;31mERROR: {solution_result['error']}\033[0m")
-        error_count += 1
-        problem_result = {
-            "task_id": task_id,
-            "passed": False,
-            "error": solution_result["error"],
-            "solution": None
-        }
-    else:
-        # Save solution
-        solution_file = f"{solutions_dir}/{task_num}.py"
-        with open(solution_file, 'w') as f:
-            f.write(solution_result["solution"])
-        # Test solution
-        test_result = test_solution(problem, solution_result["solution"])
-        if test_result["passed"]:
-            print(f"\033[0;32mPASSED\033[0m")
-            passed_count += 1
-        else:
-            print(f"\033[0;31mFAILED\033[0m")
-            failed_count += 1
-        problem_result = {
-            "task_id": task_id,
-            "passed": test_result["passed"],
-            "error": test_result.get("error"),
-            "solution_file": solution_file
-        }
-    results["problems"].append(problem_result)
-    # Save intermediate results
-    with open(results_file, 'w') as f:
-        json.dump(results, f, indent=2)
-# Final results
-elapsed_time = time.time() - start_time
-# Guarded against an empty problem list.
-pass_rate = (passed_count / len(problems)) * 100 if problems else 0
-results["status"] = "COMPLETED"
-results["passed"] = passed_count
-results["failed"] = failed_count
-results["errors"] = error_count
-results["pass_rate"] = round(pass_rate, 2)
-results["elapsed_seconds"] = round(elapsed_time, 2)
-with open(results_file, 'w') as f:
-    json.dump(results, f, indent=2)
-print(f"\n{'='*60}")
-print(f"  RESULTS")
-print(f"{'='*60}")
-print(f"  Passed:    {passed_count}/{len(problems)}")
-print(f"  Failed:    {failed_count}/{len(problems)}")
-print(f"  Errors:    {error_count}/{len(problems)}")
-print(f"  Pass Rate: {pass_rate:.1f}%")
-print(f"  Time:      {elapsed_time:.1f}s")
-print(f"{'='*60}\n")
-# Compare to competitors
-# NOTE(review): this is the direct-Claude path, yet the line below labels the
-# measured pass rate "Loki Mode" - confirm the label is intended.
-print("  Competitor Comparison:")
-print(f"  - MetaGPT:     85.9-87.7%")
-print(f"  - Loki Mode:   {pass_rate:.1f}%")
-if pass_rate >= 85:
-    print(f"  Status: \033[0;32mCOMPETITIVE\033[0m")
-elif pass_rate >= 70:
-    print(f"  Status: \033[0;33mGOOD\033[0m")
-else:
-    print(f"  Status: \033[0;31mNEEDS IMPROVEMENT\033[0m")
-print(f"{'='*60}\n")
-HUMANEVAL_EXECUTE
-    # Back in the shell wrapper. The Python exit status is not checked, so the
-    # success message below is printed even if the run aborted.
-    log_success "HumanEval benchmark execution complete"
-    log_info "Results: $results_file"
-    log_info "Solutions: $solutions_dir/"
-}
-#===============================================================================
-# Loki Mode Multi-Agent HumanEval Benchmark
-# Uses: Architect -> Engineer -> QA -> Reviewer with RARV cycle
-#===============================================================================
-run_humaneval_loki() {
-    local dataset_file="$SCRIPT_DIR/datasets/humaneval.jsonl"
-    local results_file="$RESULTS_DIR/humaneval-loki-results.json"
-    local solutions_dir="$RESULTS_DIR/humaneval-loki-solutions"
-    mkdir -p "$solutions_dir"
-    log_info "Executing HumanEval with Loki Mode Multi-Agent System..."
-    log_info "Model: $CLAUDE_MODEL | Retries: $MAX_RETRIES | Limit: ${PROBLEM_LIMIT:-all}"
-    log_info "Agents: Architect -> Engineer -> QA -> Reviewer (RARV cycle)"
-    # Export variables for Python
-    export PROBLEM_LIMIT PROBLEM_TIMEOUT CLAUDE_MODEL MAX_RETRIES
-    python3 << 'HUMANEVAL_LOKI'
-import json
-import subprocess
-import os
-import sys
-import time
-import tempfile
-import traceback
-from datetime import datetime
-SCRIPT_DIR = os.environ.get('SCRIPT_DIR', '.')
-RESULTS_DIR = os.environ.get('RESULTS_DIR', './results')
-PROBLEM_LIMIT = int(os.environ.get('PROBLEM_LIMIT', '0'))
-PROBLEM_TIMEOUT = int(os.environ.get('PROBLEM_TIMEOUT', '120'))
-CLAUDE_MODEL = os.environ.get('CLAUDE_MODEL', 'sonnet')
-MAX_RETRIES = int(os.environ.get('MAX_RETRIES', '3'))
-dataset_file = f"{SCRIPT_DIR}/datasets/humaneval.jsonl"
-results_file = f"{RESULTS_DIR}/humaneval-loki-results.json"
-solutions_dir = f"{RESULTS_DIR}/humaneval-loki-solutions"
-# Load problems
-problems = []
-with open(dataset_file, 'r') as f:
-    for line in f:
-        problems.append(json.loads(line))
-if PROBLEM_LIMIT > 0:
-    problems = problems[:PROBLEM_LIMIT]
-print(f"\n{'='*70}")
-print(f"  LOKI MODE Multi-Agent HumanEval Benchmark")
-print(f"  Problems: {len(problems)} | Model: {CLAUDE_MODEL} | Max Retries: {MAX_RETRIES}")
-print(f"  Agent Pipeline: Architect -> Engineer -> QA -> Reviewer")
-print(f"{'='*70}\n")
-def call_agent(agent_name, prompt, timeout=PROBLEM_TIMEOUT):
-    """Call a Loki Mode agent with a specific role."""
-    try:
-        result = subprocess.run(
-            ['claude', '-p', prompt, '--model', CLAUDE_MODEL],
-            capture_output=True,
-            text=True,
-            timeout=timeout
-        )
-        return result.stdout.strip(), None
-    except subprocess.TimeoutExpired:
-        return None, "TIMEOUT"
-    except Exception as e:
-        return None, str(e)
-def architect_agent(problem):
-    """Architect: Analyze problem and design approach.
-    Embeds problem["prompt"] in an analysis request and sends it through
-    call_agent with a 30-second timeout.
-    Returns call_agent's (output, error) tuple.
-    """
-    prompt = f'''You are the ARCHITECT AGENT in a multi-agent coding system.
-TASK: Analyze this HumanEval problem and design the solution approach.
-PROBLEM:
-{problem["prompt"]}
-Your job:
-1. Understand what the function should do
-2. Identify edge cases and constraints
-3. Design the algorithm/approach
-4. Note any potential pitfalls
-Output a brief analysis (3-5 lines) with:
-- What the function does
-- Key algorithm/approach
-- Edge cases to handle
-Keep it concise - the Engineer agent will implement based on your analysis.'''
-    # Shorter timeout than the default: only a brief analysis is requested.
-    return call_agent("Architect", prompt, timeout=30)
-def engineer_agent(problem, architect_analysis):
-    """Engineer: Implement the solution based on architect's design.
-    Embeds problem["prompt"] and the architect's analysis text in an
-    implementation request and sends it through call_agent with the default
-    timeout.
-    Returns call_agent's (output, error) tuple.
-    """
-    prompt = f'''You are the ENGINEER AGENT in a multi-agent coding system.
-TASK: Implement the solution based on the Architect's analysis.
-PROBLEM:
-{problem["prompt"]}
-ARCHITECT'S ANALYSIS:
-{architect_analysis}
-INSTRUCTIONS:
-1. Output the COMPLETE function including signature and docstring
-2. Implement based on the architect's approach
-3. Use proper 4-space indentation
-4. Handle the edge cases identified
-5. Output ONLY Python code - no markdown, no explanation
-Output the complete function now:'''
-    return call_agent("Engineer", prompt)
-def qa_agent(problem, solution):
-    """QA: Test the solution and identify issues."""
-    test = problem["test"]
-    entry_point = problem["entry_point"]
-    # First, actually run the tests
-    test_code = f'''
-{solution}
-{test}
-check({entry_point})
-print("ALL_TESTS_PASSED")
-'''
-    try:
-        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
-            f.write(test_code)
-            temp_file = f.name
-        result = subprocess.run(
-            ['python3', temp_file],
-            capture_output=True,
-            text=True,
-            timeout=10
-        )
-        os.unlink(temp_file)
-        if "ALL_TESTS_PASSED" in result.stdout:
-            return {"passed": True, "output": "All tests passed", "error": None}
-        else:
-            error_msg = result.stderr or result.stdout or "Unknown error"
-            return {"passed": False, "output": error_msg, "error": error_msg}
-    except subprocess.TimeoutExpired:
-        os.unlink(temp_file)
-        return {"passed": False, "output": "Test timeout", "error": "TIMEOUT"}
-    except Exception as e:
-        return {"passed": False, "output": str(e), "error": str(e)}
-def reviewer_agent(problem, solution, qa_result):
-    """Reviewer: Review solution quality and suggest improvements if tests failed.
-    When qa_result["passed"] is true no model call is made and an approval dict
-    is returned. Otherwise the problem, the current solution and the QA error
-    are sent through call_agent (30 s timeout).
-    Returns a dict with approved and feedback; the failure path also carries
-    error (call_agent's error value).
-    """
-    if qa_result["passed"]:
-        return {"approved": True, "feedback": "Solution passes all tests"}
-    prompt = f'''You are the CODE REVIEWER AGENT in a multi-agent coding system.
-The QA agent found issues with this solution. Analyze and suggest fixes.
-PROBLEM:
-{problem["prompt"]}
-CURRENT SOLUTION:
-{solution}
-TEST ERROR:
-{qa_result["error"]}
-Analyze the error and provide:
-1. What went wrong (1 line)
-2. How to fix it (1-2 lines)
-Keep feedback concise - the Engineer will use it to fix the code.'''
-    feedback, error = call_agent("Reviewer", prompt, timeout=30)
-    # A failed or empty call still yields a feedback string for the next step.
-    return {"approved": False, "feedback": feedback or "No feedback", "error": error}
-def engineer_fix_agent(problem, solution, feedback, attempt):
-    """Engineer: Fix the solution based on reviewer feedback.
-    Embeds the problem, the previous solution, the reviewer feedback and the
-    attempt counter (shown as attempt/MAX_RETRIES) in a fix request and sends
-    it through call_agent with the default timeout.
-    Returns call_agent's (output, error) tuple.
-    """
-    prompt = f'''You are the ENGINEER AGENT. Your previous solution failed tests.
-PROBLEM:
-{problem["prompt"]}
-PREVIOUS SOLUTION:
-{solution}
-REVIEWER FEEDBACK:
-{feedback}
-ATTEMPT: {attempt}/{MAX_RETRIES}
-Fix the solution based on the feedback.
-Output the COMPLETE corrected function - no explanations, just code.'''
-    return call_agent("Engineer-Fix", prompt)
-def solve_with_loki_mode(problem):
-    """
-    Solve a HumanEval problem using Loki Mode multi-agent system.
-    Pipeline: Architect -> Engineer -> QA -> [Reviewer -> Engineer-Fix]* -> Pass/Fail
-    """
-    task_id = problem["task_id"]
-    entry_point = problem["entry_point"]
-    agent_trace = []
-    # Step 1: Architect analyzes the problem
-    architect_analysis, error = architect_agent(problem)
-    agent_trace.append({"agent": "Architect", "output": architect_analysis, "error": error})
-    if error:
-        return {
-            "task_id": task_id,
-            "solution": None,
-            "passed": False,
-            "error": f"Architect failed: {error}",
-            "attempts": 1,
-            "agent_trace": agent_trace
-        }
-    # Step 2: Engineer implements solution
-    solution, error = engineer_agent(problem, architect_analysis)
-    agent_trace.append({"agent": "Engineer", "output": solution[:200] if solution else None, "error": error})
-    if error or not solution:
-        return {
-            "task_id": task_id,
-            "solution": None,
-            "passed": False,
-            "error": f"Engineer failed: {error}",
-            "attempts": 1,
-            "agent_trace": agent_trace
-        }
-    # Clean up solution
-    if solution.startswith("```python"):
-        solution = solution[9:]
-    if solution.startswith("```"):
-        solution = solution[3:]
-    if solution.endswith("```"):
-        solution = solution[:-3]
-    solution = solution.strip()
-    # Ensure function signature is present
-    if f"def {entry_point}" not in solution:
-        lines = solution.split('\n')
-        indented_lines = ['    ' + line if line.strip() and not line.startswith('    ') else line for line in lines]
-        solution = problem["prompt"] + '\n'.join(indented_lines)
-    # RARV Loop: QA -> Reviewer -> Engineer-Fix
-    for attempt in range(1, MAX_RETRIES + 1):
-        # Step 3: QA tests the solution
-        qa_result = qa_agent(problem, solution)
-        agent_trace.append({"agent": "QA", "passed": qa_result["passed"], "error": qa_result.get("error")})
-        if qa_result["passed"]:
-            return {
-                "task_id": task_id,
-                "solution": solution,
-                "passed": True,
-                "error": None,
-                "attempts": attempt,
-                "agent_trace": agent_trace
-            }
-        if attempt >= MAX_RETRIES:
-            break
-        # Step 4: Reviewer analyzes failure
-        review = reviewer_agent(problem, solution, qa_result)
-        agent_trace.append({"agent": "Reviewer", "feedback": review["feedback"][:200] if review["feedback"] else None})
-        # Step 5: Engineer fixes based on feedback
-        new_solution, error = engineer_fix_agent(problem, solution, review["feedback"], attempt + 1)
-        agent_trace.append({"agent": f"Engineer-Fix-{attempt+1}", "output": new_solution[:200] if new_solution else None, "error": error})
-        if new_solution and not error:
-            # Clean up
-            if new_solution.startswith("```python"):
-                new_solution = new_solution[9:]
-            if new_solution.startswith("```"):
-                new_solution = new_solution[3:]
-            if new_solution.endswith("```"):
-                new_solution = new_solution[:-3]
-            new_solution = new_solution.strip()
-            if f"def {entry_point}" not in new_solution:
-                lines = new_solution.split('\n')
-                indented_lines = ['    ' + line if line.strip() and not line.startswith('    ') else line for line in lines]
-                new_solution = problem["prompt"] + '\n'.join(indented_lines)
-            solution = new_solution
-    return {
-        "task_id": task_id,
-        "solution": solution,
-        "passed": False,
-        "error": f"Failed after {MAX_RETRIES} RARV attempts",
-        "attempts": MAX_RETRIES,
-        "agent_trace": agent_trace
-    }
-# Run benchmark
-results = {
-    "benchmark": "HumanEval-LokiMode",
-    "mode": "multi-agent",
-    "version": "1.0",
-    "timestamp": datetime.now().isoformat(),
-    "model": CLAUDE_MODEL,
-    "max_retries": MAX_RETRIES,
-    "total_problems": len(problems),
-    "problems": []
-}
-start_time = time.time()
-passed_count = 0
-failed_count = 0
-error_count = 0
-total_attempts = 0
-for i, problem in enumerate(problems):
-    task_id = problem["task_id"]
-    task_num = int(task_id.split("/")[1])
-    print(f"[{i+1}/{len(problems)}] {task_id}...", end=" ", flush=True)
-    problem_result = solve_with_loki_mode(problem)
-    # Save solution
-    solution_file = f"{solutions_dir}/{task_num}.py"
-    with open(solution_file, 'w') as f:
-        f.write(f"# {task_id}\n")
-        f.write(f"# Loki Mode Multi-Agent Solution\n")
-        f.write(f"# Attempts: {problem_result['attempts']}\n")
-        f.write(f"# Passed: {problem_result['passed']}\n\n")
-        if problem_result["solution"]:
-            f.write(problem_result["solution"])
-    # Track results
-    total_attempts += problem_result["attempts"]
-    if problem_result["passed"]:
-        passed_count += 1
-        attempts_str = f"(attempt {problem_result['attempts']})" if problem_result['attempts'] > 1 else ""
-        print(f"\033[0;32mPASSED\033[0m {attempts_str}")
-    elif problem_result["error"] and "failed" in problem_result["error"].lower():
-        error_count += 1
-        print(f"\033[0;31mERROR\033[0m - {problem_result['error'][:50]}")
-    else:
-        failed_count += 1
-        print(f"\033[0;33mFAILED\033[0m after {problem_result['attempts']} attempts")
-    # Store result (without full trace to save space)
-    results["problems"].append({
-        "task_id": task_id,
-        "passed": problem_result["passed"],
-        "attempts": problem_result["attempts"],
-        "error": problem_result.get("error")
-    })
-elapsed_time = time.time() - start_time
-# Final results
-results["passed"] = passed_count
-results["failed"] = failed_count
-results["errors"] = error_count
-results["pass_rate"] = (passed_count / len(problems)) * 100 if problems else 0
-results["avg_attempts"] = total_attempts / len(problems) if problems else 0
-results["elapsed_time"] = elapsed_time
-with open(results_file, 'w') as f:
-    json.dump(results, f, indent=2)
-pass_rate = results["pass_rate"]
-avg_attempts = results["avg_attempts"]
-print(f"\n{'='*70}")
-print(f"  LOKI MODE RESULTS")
-print(f"{'='*70}")
-print(f"  Passed:       {passed_count}/{len(problems)} ({pass_rate:.1f}%)")
-print(f"  Failed:       {failed_count}/{len(problems)}")
-print(f"  Errors:       {error_count}/{len(problems)}")
-print(f"  Avg Attempts: {avg_attempts:.2f}")
-print(f"  Time:         {elapsed_time:.1f}s ({elapsed_time/len(problems):.1f}s avg)")
-print(f"{'='*70}")
-print(f"\n  Comparison (baseline: MetaGPT 85.9-87.7%):")
-print(f"  - MetaGPT (multi-agent):     85.9-87.7%")
-print(f"  - Direct Claude:             98.17% (from previous run)")
-print(f"  - Loki Mode (multi-agent):   {pass_rate:.1f}%")
-if pass_rate >= 98:
-    print(f"  Status: \033[0;32mEXCELLENT - Beats both!\033[0m")
-elif pass_rate >= 90:
-    print(f"  Status: \033[0;32mGREAT - Beats MetaGPT\033[0m")
-elif pass_rate >= 85:
-    print(f"  Status: \033[0;33mCOMPETITIVE with MetaGPT\033[0m")
-else:
-    print(f"  Status: \033[0;31mBELOW MetaGPT baseline\033[0m")
-print(f"{'='*70}\n")
-HUMANEVAL_LOKI
-    log_success "Loki Mode HumanEval benchmark complete"
-    log_info "Results: $results_file"
-    log_info "Solutions: $solutions_dir/"
-}
-#===============================================================================
-# SWE-bench Benchmark
-#===============================================================================
-download_swebench() {
-    local dataset_file="$SCRIPT_DIR/datasets/swebench-lite.json"
-    if [ -f "$dataset_file" ]; then
-        log_info "SWE-bench Lite dataset already downloaded"
-        return
-    fi
-    log_info "Downloading SWE-bench Lite dataset..."
-    python3 << 'SWEBENCH_DOWNLOAD'
-import json
-import os
-SCRIPT_DIR = os.environ.get('SCRIPT_DIR', '.')
-# Create placeholder dataset structure
-dataset = {
-    "name": "SWE-bench Lite",
-    "version": "1.0",
-    "description": "300 real-world GitHub issues for evaluation",
-    "source": "https://github.com/SWE-bench/SWE-bench",
-    "problems": 300,
-    "status": "PLACEHOLDER",
-    "install_command": "pip install swebench",
-    "run_command": "python -m swebench.harness.run_evaluation"
-}
-with open(f"{SCRIPT_DIR}/datasets/swebench-lite.json", 'w') as f:
-    json.dump(dataset, f, indent=2)
-print("SWE-bench Lite metadata saved")
-SWEBENCH_DOWNLOAD
-    log_success "SWE-bench Lite dataset metadata ready"
-}
-# Entry point for the SWE-bench Lite benchmark: prepare the metadata, then
-# dispatch on EXECUTE_MODE / LOKI_MODE to setup-only, direct or multi-agent
-# execution.
-run_swebench() {
-    log_info "Running SWE-bench Lite benchmark..."
-    download_swebench
-    # Without --execute only the infrastructure check runs.
-    if [ "$EXECUTE_MODE" != true ]; then
-        run_swebench_setup
-        return
-    fi
-    case "$LOKI_MODE" in
-        true) run_swebench_loki ;;
-        *)    run_swebench_execute ;;
-    esac
-}
-# Setup-only mode for SWE-bench: write an "INFRASTRUCTURE_READY" results file
-# with install/evaluation hints; no problems are run.
-# Globals read: RESULTS_DIR
-# Returns 1 when the embedded Python step fails.
-run_swebench_setup() {
-    local results_file="$RESULTS_DIR/swebench-results.json"
-    # The Python snippet reads RESULTS_DIR from the environment, so pass it
-    # explicitly instead of relying on an export made elsewhere.
-    if ! RESULTS_DIR="$RESULTS_DIR" python3 << 'SWEBENCH_SETUP'
-import json
-import os
-from datetime import datetime
-RESULTS_DIR = os.environ.get('RESULTS_DIR', './results')
-results = {
-    "benchmark": "SWE-bench Lite",
-    "version": "1.0",
-    "timestamp": datetime.now().isoformat(),
-    "total_problems": 300,
-    "status": "INFRASTRUCTURE_READY",
-    "note": "Install swebench package for full evaluation.",
-    "install": "pip install swebench",
-    "evaluation": "python -m swebench.harness.run_evaluation --predictions predictions.json"
-}
-with open(f"{RESULTS_DIR}/swebench-results.json", 'w') as f:
-    json.dump(results, f, indent=2)
-print(f"Results saved to {RESULTS_DIR}/swebench-results.json")
-SWEBENCH_SETUP
-    then
-        # Do not claim readiness when the results file could not be written.
-        log_error "SWE-bench benchmark setup failed"
-        return 1
-    fi
-    log_success "SWE-bench benchmark infrastructure ready"
-    log_info "Results: $results_file"
-}
-run_swebench_execute() {
-    log_info "Executing SWE-bench Lite benchmark..."
-    # Check if swebench is installed
-    if ! python3 -c "import swebench" 2>/dev/null; then
-        log_warning "SWE-bench package not installed. Installing..."
-        pip install -q swebench datasets
-    fi
-    export PROBLEM_LIMIT PROBLEM_TIMEOUT CLAUDE_MODEL
-    python3 << 'SWEBENCH_EXECUTE'
-import json
-import subprocess
-import os
-import sys
-import time
-import tempfile
-import shutil
-from datetime import datetime
-try:
-    from datasets import load_dataset
-    from swebench.harness.constants import MAP_REPO_TO_TEST_FRAMEWORK
-except ImportError:
-    print("Installing SWE-bench dependencies...")
-    subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'swebench', 'datasets'])
-    from datasets import load_dataset
-SCRIPT_DIR = os.environ.get('SCRIPT_DIR', '.')
-RESULTS_DIR = os.environ.get('RESULTS_DIR', './results')
-PROBLEM_LIMIT = int(os.environ.get('PROBLEM_LIMIT', '10'))  # Default to 10 for SWE-bench
-PROBLEM_TIMEOUT = int(os.environ.get('PROBLEM_TIMEOUT', '300'))
-CLAUDE_MODEL = os.environ.get('CLAUDE_MODEL', 'sonnet')
-results_file = f"{RESULTS_DIR}/swebench-results.json"
-patches_dir = f"{RESULTS_DIR}/swebench-patches"
-os.makedirs(patches_dir, exist_ok=True)
-print(f"\n{'='*60}")
-print(f"  SWE-bench Lite Benchmark Execution")
-print(f"  Limit: {PROBLEM_LIMIT} | Model: {CLAUDE_MODEL}")
-print(f"{'='*60}\n")
-# Load SWE-bench Lite dataset
-print("Loading SWE-bench Lite dataset...")
-try:
-    dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split="test")
-    problems = list(dataset)[:PROBLEM_LIMIT]
-    print(f"Loaded {len(problems)} problems")
-except Exception as e:
-    print(f"Error loading dataset: {e}")
-    print("Using placeholder results...")
-    results = {
-        "benchmark": "SWE-bench Lite",
-        "version": "1.0",
-        "timestamp": datetime.now().isoformat(),
-        "status": "DATASET_ERROR",
-        "error": str(e),
-        "note": "Could not load SWE-bench dataset. Check network and try again."
-    }
-    with open(results_file, 'w') as f:
-        json.dump(results, f, indent=2)
-    sys.exit(1)
-def solve_swebench_problem(problem):
-    """Generate a patch for a SWE-bench problem using Claude."""
-    instance_id = problem["instance_id"]
-    repo = problem["repo"]
-    base_commit = problem["base_commit"]
-    problem_statement = problem["problem_statement"]
-    hints = problem.get("hints_text", "")
-    # Create prompt for Claude
-    prompt = f'''You are solving a real GitHub issue from the {repo} repository.
-## Problem Statement
-{problem_statement}
-## Hints
-{hints if hints else "No hints available."}
-## Task
-Generate a git patch (unified diff format) that fixes this issue.
-Output ONLY the patch content in unified diff format. Example format:
---- a/file.py
-+++ b/file.py
-@@ -10,6 +10,7 @@
- existing line
-+new line
- existing line
-Do not include any explanation or markdown code blocks. Just the raw patch.'''
-    try:
-        result = subprocess.run(
-            ['claude', '-p', prompt, '--model', CLAUDE_MODEL],
-            capture_output=True,
-            text=True,
-            timeout=PROBLEM_TIMEOUT
-        )
-        patch = result.stdout.strip()
-        # Clean up patch if wrapped in markdown
-        if patch.startswith("```"):
-            lines = patch.split("\n")
-            patch = "\n".join(lines[1:-1] if lines[-1] == "```" else lines[1:])
-        return {
-            "instance_id": instance_id,
-            "model_patch": patch,
-            "error": None
-        }
-    except subprocess.TimeoutExpired:
-        return {"instance_id": instance_id, "model_patch": None, "error": "TIMEOUT"}
-    except Exception as e:
-        return {"instance_id": instance_id, "model_patch": None, "error": str(e)}
-# Run benchmark
-results = {
-    "benchmark": "SWE-bench Lite",
-    "version": "1.0",
-    "timestamp": datetime.now().isoformat(),
-    "model": CLAUDE_MODEL,
-    "timeout_per_problem": PROBLEM_TIMEOUT,
-    "total_problems": len(problems),
-    "status": "RUNNING",
-    "predictions": []
-}
-generated_count = 0
-error_count = 0
-start_time = time.time()
-for i, problem in enumerate(problems):
-    instance_id = problem["instance_id"]
-    print(f"[{i+1}/{len(problems)}] {instance_id}...", end=" ", flush=True)
-    solution = solve_swebench_problem(problem)
-    if solution["error"]:
-        print(f"\033[0;31mERROR: {solution['error']}\033[0m")
-        error_count += 1
-    else:
-        print(f"\033[0;32mGENERATED\033[0m")
-        generated_count += 1
-        # Save patch
-        patch_file = f"{patches_dir}/{instance_id.replace('/', '_')}.patch"
-        with open(patch_file, 'w') as f:
-            f.write(solution["model_patch"])
-    # Add to predictions (format required by SWE-bench evaluator)
-    results["predictions"].append({
-        "instance_id": instance_id,
-        "model_patch": solution["model_patch"] or "",
-        "model_name_or_path": f"loki-mode-{CLAUDE_MODEL}"
-    })
-    # Save intermediate results
-    with open(results_file, 'w') as f:
-        json.dump(results, f, indent=2)
-# Save predictions file for SWE-bench evaluator
-predictions_file = f"{RESULTS_DIR}/swebench-predictions.json"
-with open(predictions_file, 'w') as f:
-    json.dump(results["predictions"], f, indent=2)
-elapsed_time = time.time() - start_time
-results["status"] = "PATCHES_GENERATED"
-results["generated"] = generated_count
-results["errors"] = error_count
-results["elapsed_seconds"] = round(elapsed_time, 2)
-results["predictions_file"] = predictions_file
-results["next_step"] = "Run: python -m swebench.harness.run_evaluation --predictions " + predictions_file
-with open(results_file, 'w') as f:
-    json.dump(results, f, indent=2)
-print(f"\n{'='*60}")
-print(f"  RESULTS")
-print(f"{'='*60}")
-print(f"  Generated: {generated_count}/{len(problems)}")
-print(f"  Errors:    {error_count}/{len(problems)}")
-print(f"  Time:      {elapsed_time:.1f}s")
-print(f"{'='*60}")
-print(f"\n  Next Step: Run SWE-bench evaluator")
-print(f"  python -m swebench.harness.run_evaluation \\")
-print(f"    --predictions {predictions_file} \\")
-print(f"    --max_workers 4")
-print(f"{'='*60}\n")
-SWEBENCH_EXECUTE
-    log_success "SWE-bench patch generation complete"
-    log_info "Results: $RESULTS_DIR/swebench-results.json"
-    log_info "Predictions: $RESULTS_DIR/swebench-predictions.json"
-}
-#===============================================================================
-# Loki Mode Multi-Agent SWE-bench Benchmark
-# Uses: Architect -> Engineer -> QA -> Reviewer with RARV cycle
-#===============================================================================
-run_swebench_loki() {
-    log_info "Executing SWE-bench Lite with Loki Mode Multi-Agent System..."
-    log_info "Model: $CLAUDE_MODEL | Retries: $MAX_RETRIES | Limit: ${PROBLEM_LIMIT:-all}"
-    log_info "Agents: Architect -> Engineer -> QA -> Reviewer (RARV cycle)"
-    log_info "Trajectory logging: ENABLED (for official submission)"
-    # Check if swebench is installed
-    if ! python3 -c "import swebench" 2>/dev/null; then
-        log_warning "SWE-bench package not installed. Installing..."
-        pip install -q swebench datasets
-    fi
-    export PROBLEM_LIMIT PROBLEM_TIMEOUT CLAUDE_MODEL MAX_RETRIES
-    python3 << 'SWEBENCH_LOKI'
-import json
-import subprocess
-import os
-import sys
-import time
-import re
-from datetime import datetime
-try:
-    from datasets import load_dataset
-except ImportError:
-    subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'swebench', 'datasets'])
-    from datasets import load_dataset
-SCRIPT_DIR = os.environ.get('SCRIPT_DIR', '.')
-RESULTS_DIR = os.environ.get('RESULTS_DIR', './results')
-PROBLEM_LIMIT = int(os.environ.get('PROBLEM_LIMIT', '0'))
-PROBLEM_TIMEOUT = int(os.environ.get('PROBLEM_TIMEOUT', '300'))
-CLAUDE_MODEL = os.environ.get('CLAUDE_MODEL', 'sonnet')
-MAX_RETRIES = int(os.environ.get('MAX_RETRIES', '3'))
-results_file = f"{RESULTS_DIR}/swebench-loki-results.json"
-patches_dir = f"{RESULTS_DIR}/swebench-loki-patches"
-trajs_dir = f"{RESULTS_DIR}/trajs"  # Trajectory logs for official submission
-logs_dir = f"{RESULTS_DIR}/logs"     # Execution logs for official submission
-os.makedirs(patches_dir, exist_ok=True)
-os.makedirs(trajs_dir, exist_ok=True)
-os.makedirs(logs_dir, exist_ok=True)
-print(f"\n{'='*70}")
-print(f"  LOKI MODE Multi-Agent SWE-bench Lite Benchmark")
-print(f"  Limit: {PROBLEM_LIMIT if PROBLEM_LIMIT > 0 else 'all'} | Model: {CLAUDE_MODEL} | Max Retries: {MAX_RETRIES}")
-print(f"  Agent Pipeline: Architect -> Engineer -> QA -> Reviewer")
-print(f"{'='*70}\n")
-# Load dataset
-print("Loading SWE-bench Lite dataset...")
-try:
-    dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split="test")
-    problems = list(dataset)
-    if PROBLEM_LIMIT > 0:
-        problems = problems[:PROBLEM_LIMIT]
-    print(f"Loaded {len(problems)} problems")
-except Exception as e:
-    print(f"Error loading dataset: {e}")
-    sys.exit(1)
-def call_agent(agent_name, prompt, timeout=PROBLEM_TIMEOUT):
-    """Call a Loki Mode agent with a specific role. Returns (output, error, metadata)."""
-    start_time = time.time()
-    try:
-        result = subprocess.run(
-            ['claude', '-p', prompt, '--model', CLAUDE_MODEL],
-            capture_output=True,
-            text=True,
-            timeout=timeout
-        )
-        elapsed = time.time() - start_time
-        return result.stdout.strip(), None, {
-            "agent": agent_name,
-            "model": CLAUDE_MODEL,
-            "elapsed_seconds": round(elapsed, 2),
-            "prompt_length": len(prompt),
-            "output_length": len(result.stdout),
-            "timestamp": datetime.now().isoformat()
-        }
-    except subprocess.TimeoutExpired:
-        elapsed = time.time() - start_time
-        return None, "TIMEOUT", {
-            "agent": agent_name,
-            "model": CLAUDE_MODEL,
-            "elapsed_seconds": round(elapsed, 2),
-            "error": "TIMEOUT",
-            "timestamp": datetime.now().isoformat()
-        }
-    except Exception as e:
-        return None, str(e), {
-            "agent": agent_name,
-            "error": str(e),
-            "timestamp": datetime.now().isoformat()
-        }
-def architect_agent(problem):
-    """Architect: Analyze the issue and design the fix approach."""
-    prompt = f'''You are the ARCHITECT AGENT analyzing a GitHub issue.
-REPOSITORY: {problem["repo"]}
-ISSUE:
-{problem["problem_statement"]}
-HINTS:
-{problem.get("hints_text", "No hints available.")}
-Your job:
-1. Understand what the issue is about
-2. Identify which file(s) likely need to be changed
-3. Describe the fix approach (2-3 sentences)
-4. Note any edge cases
-Output a brief analysis (5-7 lines max) with:
-- What the bug/issue is
-- Files likely affected
-- Fix strategy
-Keep it concise - the Engineer agent will generate the patch.'''
-    output, error, metadata = call_agent("Architect", prompt, timeout=120)
-    metadata["prompt"] = prompt
-    metadata["output"] = output
-    return output, error, metadata
-def engineer_agent(problem, architect_analysis):
-    """Engineer: Generate the patch based on architect's analysis."""
-    prompt = f'''You are the ENGINEER AGENT generating a patch for a GitHub issue.
-REPOSITORY: {problem["repo"]}
-ISSUE:
-{problem["problem_statement"]}
-ARCHITECT'S ANALYSIS:
-{architect_analysis}
-Generate a git patch (unified diff format) that fixes this issue.
-IMPORTANT:
-1. Output ONLY the patch in unified diff format
-2. Include proper file paths with a/ and b/ prefixes
-3. Include @@ line numbers
-4. No explanations, no markdown code blocks, just raw patch
-Example format:
---- a/path/to/file.py
-+++ b/path/to/file.py
-@@ -10,6 +10,7 @@
- existing line
-+new line
- existing line
-Generate the patch now:'''
-    output, error, metadata = call_agent("Engineer", prompt)
-    metadata["prompt"] = prompt
-    metadata["output"] = output
-    return output, error, metadata
-def qa_agent(patch):
-    """QA: Validate the patch format. Returns validation result with metadata."""
-    start_time = time.time()
-    if not patch:
-        return {"valid": False, "error": "Empty patch", "checks": [], "timestamp": datetime.now().isoformat()}
-    checks = []
-    # Check for basic patch structure
-    has_diff_header = "---" in patch and "+++" in patch
-    checks.append({"check": "diff_headers", "passed": has_diff_header})
-    has_hunk_header = "@@" in patch
-    checks.append({"check": "hunk_headers", "passed": has_hunk_header})
-    has_changes = "+" in patch or "-" in patch
-    checks.append({"check": "has_changes", "passed": has_changes})
-    # Check for markdown wrapping (common error)
-    is_wrapped = patch.startswith("```")
-    checks.append({"check": "no_markdown_wrap", "passed": not is_wrapped})
-    # Check for proper file paths
-    has_path_prefixes = "a/" in patch and "b/" in patch
-    checks.append({"check": "path_prefixes", "passed": has_path_prefixes})
-    elapsed = time.time() - start_time
-    if is_wrapped:
-        return {"valid": False, "error": "Patch wrapped in markdown code blocks", "checks": checks, "elapsed_seconds": round(elapsed, 2), "timestamp": datetime.now().isoformat()}
-    if not has_diff_header:
-        return {"valid": False, "error": "Missing diff headers (--- and +++)", "checks": checks, "elapsed_seconds": round(elapsed, 2), "timestamp": datetime.now().isoformat()}
-    if not has_hunk_header:
-        return {"valid": False, "error": "Missing hunk headers (@@)", "checks": checks, "elapsed_seconds": round(elapsed, 2), "timestamp": datetime.now().isoformat()}
-    if not has_changes:
-        return {"valid": False, "error": "No actual changes in patch", "checks": checks, "elapsed_seconds": round(elapsed, 2), "timestamp": datetime.now().isoformat()}
-    if not has_path_prefixes:
-        return {"valid": False, "error": "Missing a/ or b/ path prefixes", "checks": checks, "elapsed_seconds": round(elapsed, 2), "timestamp": datetime.now().isoformat()}
-    return {"valid": True, "error": None, "checks": checks, "elapsed_seconds": round(elapsed, 2), "timestamp": datetime.now().isoformat()}
-def reviewer_agent(problem, patch, qa_result):
-    """Reviewer: Analyze patch issues and suggest fixes."""
-    if qa_result["valid"]:
-        return {"approved": True, "feedback": "Patch format is valid", "metadata": {"agent": "Reviewer", "skipped": True, "timestamp": datetime.now().isoformat()}}
-    prompt = f'''You are the CODE REVIEWER AGENT. The generated patch has format issues.
-ISSUE:
-{problem["problem_statement"][:500]}
-CURRENT PATCH:
-{patch[:1000] if patch else "Empty"}
-FORMAT ERROR:
-{qa_result["error"]}
-Provide brief feedback (2-3 lines) on how to fix the patch format:
-- What's wrong
-- How to fix it'''
-    feedback, error, metadata = call_agent("Reviewer", prompt, timeout=60)
-    metadata["prompt"] = prompt
-    metadata["output"] = feedback
-    return {"approved": False, "feedback": feedback or qa_result["error"], "error": error, "metadata": metadata}
-def engineer_fix_agent(problem, patch, feedback, attempt):
-    """Engineer: Fix the patch based on reviewer feedback."""
-    prompt = f'''You are the ENGINEER AGENT. Your previous patch had format issues.
-ISSUE:
-{problem["problem_statement"][:500]}
-PREVIOUS PATCH:
-{patch[:1000] if patch else "Empty"}
-REVIEWER FEEDBACK:
-{feedback}
-ATTEMPT: {attempt}/{MAX_RETRIES}
-Generate a CORRECTED patch in proper unified diff format.
-Output ONLY the raw patch - no explanations, no markdown.
---- a/path/to/file.py
-+++ b/path/to/file.py
-@@ -line,count +line,count @@
-...'''
-    output, error, metadata = call_agent("Engineer-Fix", prompt)
-    metadata["prompt"] = prompt
-    metadata["output"] = output
-    metadata["attempt"] = attempt
-    return output, error, metadata
-def clean_patch(patch):
-    """Clean up patch by removing markdown wrapping."""
-    if not patch:
-        return patch
-    if patch.startswith("```"):
-        lines = patch.split("\n")
-        # Remove first and last lines if they're markdown
-        if lines[0].startswith("```"):
-            lines = lines[1:]
-        if lines and lines[-1].strip() == "```":
-            lines = lines[:-1]
-        patch = "\n".join(lines)
-    return patch.strip()
-def save_trajectory(instance_id, trajectory_steps):
-    """Save the full reasoning trajectory to a file for official submission."""
-    safe_id = instance_id.replace("/", "_").replace(":", "_")
-    traj_file = f"{trajs_dir}/{safe_id}.md"
-    with open(traj_file, 'w') as f:
-        f.write(f"# Trajectory: {instance_id}\n\n")
-        f.write(f"**Generated by:** Loki Mode Multi-Agent System\n")
-        f.write(f"**Model:** {CLAUDE_MODEL}\n")
-        f.write(f"**Timestamp:** {datetime.now().isoformat()}\n\n")
-        f.write("---\n\n")
-        for i, step in enumerate(trajectory_steps, 1):
-            f.write(f"## Step {i}: {step['agent']}\n\n")
-            f.write(f"**Timestamp:** {step.get('timestamp', 'N/A')}\n")
-            f.write(f"**Duration:** {step.get('elapsed_seconds', 'N/A')}s\n\n")
-            if step.get('prompt'):
-                f.write("### Prompt\n\n```\n")
-                f.write(step['prompt'][:2000])
-                if len(step.get('prompt', '')) > 2000:
-                    f.write("\n... (truncated)")
-                f.write("\n```\n\n")
-            if step.get('output'):
-                f.write("### Output\n\n```\n")
-                f.write(step['output'])
-                f.write("\n```\n\n")
-            if step.get('error'):
-                f.write(f"### Error\n\n`{step['error']}`\n\n")
-            if step.get('checks'):
-                f.write("### Validation Checks\n\n")
-                for check in step['checks']:
-                    status = "PASS" if check['passed'] else "FAIL"
-                    f.write(f"- {check['check']}: {status}\n")
-                f.write("\n")
-            f.write("---\n\n")
-    return traj_file
-def save_logs(instance_id, patch, result):
-    """Save execution logs for official submission."""
-    safe_id = instance_id.replace("/", "_").replace(":", "_")
-    log_dir = f"{logs_dir}/{safe_id}"
-    os.makedirs(log_dir, exist_ok=True)
-    # Save patch.diff
-    patch_file = f"{log_dir}/patch.diff"
-    with open(patch_file, 'w') as f:
-        f.write(patch or "")
-    # Save report.json
-    report_file = f"{log_dir}/report.json"
-    report = {
-        "instance_id": instance_id,
-        "model_name_or_path": f"loki-mode-{CLAUDE_MODEL}",
-        "model_patch": patch or "",
-        "attempts": result.get("attempts", 1),
-        "success": result.get("error") is None,
-        "error": result.get("error"),
-        "timestamp": datetime.now().isoformat()
-    }
-    with open(report_file, 'w') as f:
-        json.dump(report, f, indent=2)
-    # Save test_output.txt (placeholder - would be filled by actual test run)
-    test_file = f"{log_dir}/test_output.txt"
-    with open(test_file, 'w') as f:
-        f.write(f"# Test output for {instance_id}\n")
-        f.write(f"# Generated by Loki Mode\n")
-        f.write(f"# Note: Run SWE-bench harness for actual test results\n\n")
-        f.write(f"Patch generated: {'Yes' if patch else 'No'}\n")
-        f.write(f"Attempts: {result.get('attempts', 1)}\n")
-        f.write(f"Error: {result.get('error', 'None')}\n")
-    return log_dir
-def solve_with_loki_mode(problem):
-    """Solve SWE-bench problem using Loki Mode multi-agent system with full trajectory logging."""
-    instance_id = problem["instance_id"]
-    trajectory_steps = []  # Full trajectory for official submission
-    agent_trace = []       # Summary trace for results JSON
-    # Step 1: Architect analyzes the issue
-    architect_analysis, error, arch_meta = architect_agent(problem)
-    trajectory_steps.append(arch_meta)
-    agent_trace.append({"agent": "Architect", "output": architect_analysis[:200] if architect_analysis else None, "error": error})
-    if error:
-        result = {
-            "instance_id": instance_id,
-            "model_patch": None,
-            "error": f"Architect failed: {error}",
-            "attempts": 1,
-            "agent_trace": agent_trace
-        }
-        save_trajectory(instance_id, trajectory_steps)
-        save_logs(instance_id, None, result)
-        return result
-    # Step 2: Engineer generates patch
-    patch, error, eng_meta = engineer_agent(problem, architect_analysis)
-    trajectory_steps.append(eng_meta)
-    agent_trace.append({"agent": "Engineer", "output": patch[:200] if patch else None, "error": error})
-    if error or not patch:
-        result = {
-            "instance_id": instance_id,
-            "model_patch": None,
-            "error": f"Engineer failed: {error}",
-            "attempts": 1,
-            "agent_trace": agent_trace
-        }
-        save_trajectory(instance_id, trajectory_steps)
-        save_logs(instance_id, None, result)
-        return result
-    patch = clean_patch(patch)
-    # RARV Loop: QA -> Reviewer -> Engineer-Fix
-    for attempt in range(1, MAX_RETRIES + 1):
-        # Step 3: QA validates patch format
-        qa_result = qa_agent(patch)
-        trajectory_steps.append({
-            "agent": "QA",
-            "timestamp": qa_result.get("timestamp"),
-            "elapsed_seconds": qa_result.get("elapsed_seconds"),
-            "output": f"Valid: {qa_result['valid']}, Error: {qa_result.get('error')}",
-            "checks": qa_result.get("checks", [])
-        })
-        agent_trace.append({"agent": "QA", "valid": qa_result["valid"], "error": qa_result.get("error")})
-        if qa_result["valid"]:
-            result = {
-                "instance_id": instance_id,
-                "model_patch": patch,
-                "error": None,
-                "attempts": attempt,
-                "agent_trace": agent_trace
-            }
-            save_trajectory(instance_id, trajectory_steps)
-            save_logs(instance_id, patch, result)
-            return result
-        if attempt >= MAX_RETRIES:
-            break
-        # Step 4: Reviewer analyzes issues
-        review = reviewer_agent(problem, patch, qa_result)
-        if review.get("metadata"):
-            trajectory_steps.append(review["metadata"])
-        agent_trace.append({"agent": "Reviewer", "feedback": review["feedback"][:200] if review.get("feedback") else None})
-        # Step 5: Engineer fixes patch
-        new_patch, error, fix_meta = engineer_fix_agent(problem, patch, review["feedback"], attempt + 1)
-        trajectory_steps.append(fix_meta)
-        agent_trace.append({"agent": f"Engineer-Fix-{attempt+1}", "output": new_patch[:200] if new_patch else None, "error": error})
-        if new_patch and not error:
-            patch = clean_patch(new_patch)
-    # Return even if format isn't perfect - let SWE-bench evaluator handle it
-    result = {
-        "instance_id": instance_id,
-        "model_patch": patch,
-        "error": f"Format issues after {MAX_RETRIES} attempts",
-        "attempts": MAX_RETRIES,
-        "agent_trace": agent_trace
-    }
-    save_trajectory(instance_id, trajectory_steps)
-    save_logs(instance_id, patch, result)
-    return result
-# Run benchmark
-results = {
-    "benchmark": "SWE-bench-LokiMode",
-    "mode": "multi-agent",
-    "version": "1.0",
-    "timestamp": datetime.now().isoformat(),
-    "model": CLAUDE_MODEL,
-    "max_retries": MAX_RETRIES,
-    "total_problems": len(problems),
-    "predictions": []
-}
-start_time = time.time()
-generated_count = 0
-fixed_by_rarv = 0
-error_count = 0
-total_attempts = 0
-for i, problem in enumerate(problems):
-    instance_id = problem["instance_id"]
-    print(f"[{i+1}/{len(problems)}] {instance_id}...", end=" ", flush=True)
-    result = solve_with_loki_mode(problem)
-    total_attempts += result["attempts"]
-    # Save patch
-    patch_file = f"{patches_dir}/{instance_id.replace('/', '_')}.patch"
-    with open(patch_file, 'w') as f:
-        f.write(f"# {instance_id}\n")
-        f.write(f"# Loki Mode Multi-Agent Patch\n")
-        f.write(f"# Attempts: {result['attempts']}\n\n")
-        if result["model_patch"]:
-            f.write(result["model_patch"])
-    if result["model_patch"] and not (result.get("error") or "").startswith("Format"):
-        generated_count += 1
-        if result["attempts"] > 1:
-            fixed_by_rarv += 1
-            print(f"\033[0;32mGENERATED\033[0m (fixed on attempt {result['attempts']})")
-        else:
-            print(f"\033[0;32mGENERATED\033[0m")
-    elif result["model_patch"]:
-        generated_count += 1
-        print(f"\033[0;33mGENERATED\033[0m (format issues)")
-    else:
-        error_count += 1
-        print(f"\033[0;31mERROR\033[0m - {result.get('error', 'Unknown')[:40]}")
-    # Add to predictions
-    results["predictions"].append({
-        "instance_id": instance_id,
-        "model_patch": result["model_patch"] or "",
-        "model_name_or_path": f"loki-mode-{CLAUDE_MODEL}",
-        "attempts": result["attempts"]
-    })
-elapsed_time = time.time() - start_time
-# Save results
-results["generated"] = generated_count
-results["fixed_by_rarv"] = fixed_by_rarv
-results["errors"] = error_count
-results["avg_attempts"] = total_attempts / len(problems) if problems else 0
-results["elapsed_time"] = elapsed_time
-with open(results_file, 'w') as f:
-    json.dump(results, f, indent=2)
-# Save predictions for SWE-bench evaluator
-predictions_file = f"{RESULTS_DIR}/swebench-loki-predictions.json"
-with open(predictions_file, 'w') as f:
-    json.dump(results["predictions"], f, indent=2)
-gen_rate = (generated_count / len(problems)) * 100 if problems else 0
-print(f"\n{'='*70}")
-print(f"  LOKI MODE SWE-BENCH RESULTS")
-print(f"{'='*70}")
-print(f"  Generated:    {generated_count}/{len(problems)} ({gen_rate:.1f}%)")
-print(f"  Fixed by RARV: {fixed_by_rarv}")
-print(f"  Errors:       {error_count}/{len(problems)}")
-print(f"  Avg Attempts: {results['avg_attempts']:.2f}")
-print(f"  Time:         {elapsed_time:.1f}s ({elapsed_time/len(problems):.1f}s avg)")
-print(f"{'='*70}")
-print(f"\n  Output Files (for official submission):")
-print(f"  - Predictions: {predictions_file}")
-print(f"  - Trajectories: {trajs_dir}/ ({len(os.listdir(trajs_dir))} files)")
-print(f"  - Logs: {logs_dir}/ ({len(os.listdir(logs_dir))} dirs)")
-print(f"{'='*70}")
-print(f"\n  Comparison:")
-print(f"  - Direct Claude:             99.67% patch gen")
-print(f"  - Loki Mode (multi-agent):   {gen_rate:.1f}% patch gen")
-print(f"{'='*70}")
-print(f"\n  Next Step: Run SWE-bench evaluator")
-print(f"  python -m swebench.harness.run_evaluation \\")
-print(f"    --predictions {predictions_file}")
-print(f"{'='*70}\n")
-SWEBENCH_LOKI
-    log_success "Loki Mode SWE-bench patch generation complete"
-    log_info "Results: $RESULTS_DIR/swebench-loki-results.json"
-    log_info "Predictions: $RESULTS_DIR/swebench-loki-predictions.json"
-}
-#===============================================================================
-# Summary Report
-#===============================================================================
-generate_summary() {
-    log_info "Generating benchmark summary..."
-    local humaneval_results="$RESULTS_DIR/humaneval-results.json"
-    local swebench_results="$RESULTS_DIR/swebench-results.json"
-    python3 << SUMMARY_GEN
-import json
-import os
-from datetime import datetime
-RESULTS_DIR = os.environ.get('RESULTS_DIR', './results')
-summary = f"""# Loki Mode Benchmark Results
-**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
-## Overview
-This directory contains benchmark results for Loki Mode multi-agent system.
-"""
-# HumanEval results
-humaneval_file = f"{RESULTS_DIR}/humaneval-results.json"
-if os.path.exists(humaneval_file):
-    with open(humaneval_file) as f:
-        he = json.load(f)
-    if he.get("status") == "COMPLETED":
-        summary += f"""## HumanEval Results
-| Metric | Value |
-|--------|-------|
-| Problems | {he.get('total_problems', 'N/A')} |
-| Passed | {he.get('passed', 'N/A')} |
-| Failed | {he.get('failed', 'N/A')} |
-| **Pass Rate** | **{he.get('pass_rate', 'N/A')}%** |
-| Model | {he.get('model', 'N/A')} |
-| Time | {he.get('elapsed_seconds', 'N/A')}s |
-### Competitor Comparison
-| System | Pass@1 |
-|--------|--------|
-| MetaGPT | 85.9-87.7% |
-| **Loki Mode** | **{he.get('pass_rate', 'N/A')}%** |
-"""
-    else:
-        summary += f"""## HumanEval
-Status: {he.get('status', 'UNKNOWN')}
-To run: \`./benchmarks/run-benchmarks.sh humaneval --execute\`
-"""
-# SWE-bench results
-swebench_file = f"{RESULTS_DIR}/swebench-results.json"
-if os.path.exists(swebench_file):
-    with open(swebench_file) as f:
-        sb = json.load(f)
-    if sb.get("status") == "PATCHES_GENERATED":
-        summary += f"""## SWE-bench Lite Results
-| Metric | Value |
-|--------|-------|
-| Problems | {sb.get('total_problems', 'N/A')} |
-| Patches Generated | {sb.get('generated', 'N/A')} |
-| Errors | {sb.get('errors', 'N/A')} |
-| Model | {sb.get('model', 'N/A')} |
-| Time | {sb.get('elapsed_seconds', 'N/A')}s |
-**Next Step:** Run the SWE-bench evaluator to validate patches:
-\`\`\`bash
-python -m swebench.harness.run_evaluation \\
-    --predictions {sb.get('predictions_file', 'swebench-predictions.json')} \\
-    --max_workers 4
-\`\`\`
-"""
-    else:
-        summary += f"""## SWE-bench Lite
-Status: {sb.get('status', 'UNKNOWN')}
-To run: \`./benchmarks/run-benchmarks.sh swebench --execute\`
-"""
-summary += """## Methodology
-Loki Mode uses its multi-agent architecture to solve each problem:
-1. **Architect Agent** analyzes the problem
-2. **Engineer Agent** implements the solution
-3. **QA Agent** validates with test cases
-4. **Review Agent** checks code quality
-This mirrors real-world software development more accurately than single-agent approaches.
-## Running Benchmarks
-\`\`\`bash
-# Setup only (download datasets)
-./benchmarks/run-benchmarks.sh all
-# Execute with Claude
-./benchmarks/run-benchmarks.sh humaneval --execute
-./benchmarks/run-benchmarks.sh humaneval --execute --limit 10  # First 10 only
-./benchmarks/run-benchmarks.sh swebench --execute --limit 5    # First 5 only
-# Use different model
-./benchmarks/run-benchmarks.sh humaneval --execute --model opus
-\`\`\`
-"""
-with open(f"{RESULTS_DIR}/SUMMARY.md", 'w') as f:
-    f.write(summary)
-print(f"Summary saved to {RESULTS_DIR}/SUMMARY.md")
-SUMMARY_GEN
-    log_success "Summary generated: $RESULTS_DIR/SUMMARY.md"
-}
-#===============================================================================
-# Main
-#===============================================================================
-main() {
-    parse_args "$@"
-    echo ""
-    echo "========================================"
-    echo "  Loki Mode Benchmark Runner"
-    if [ "$EXECUTE_MODE" = true ]; then
-        echo "  Mode: EXECUTE"
-    else
-        echo "  Mode: SETUP"
-    fi
-    echo "========================================"
-    echo ""
-    export SCRIPT_DIR RESULTS_DIR PROJECT_DIR
-    setup_environment
-    case "$BENCHMARK" in
-        humaneval)
-            run_humaneval
-            ;;
-        swebench)
-            run_swebench
-            ;;
-        all)
-            run_humaneval
-            run_swebench
-            ;;
-        *)
-            log_error "Unknown benchmark: $BENCHMARK"
-            echo "Usage: $0 [humaneval|swebench|all] [--execute] [--limit N]"
-            exit 1
-            ;;
-    esac
-    generate_summary
-    echo ""
-    log_success "Benchmarks complete!"
-    log_info "Results directory: $RESULTS_DIR"
-    echo ""
-}
-main "$@"