harness-evolver 3.3.1 → 4.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,205 @@
1
+ #!/usr/bin/env python3
2
+ """Inject adversarial examples into LangSmith dataset.
3
+
4
+ Detects potential memorization by checking if agent outputs are suspiciously
5
+ similar to reference outputs, then generates adversarial variations to test
6
+ generalization.
7
+
8
+ Usage:
9
+ python3 adversarial_inject.py \
10
+ --config .evolver.json \
11
+ --experiment v003a \
12
+ --output adversarial_report.json
13
+ """
14
+
15
+ import argparse
16
+ import json
17
+ import os
18
+ import platform
19
+ import sys
20
+ import random
21
+
22
+
23
def _read_key_from_env_file(path):
    """Scan *path* for a LANGSMITH_API_KEY=... line; return the key or None.

    Surrounding single/double quotes are stripped so both plain credentials
    files and quoted .env entries (KEY="value") are accepted.  Unreadable
    files are treated the same as missing ones (best-effort lookup).
    """
    try:
        with open(path) as f:
            for raw in f:
                raw = raw.strip()
                # A commented-out line ("# LANGSMITH_API_KEY=...") cannot
                # start with the key prefix, so no separate '#' check needed.
                if raw.startswith("LANGSMITH_API_KEY="):
                    value = raw.split("=", 1)[1].strip().strip("'\"")
                    if value:
                        return value
    except OSError:
        pass
    return None


def ensure_langsmith_api_key():
    """Load LANGSMITH_API_KEY from credentials file or .env if not in env.

    Returns:
        True if LANGSMITH_API_KEY is present in os.environ afterwards,
        False if no key could be found anywhere.
    """
    if os.environ.get("LANGSMITH_API_KEY"):
        return True
    # Platform-specific location used by langsmith-cli.
    if platform.system() == "Darwin":
        creds_path = os.path.expanduser("~/Library/Application Support/langsmith-cli/credentials")
    else:
        creds_path = os.path.expanduser("~/.config/langsmith-cli/credentials")
    # Credentials file first, then project-local .env as a fallback.
    for candidate in (creds_path, ".env"):
        if os.path.exists(candidate):
            key = _read_key_from_env_file(candidate)
            if key:
                os.environ["LANGSMITH_API_KEY"] = key
                return True
    return False
56
+
57
+
58
def detect_memorization(client, experiment_name, dataset_name):
    """Check if agent outputs are suspiciously similar to reference outputs."""
    flagged = []
    try:
        root_runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=200))
        # Index dataset examples by stringified id for O(1) lookup per run.
        by_id = {}
        for ex in client.list_examples(dataset_name=dataset_name, limit=500):
            by_id[str(ex.id)] = ex

        for run in root_runs:
            ref_id = run.reference_example_id
            if not ref_id:
                continue
            ex = by_id.get(str(ref_id))
            if not ex or not ex.outputs:
                continue

            actual = str(run.outputs or "").lower().strip()
            expected = str(ex.outputs).lower().strip()
            if not actual or not expected:
                continue

            # Classify the match; only exact copies or long outputs that
            # embed the reference verbatim are considered suspicious.
            match_type = None
            if actual == expected:
                match_type = "exact"
            elif len(actual) > 50 and expected in actual:
                match_type = "contains_reference"

            if match_type:
                flagged.append({
                    "example_id": str(ref_id),
                    "match_type": match_type,
                    "input": str(run.inputs)[:200],
                })

    except Exception as e:
        # Best-effort check: report the failure but return what we have.
        print(f"Error checking memorization: {e}", file=sys.stderr)

    return flagged
95
+
96
+
97
def generate_adversarial_inputs(client, dataset_name, num_inputs=5):
    """Generate adversarial variations of existing examples.

    Creates multiple variation types to test generalization:
    - negation: inverts the question to test if the agent distinguishes
    - constraint: adds a constraint that changes the expected answer
    - ambiguous: makes the input ambiguous to test robustness
    - partial: provides incomplete input to test graceful handling

    Returns a list of {"inputs": ..., "metadata": ...} dicts ready to be
    injected into the dataset; empty list if the dataset has no examples.
    """
    examples = list(client.list_examples(dataset_name=dataset_name, limit=100))
    if not examples:
        return []

    adversarial = []
    sampled = random.sample(examples, min(num_inputs, len(examples)))

    variation_types = [
        ("negation", "What is NOT the case: {input}"),
        ("constraint", "{input} Answer in exactly one sentence."),
        ("ambiguous", "Someone asked something like: {input}"),
        ("partial", "{partial_input}"),
    ]

    # enumerate() replaces the original sampled.index(example) lookup, which
    # was O(n) per iteration and returned the wrong index when the dataset
    # contained duplicate example objects.
    for idx, example in enumerate(sampled):
        input_data = example.inputs or {}
        input_text = str(input_data.get("input", input_data))

        # Rotate through the variation types.
        vtype, template = variation_types[idx % len(variation_types)]

        if vtype == "partial":
            # Use roughly the first half of the input, but at least 3 words.
            words = input_text.split()
            partial = " ".join(words[:max(len(words) // 2, 3)])
            varied_input = template.format(partial_input=partial)
        else:
            varied_input = template.format(input=input_text)

        adversarial.append({
            "inputs": {"input": varied_input},
            "metadata": {
                "source": "adversarial",
                "original_example_id": str(example.id),
                "variation_type": vtype,
            },
        })

    return adversarial
146
+
147
+
148
def inject_adversarial(client, dataset_id, adversarial_inputs):
    """Add adversarial examples to dataset."""
    count = 0
    for item in adversarial_inputs:
        try:
            client.create_example(
                inputs=item["inputs"],
                dataset_id=dataset_id,
                metadata=item["metadata"],
            )
        except Exception as e:
            # Skip failed injections but report them; keep going.
            print(f"Failed to inject: {e}", file=sys.stderr)
        else:
            count += 1
    return count
162
+
163
+
164
def main():
    """CLI entry point: detect memorization, then generate (and optionally
    inject) adversarial examples, emitting a JSON report."""
    parser = argparse.ArgumentParser(description="Adversarial injection for evaluators")
    parser.add_argument("--config", default=".evolver.json")
    parser.add_argument("--experiment", required=True, help="Experiment to check for memorization")
    parser.add_argument("--output", default=None, help="Output report path")
    parser.add_argument("--inject", action="store_true", help="Actually inject adversarial examples")
    parser.add_argument("--num-adversarial", type=int, default=5, help="Number of adversarial examples")
    args = parser.parse_args()

    with open(args.config) as f:
        config = json.load(f)

    ensure_langsmith_api_key()
    # Imported lazily so the key is loaded into the environment first.
    from langsmith import Client
    client = Client()

    suspicious = detect_memorization(client, args.experiment, config["dataset"])
    adversarial = generate_adversarial_inputs(client, config["dataset"], args.num_adversarial)

    injected = 0
    if args.inject and adversarial:
        injected = inject_adversarial(client, config["dataset_id"], adversarial)

    report = {
        "memorization_suspects": len(suspicious),
        "suspicious_examples": suspicious,
        "adversarial_generated": len(adversarial),
        "adversarial_injected": injected,
    }

    rendered = json.dumps(report, indent=2)
    if args.output:
        with open(args.output, "w") as f:
            f.write(rendered)
    print(rendered)

    if suspicious:
        print(f"\nWARNING: {len(suspicious)} examples show potential memorization!", file=sys.stderr)


if __name__ == "__main__":
    main()
@@ -0,0 +1,235 @@
1
+ #!/usr/bin/env python3
2
+ """Cross-iteration memory consolidation for Harness Evolver.
3
+
4
+ Inspired by Claude Code's autoDream pattern. Analyzes evolution history
5
+ to identify recurring patterns, successful strategies, and wasted approaches.
6
+ Produces evolution_memory.md for proposer briefings.
7
+
8
+ Usage:
9
+ python3 consolidate.py --config .evolver.json --output evolution_memory.md
10
+ """
11
+
12
+ import argparse
13
+ import json
14
+ import os
15
+ import sys
16
+ from datetime import datetime, timezone
17
+
18
+
19
def orient(config):
    """Phase 1: Scan current state and history."""
    history = config.get("history", [])
    # The first history entry is the pre-evolution baseline.
    baseline = history[0]["score"] if history else 0
    best = config.get("best_score", 0)
    return {
        "iterations": config.get("iterations", 0),
        "best_score": best,
        "baseline_score": baseline,
        "improvement": best - baseline,
        "history": history,
    }
33
+
34
+
35
def gather(config, comparison_files):
    """Phase 2: Extract signals from trace insights and comparisons."""
    signals = {
        "winning_strategies": [],
        "losing_strategies": [],
        "recurring_failures": {},
        "score_deltas": [],
    }

    # Winners/losers from each past comparison report.
    for path in comparison_files:
        if not os.path.exists(path):
            continue
        try:
            with open(path) as fh:
                payload = json.load(fh)
        except (json.JSONDecodeError, OSError):
            continue
        # Reports may nest the comparison under a "comparison" key.
        comparison = payload.get("comparison", payload)

        winner = comparison.get("winner", {})
        if winner:
            signals["winning_strategies"].append(
                {"experiment": winner.get("experiment", ""), "score": winner.get("score", 0)}
            )

        winner_name = winner.get("experiment")
        for candidate in comparison.get("all_candidates", []):
            if candidate.get("experiment") != winner_name:
                signals["losing_strategies"].append(
                    {"experiment": candidate.get("experiment", ""), "score": candidate.get("score", 0)}
                )

    # Score deltas between consecutive history entries.
    history = config.get("history", [])
    for prev, curr in zip(history, history[1:]):
        signals["score_deltas"].append({
            "version": curr["version"],
            "delta": curr["score"] - prev["score"],
            "score": curr["score"],
        })

    # Tally recurring failure patterns from trace insights, if present.
    if os.path.exists("trace_insights.json"):
        try:
            with open("trace_insights.json") as fh:
                insights = json.load(fh)
        except (json.JSONDecodeError, OSError):
            insights = {}
        tally = signals["recurring_failures"]
        for issue in insights.get("top_issues", []):
            pattern = issue.get("pattern", issue.get("description", "unknown"))
            tally[pattern] = tally.get(pattern, 0) + 1

    return signals
91
+
92
+
93
def consolidate(orientation, signals, existing_memory=None):
    """Phase 3: Merge signals into consolidated memory."""
    insights = []

    # Strategy effectiveness: tally wins by the experiment-name suffix (a-e).
    strategy_map = {"a": "exploit", "b": "explore", "c": "crossover", "d": "failure-targeted-1", "e": "failure-targeted-2"}
    win_counts = {}
    for entry in signals.get("winning_strategies", []):
        experiment = entry.get("experiment", "")
        if not experiment:
            continue
        label = strategy_map.get(experiment[-1], experiment[-1])
        win_counts[label] = win_counts.get(label, 0) + 1

    if win_counts:
        top = max(win_counts, key=win_counts.get)
        insights.append({
            "type": "strategy_effectiveness",
            "insight": f"Most winning strategy: {top} ({win_counts[top]} wins)",
            "recurrence": win_counts[top],
            "data": win_counts,
        })

    # Recurring failures (only promote if seen 2+ times)
    failures = signals.get("recurring_failures", {})
    repeated = [(pat, cnt) for pat, cnt in failures.items() if cnt >= 2]
    for pattern, count in sorted(repeated, key=lambda item: -item[1]):
        insights.append({
            "type": "recurring_failure",
            "insight": f"Recurring failure ({count}x): {pattern}",
            "recurrence": count,
        })

    # Score trajectory summary across all recorded deltas.
    deltas = signals.get("score_deltas", [])
    if deltas:
        ups = sum(1 for d in deltas if d["delta"] > 0)
        downs = sum(1 for d in deltas if d["delta"] < 0)
        flat = sum(1 for d in deltas if abs(d["delta"]) < 0.01)
        insights.append({
            "type": "trajectory",
            "insight": f"Score trajectory: {ups} improvements, {downs} regressions, {flat} stagnant",
            "recurrence": len(deltas),
        })

    # Merge with prior memory: bump recurrence on exact repeats, and carry
    # forward old insights that were already promoted (recurrence >= 2).
    if existing_memory:
        for prior in existing_memory.get("insights", []):
            matched = False
            for current in insights:
                if current["type"] == prior["type"] and current["insight"] == prior["insight"]:
                    current["recurrence"] = max(current["recurrence"], prior.get("recurrence", 1)) + 1
                    matched = True
                    break
            if not matched and prior.get("recurrence", 1) >= 2:
                insights.append(prior)

    return insights
151
+
152
+
153
def prune(insights, max_insights=20):
    """Phase 4: Cap size, remove stale entries."""
    # Stable sort keeps the original order for equal recurrence counts.
    ranked = sorted(insights, key=lambda item: item.get("recurrence", 1), reverse=True)
    return ranked[:max_insights]
157
+
158
+
159
def format_memory(orientation, insights):
    """Format consolidated memory as markdown."""
    # Split insights by recurrence: 2+ are promoted, 1 stays an observation.
    promoted = [item for item in insights if item.get("recurrence", 1) >= 2]
    pending = [item for item in insights if item.get("recurrence", 1) < 2]

    out = [
        "# Evolution Memory",
        "",
        f"*Last updated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}*",
        f"*Iterations: {orientation['iterations']} | Best: {orientation['best_score']:.3f} | Baseline: {orientation['baseline_score']:.3f} | Improvement: +{orientation['improvement']:.3f}*",
        "",
        "## Key Insights (promoted after 2+ recurrences)",
        "",
    ]

    if promoted:
        out.extend(
            f"- **[{item['type']}]** {item['insight']} (seen {item['recurrence']}x)"
            for item in promoted
        )
    else:
        out.append("- No insights promoted yet (need 2+ recurrences)")

    if pending:
        out.extend(["", "## Observations (1 recurrence, pending promotion)", ""])
        out.extend(f"- [{item['type']}] {item['insight']}" for item in pending)

    out.append("")
    return "\n".join(out)
189
+
190
+
191
def main():
    """CLI entry point: run the four-phase consolidation and write both the
    markdown briefing and a JSON snapshot for programmatic access."""
    parser = argparse.ArgumentParser(description="Cross-iteration memory consolidation")
    parser.add_argument("--config", default=".evolver.json")
    parser.add_argument("--output", default="evolution_memory.md", help="Output markdown path")
    parser.add_argument("--output-json", default="evolution_memory.json", help="Output JSON path")
    parser.add_argument("--comparison-files", nargs="*", default=[], help="Past comparison.json files")
    args = parser.parse_args()

    with open(args.config) as f:
        config = json.load(f)

    # Load prior memory if present so recurrence counts carry across runs.
    existing = None
    if os.path.exists(args.output_json):
        try:
            with open(args.output_json) as f:
                existing = json.load(f)
        except (json.JSONDecodeError, OSError):
            pass

    # Four-phase consolidation: orient -> gather -> consolidate -> prune.
    orientation = orient(config)
    signals = gather(config, args.comparison_files or ["comparison.json"])
    insights = prune(consolidate(orientation, signals, existing))

    # Write markdown briefing.
    memory_md = format_memory(orientation, insights)
    with open(args.output, "w") as f:
        f.write(memory_md)

    # Write JSON for programmatic access.
    snapshot = {
        "updated_at": datetime.now(timezone.utc).isoformat(),
        "orientation": orientation,
        "insights": insights,
    }
    with open(args.output_json, "w") as f:
        json.dump(snapshot, f, indent=2)

    print(memory_md)


if __name__ == "__main__":
    main()
@@ -0,0 +1,140 @@
1
+ #!/usr/bin/env python3
2
+ """Three-gate iteration trigger for Harness Evolver.
3
+
4
+ Evaluates whether the next evolution iteration should proceed based on:
5
+ 1. Score gate: skip if no meaningful delta or no clustered failures
6
+ 2. Cost gate: estimate token cost, stop if budget exceeded
7
+ 3. Convergence gate: detect statistical plateau
8
+
9
+ Usage:
10
+ python3 iteration_gate.py --config .evolver.json --output gate_result.json
11
+ """
12
+
13
+ import argparse
14
+ import json
15
+ import os
16
+ import sys
17
+ from datetime import datetime, timezone
18
+
19
+
20
def score_gate(config, threshold=0.02):
    """Check if there's meaningful room for improvement."""
    history = config.get("history", [])
    if len(history) < 2:
        return {"pass": True, "reason": "Not enough history to evaluate"}

    best = config.get("best_score", 0)
    target = config.get("target_score")
    if target and best >= target:
        return {"pass": False, "reason": f"Target reached: {best:.3f} >= {target}"}

    # Look at the spread of the last (up to) 3 scores.
    recent = [entry["score"] for entry in history[-3:]]
    spread = max(recent) - min(recent)
    if len(recent) >= 3 and spread < threshold:
        return {
            "pass": False,
            "reason": f"Plateau detected: last 3 scores within {spread:.4f} (threshold: {threshold})",
            "suggest": "architect",
        }

    return {"pass": True, "reason": f"Score delta exists: range={spread:.4f}"}
43
+
44
+
45
def cost_gate(config, budget_tokens=None):
    """Estimate cost of next iteration and check against budget.

    Sums the recorded per-iteration token costs and fails the gate when the
    remaining budget is below half of an average iteration's cost.  The
    *budget_tokens* argument overrides config["iteration_costs"]["budget_tokens"].
    Passes trivially when there is no cost data yet or no budget configured.
    """
    # NOTE: the original also read config["history"] here but never used it.
    iterations = config.get("iterations", 0)
    cost_data = config.get("iteration_costs", {})

    if not cost_data and iterations == 0:
        return {"pass": True, "reason": "First iteration, no cost data yet"}

    total_spent = sum(cost_data.get("per_iteration", [0]))
    budget = budget_tokens or cost_data.get("budget_tokens")

    if not budget:
        return {"pass": True, "reason": "No budget configured"}

    # max() guards against division by zero on iteration 0.
    avg_cost = total_spent / max(iterations, 1)
    remaining = budget - total_spent

    if remaining < avg_cost * 0.5:
        return {
            "pass": False,
            "reason": f"Budget nearly exhausted: {remaining:,} tokens remaining, avg iteration costs {avg_cost:,.0f}",
        }

    return {"pass": True, "reason": f"Budget OK: {remaining:,} tokens remaining"}
70
+
71
+
72
def convergence_gate(config, min_improvement=0.005, lookback=5):
    """Detect statistical convergence using diminishing returns."""
    history = config.get("history", [])
    if len(history) < 3:
        return {"pass": True, "reason": "Not enough iterations for convergence analysis"}

    window = history[-lookback:] if len(history) >= lookback else history
    deltas = [curr["score"] - prev["score"] for prev, curr in zip(window, window[1:])]

    if not deltas:
        return {"pass": True, "reason": "No deltas to analyze"}

    avg_delta = sum(deltas) / len(deltas)
    # Fraction of recent iterations that actually improved the score.
    improvement_rate = sum(1 for d in deltas if d > 0) / len(deltas)

    if avg_delta < min_improvement and improvement_rate < 0.4:
        return {
            "pass": False,
            "reason": f"Converged: avg delta={avg_delta:.4f}, improvement rate={improvement_rate:.0%}",
            "suggest": "architect" if improvement_rate < 0.2 else "continue_cautious",
        }

    return {
        "pass": True,
        "reason": f"Still improving: avg delta={avg_delta:.4f}, improvement rate={improvement_rate:.0%}",
    }
101
+
102
+
103
def main():
    """CLI entry point: evaluate all three gates, emit a JSON result, and
    exit 0 when the next iteration should proceed (1 otherwise)."""
    parser = argparse.ArgumentParser(description="Three-gate iteration trigger")
    parser.add_argument("--config", default=".evolver.json", help="Config path")
    parser.add_argument("--output", default=None, help="Output JSON path")
    parser.add_argument("--score-threshold", type=float, default=0.02, help="Score plateau threshold")
    parser.add_argument("--budget-tokens", type=int, default=None, help="Token budget override")
    args = parser.parse_args()

    with open(args.config) as f:
        config = json.load(f)

    gates = {
        "score": score_gate(config, args.score_threshold),
        "cost": cost_gate(config, args.budget_tokens),
        "convergence": convergence_gate(config),
    }

    proceed = all(gate["pass"] for gate in gates.values())
    result = {
        "proceed": proceed,
        "gates": gates,
        "suggestions": [gate.get("suggest") for gate in gates.values() if gate.get("suggest")],
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }

    rendered = json.dumps(result, indent=2)
    if args.output:
        with open(args.output, "w") as f:
            f.write(rendered)
    print(rendered)

    # Non-zero exit tells the orchestrator to stop iterating.
    sys.exit(0 if proceed else 1)


if __name__ == "__main__":
    main()