harness-evolver 3.3.1 → 4.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,175 @@
1
+ #!/usr/bin/env python3
2
+ """Track regression examples across evolution iterations.
3
+
4
+ Compares per-example scores between consecutive iterations.
5
+ When an example transitions from failing (<0.5) to passing (>0.8),
6
+ adds a variation to the dataset as a regression guard.
7
+
8
+ Usage:
9
+ python3 regression_tracker.py \
10
+ --config .evolver.json \
11
+ --previous-experiment v001a \
12
+ --current-experiment v002c \
13
+ --output regression_report.json
14
+ """
15
+
16
+ import argparse
17
+ import json
18
+ import os
19
+ import platform
20
+ import sys
21
+
22
+
23
+ def ensure_langsmith_api_key():
24
+ """Load LANGSMITH_API_KEY from credentials file or .env if not in env."""
25
+ if os.environ.get("LANGSMITH_API_KEY"):
26
+ return True
27
+ if platform.system() == "Darwin":
28
+ creds_path = os.path.expanduser("~/Library/Application Support/langsmith-cli/credentials")
29
+ else:
30
+ creds_path = os.path.expanduser("~/.config/langsmith-cli/credentials")
31
+ if os.path.exists(creds_path):
32
+ try:
33
+ with open(creds_path) as f:
34
+ for line in f:
35
+ line = line.strip()
36
+ if line.startswith("LANGSMITH_API_KEY="):
37
+ key = line.split("=", 1)[1].strip()
38
+ if key:
39
+ os.environ["LANGSMITH_API_KEY"] = key
40
+ return True
41
+ except OSError:
42
+ pass
43
+ if os.path.exists(".env"):
44
+ try:
45
+ with open(".env") as f:
46
+ for line in f:
47
+ line = line.strip()
48
+ if line.startswith("LANGSMITH_API_KEY=") and not line.startswith("#"):
49
+ key = line.split("=", 1)[1].strip().strip("'\"")
50
+ if key:
51
+ os.environ["LANGSMITH_API_KEY"] = key
52
+ return True
53
+ except OSError:
54
+ pass
55
+ return False
56
+
57
+
58
def get_per_example_scores(client, experiment_name):
    """Collect the average feedback score plus I/O previews per example.

    Reads up to 200 root runs from *experiment_name* and averages every
    numeric feedback score attached to each run.  Runs without numeric
    feedback score 0.0.  On any API error a message goes to stderr and
    whatever was collected so far is returned.
    """
    per_example = {}
    try:
        root_runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=200))
        for run in root_runs:
            key = str(run.reference_example_id or run.id)
            numeric = {
                fb.key: fb.score
                for fb in client.list_feedback(run_ids=[run.id])
                if fb.score is not None
            }
            mean = sum(numeric.values()) / len(numeric) if numeric else 0.0
            per_example[key] = {
                "score": mean,
                "input": str(run.inputs)[:500] if run.inputs else "",
                "output": str(run.outputs)[:500] if run.outputs else "",
            }
    except Exception as e:
        print(f"Error reading {experiment_name}: {e}", file=sys.stderr)
    return per_example
79
+
80
+
81
def find_transitions(prev_scores, curr_scores, fail_threshold=0.5, pass_threshold=0.8):
    """Classify per-example score movement between two experiments.

    Returns ``(fixed, regressed)``: *fixed* holds examples that moved from
    below *fail_threshold* to at least *pass_threshold*; *regressed* holds
    the opposite movement.  Only examples present in both maps count.
    """
    fixed, regressed = [], []
    for example_id in prev_scores.keys() & curr_scores.keys():
        before = prev_scores[example_id]["score"]
        after = curr_scores[example_id]["score"]
        went_up = before < fail_threshold and after >= pass_threshold
        went_down = before >= pass_threshold and after < fail_threshold
        if not (went_up or went_down):
            continue
        record = {
            "example_id": example_id,
            "prev_score": before,
            "curr_score": after,
            "type": "fixed" if went_up else "regressed",
            "input": curr_scores[example_id]["input"],
        }
        (fixed if went_up else regressed).append(record)
    return fixed, regressed
108
+
109
+
110
def add_regression_guards(client, dataset_id, transitions, max_guards=5):
    """Add up to *max_guards* regression-guard examples to the dataset.

    Each transition's recorded input is parsed back into a dict where
    possible.  Inputs were captured upstream with ``str(run.inputs)`` —
    a Python repr with single quotes, NOT JSON — so ``json.loads`` alone
    failed on virtually every dict input and no guard was ever created.
    Parsing now tries JSON first, then a Python literal, and finally
    wraps the raw string.

    Returns:
        int: number of examples successfully created.
    """
    import ast

    added = 0
    for t in transitions[:max_guards]:
        try:
            raw = t["input"]
            if raw.startswith("{"):
                try:
                    input_data = json.loads(raw)
                except ValueError:
                    # str(run.inputs) yields repr syntax; literal_eval
                    # parses it without executing arbitrary code.
                    input_data = ast.literal_eval(raw)
            else:
                input_data = {"input": raw}
            client.create_example(
                inputs=input_data,
                dataset_id=dataset_id,
                metadata={"source": "regression_guard", "original_example_id": t["example_id"]},
            )
            added += 1
        except Exception as e:
            # A truncated repr (inputs were sliced to 500 chars) can still
            # fail to parse; report and move on, as before.
            print(f"Failed to add guard for {t['example_id']}: {e}", file=sys.stderr)
    return added
125
+
126
+
127
def main():
    """CLI entry point: diff two experiments and emit a regression report.

    Exits non-zero when any regression is detected so CI can fail the run.
    """
    parser = argparse.ArgumentParser(description="Track regressions across iterations")
    parser.add_argument("--config", default=".evolver.json")
    parser.add_argument("--previous-experiment", required=True, help="Previous iteration experiment name")
    parser.add_argument("--current-experiment", required=True, help="Current iteration experiment name")
    parser.add_argument("--output", default=None, help="Output JSON report")
    parser.add_argument("--add-guards", action="store_true", help="Add regression guard examples to dataset")
    parser.add_argument("--max-guards", type=int, default=5, help="Max guard examples to add")
    args = parser.parse_args()

    with open(args.config) as f:
        config = json.load(f)

    # Resolve credentials before constructing the client so it can pick up
    # LANGSMITH_API_KEY from the environment.
    ensure_langsmith_api_key()
    from langsmith import Client
    client = Client()

    prev_scores = get_per_example_scores(client, args.previous_experiment)
    curr_scores = get_per_example_scores(client, args.current_experiment)
    transitions, regressions = find_transitions(prev_scores, curr_scores)

    guards_added = 0
    if args.add_guards and transitions:
        guards_added = add_regression_guards(client, config["dataset_id"], transitions, args.max_guards)

    report = json.dumps(
        {
            "previous": args.previous_experiment,
            "current": args.current_experiment,
            "fixed_count": len(transitions),
            "regression_count": len(regressions),
            "guards_added": guards_added,
            "fixed": transitions,
            "regressions": regressions,
        },
        indent=2,
    )
    if args.output:
        with open(args.output, "w") as f:
            f.write(report)
    print(report)

    if regressions:
        print(f"\nWARNING: {len(regressions)} regressions detected!", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,224 @@
1
+ #!/usr/bin/env python3
2
+ """Synthesize evolution strategy document from trace analysis.
3
+
4
+ Reads trace_insights.json, best_results.json, evolution_memory.json,
5
+ and production_seed.json to produce a targeted strategy document with
6
+ specific file paths and concrete change recommendations for proposers.
7
+
8
+ Usage:
9
+ python3 synthesize_strategy.py \
10
+ --config .evolver.json \
11
+ --trace-insights trace_insights.json \
12
+ --best-results best_results.json \
13
+ --evolution-memory evolution_memory.json \
14
+ --production-seed production_seed.json \
15
+ --output strategy.md
16
+ """
17
+
18
+ import argparse
19
+ import json
20
+ import os
21
+ import sys
22
+
23
+
24
def load_json_safe(path):
    """Parse the JSON file at *path*; return None when absent or invalid."""
    if not path:
        return None
    if not os.path.exists(path):
        return None
    try:
        with open(path) as f:
            return json.load(f)
    except (json.JSONDecodeError, OSError):
        return None
33
+
34
+
35
def identify_target_files(config):
    """List the .py files named in the configured entry_point command.

    Flag-like tokens (leading ``-``) are excluded even if they end in .py.
    """
    tokens = config.get("entry_point", "").split()
    return [tok for tok in tokens if tok.endswith(".py") and not tok.startswith("-")]
44
+
45
+
46
def synthesize(config, insights, results, memory, production=None):
    """Merge analysis artifacts into a single strategy dict.

    Args:
        config: Parsed .evolver.json (entry_point is used for targets).
        insights: trace_insights.json content, or None.
        results: best_results.json content (per-example scores), or None.
        memory: evolution_memory.json content, or None.
        production: production_seed.json content, or None.

    Returns:
        dict with primary_targets, failure_clusters, recommended_approaches,
        avoid, plus failing_examples / production sections when available.
    """
    strategy = {
        "primary_targets": identify_target_files(config),
        "failure_clusters": [],
        "recommended_approaches": [],
        "avoid": [],
    }

    # Highest-priority issues observed in traces (top 5 only).
    if insights:
        for issue in insights.get("top_issues", [])[:5]:
            strategy["failure_clusters"].append({
                "type": issue.get("type", "unknown"),
                "severity": issue.get("severity", "medium"),
                "description": issue.get("description", ""),
                "count": issue.get("count", 0),
            })

    # Single pass over evolution memory (previously iterated twice), with
    # .get() lookups so a malformed insight entry cannot raise KeyError.
    if memory:
        for insight in memory.get("insights", []):
            itype = insight.get("type", "")
            text = insight.get("insight", "")
            if insight.get("recurrence", 0) >= 2:
                if itype == "strategy_effectiveness":
                    strategy["recommended_approaches"].append(text)
                elif itype == "recurring_failure":
                    strategy["failure_clusters"].append({
                        "type": "recurring",
                        "severity": "high",
                        "description": text,
                        "count": insight["recurrence"],  # present: checked above
                    })
            # Approaches that previously lost or regressed are avoided
            # regardless of recurrence count.
            if "losing" in itype or "regression" in itype:
                strategy["avoid"].append(text)

    # Worst-scoring examples from the best run so far (up to 10).
    if results:
        per_example = results.get("per_example", {})
        failing = [(eid, data) for eid, data in per_example.items() if data.get("score", 0) < 0.5]
        failing.sort(key=lambda item: item[1].get("score", 0))
        strategy["failing_examples"] = [
            {
                "example_id": eid,
                # .get for consistency with the filter above; a missing
                # score no longer crashes synthesis.
                "score": data.get("score", 0),
                "input_preview": data.get("input_preview", "")[:200],
                "error": data.get("error"),
            }
            for eid, data in failing[:10]
        ]

    if production:
        prod_data = _production_summary(production)
        if prod_data:
            strategy["production"] = prod_data

    return strategy


def _production_summary(production):
    """Condense the production seed into the fields the strategy doc uses."""
    prod_data = {}
    stats = production.get("stats", {})
    if stats:
        prod_data["total_traces"] = stats.get("total_traces", 0)
        prod_data["error_rate"] = stats.get("error_rate", 0)
    categories = production.get("categories", [])
    if categories:
        prod_data["traffic_distribution"] = categories[:10]
    neg = production.get("negative_feedback_inputs", [])
    if neg:
        prod_data["negative_feedback"] = neg[:5]
    # Older seeds used "errors"; newer ones use "error_patterns".
    errors = production.get("error_patterns", production.get("errors", []))
    if errors:
        prod_data["production_errors"] = errors[:5] if isinstance(errors, list) else []
    slow = production.get("slow_queries", [])
    if slow:
        prod_data["slow_queries"] = slow[:5]
    return prod_data
121
+
122
+
123
def format_strategy_md(strategy, config):
    """Render the strategy dict as a markdown document and return it."""
    out = [
        "# Evolution Strategy Document",
        "",
        f"*Framework: {config.get('framework', 'unknown')} | Entry point: {config.get('entry_point', 'N/A')}*",
        "",
        "## Target Files",
    ]
    out.extend(f"- `{path}`" for path in strategy.get("primary_targets", []))
    out.append("")

    clusters = strategy.get("failure_clusters", [])
    if clusters:
        out.append("## Failure Clusters (prioritized)")
        out.extend(
            f"{rank}. **[{c['severity']}]** {c['description']} (count: {c['count']})"
            for rank, c in enumerate(clusters, 1)
        )
        out.append("")

    approaches = strategy.get("recommended_approaches", [])
    if approaches:
        out.append("## Recommended Approaches (from evolution memory)")
        out.extend(f"- {item}" for item in approaches)
        out.append("")

    avoid = strategy.get("avoid", [])
    if avoid:
        out.append("## Avoid (previously unsuccessful)")
        out.extend(f"- {item}" for item in avoid)
        out.append("")

    failing = strategy.get("failing_examples", [])
    if failing:
        out.append(f"## Top Failing Examples ({len(failing)})")
        for ex in failing:
            suffix = f" — Error: {ex['error'][:80]}" if ex.get("error") else ""
            out.append(
                f"- `{ex['example_id']}` (score: {ex['score']:.2f}): {ex['input_preview'][:100]}{suffix}"
            )
        out.append("")

    prod = strategy.get("production", {})
    if prod:
        out.append("## Production Insights")
        if prod.get("total_traces"):
            out.append(f"- **Traces**: {prod['total_traces']} total, {prod.get('error_rate', 0):.1%} error rate")
        if prod.get("traffic_distribution"):
            out.append(f"- **Traffic**: {', '.join(str(c) for c in prod['traffic_distribution'][:5])}")
        # The three list-valued sections share one rendering shape.
        for label, key in (
            ("Negative feedback inputs", "negative_feedback"),
            ("Production errors", "production_errors"),
            ("Slow queries", "slow_queries"),
        ):
            items = prod.get(key)
            if items:
                out.append(f"- **{label}**:")
                out.extend(f"  - {str(item)[:120]}" for item in items)
        out.append("")

    return "\n".join(out)
190
+
191
+
192
def main():
    """CLI entry point: read artifacts, write strategy.md and strategy.json."""
    parser = argparse.ArgumentParser(description="Synthesize evolution strategy")
    parser.add_argument("--config", default=".evolver.json")
    parser.add_argument("--trace-insights", default="trace_insights.json")
    parser.add_argument("--best-results", default="best_results.json")
    parser.add_argument("--evolution-memory", default="evolution_memory.json")
    parser.add_argument("--production-seed", default="production_seed.json")
    parser.add_argument("--output", default="strategy.md")
    args = parser.parse_args()

    with open(args.config) as f:
        config = json.load(f)

    # Every artifact is optional; load_json_safe returns None when absent.
    strategy = synthesize(
        config,
        load_json_safe(args.trace_insights),
        load_json_safe(args.best_results),
        load_json_safe(args.evolution_memory),
        load_json_safe(args.production_seed),
    )

    md = format_strategy_md(strategy, config)
    with open(args.output, "w") as f:
        f.write(md)

    # Machine-readable twin of the markdown document.
    json_path = args.output.replace(".md", ".json")
    with open(json_path, "w") as f:
        json.dump(strategy, f, indent=2)

    print(md)


if __name__ == "__main__":
    main()
@@ -0,0 +1,212 @@
1
+ #!/usr/bin/env python3
2
+ """Validate .evolver.json state against LangSmith reality.
3
+
4
+ Checks that referenced experiments, datasets, and projects still exist.
5
+ Returns JSON with validation results and any divergences found.
6
+
7
+ Usage:
8
+ python3 validate_state.py --config .evolver.json --output validation.json
9
+ """
10
+
11
+ import argparse
12
+ import json
13
+ import os
14
+ import platform
15
+ import sys
16
+
17
+
18
+ def ensure_langsmith_api_key():
19
+ """Load LANGSMITH_API_KEY from credentials file or .env if not in env."""
20
+ if os.environ.get("LANGSMITH_API_KEY"):
21
+ return True
22
+ if platform.system() == "Darwin":
23
+ creds_path = os.path.expanduser("~/Library/Application Support/langsmith-cli/credentials")
24
+ else:
25
+ creds_path = os.path.expanduser("~/.config/langsmith-cli/credentials")
26
+ if os.path.exists(creds_path):
27
+ try:
28
+ with open(creds_path) as f:
29
+ for line in f:
30
+ line = line.strip()
31
+ if line.startswith("LANGSMITH_API_KEY="):
32
+ key = line.split("=", 1)[1].strip()
33
+ if key:
34
+ os.environ["LANGSMITH_API_KEY"] = key
35
+ return True
36
+ except OSError:
37
+ pass
38
+ if os.path.exists(".env"):
39
+ try:
40
+ with open(".env") as f:
41
+ for line in f:
42
+ line = line.strip()
43
+ if line.startswith("LANGSMITH_API_KEY=") and not line.startswith("#"):
44
+ key = line.split("=", 1)[1].strip().strip("'\"")
45
+ if key:
46
+ os.environ["LANGSMITH_API_KEY"] = key
47
+ return True
48
+ except OSError:
49
+ pass
50
+ return False
51
+
52
+
53
def validate_dataset(client, config):
    """Verify the configured dataset exists; return (issues, example_count).

    A missing dataset name is critical; a dataset_id that disagrees with
    LangSmith is a warning.  The example count is capped at 500 by the
    list_examples limit.
    """
    name = config.get("dataset")
    if not name:
        return [{"field": "dataset", "severity": "critical", "message": "No dataset configured"}], 0
    expected_id = config.get("dataset_id")
    issues = []
    try:
        dataset = client.read_dataset(dataset_name=name)
        if expected_id and str(dataset.id) != expected_id:
            issues.append({
                "field": "dataset_id",
                "severity": "warning",
                "message": f"dataset_id mismatch: config has {expected_id}, LangSmith has {dataset.id}",
            })
        examples = client.list_examples(dataset_id=dataset.id, limit=500)
        return issues, sum(1 for _ in examples)
    except Exception as e:
        issues.append({"field": "dataset", "severity": "critical", "message": f"Dataset not found: {e}"})
        return issues, 0
74
+
75
+
76
def validate_best_experiment(client, config):
    """Confirm the recorded best_experiment is still reachable in LangSmith.

    Returns a list with one critical issue when the experiment has no runs
    or cannot be read; empty when fine or when no best_experiment is set.
    """
    name = config.get("best_experiment")
    if not name:
        return []
    problem = None
    try:
        if not list(client.list_runs(project_name=name, is_root=True, limit=1)):
            problem = f"Best experiment '{name}' has no runs in LangSmith"
    except Exception as e:
        problem = f"Best experiment '{name}' not accessible: {e}"
    if problem is None:
        return []
    return [{"field": "best_experiment", "severity": "critical", "message": problem}]
97
+
98
+
99
def validate_git_state(config):
    """Best-effort check that git HEAD is readable; returns warnings only.

    Note: *config* is accepted for signature parity with the other
    validators but is not consulted here.
    """
    import subprocess

    issues = []
    try:
        proc = subprocess.run(
            ["git", "log", "--oneline", "-1"],
            capture_output=True, text=True, timeout=10,
        )
        if not proc.stdout.strip():
            issues.append({"field": "git", "severity": "warning", "message": "Could not read git HEAD"})
    except Exception as e:
        issues.append({"field": "git", "severity": "warning", "message": f"Git check failed: {e}"})
    return issues
114
+
115
+
116
def _history_issue(config):
    """Warn when the newest history entry outscores the recorded best.

    Returns the issue dict, or None when the last history entry matches
    best_experiment or scores below it.  Uses .get() throughout so a
    malformed history entry cannot raise KeyError (the original indexed
    ``last['experiment']`` directly when building the message).
    """
    history = config.get("history", [])
    if not history:
        return None
    last = history[-1]
    if last.get("experiment") == config.get("best_experiment"):
        return None
    if last.get("score", 0) < config.get("best_score", 0):
        return None
    return {
        "field": "history",
        "severity": "warning",
        "message": (
            f"Last history entry ({last.get('experiment')}) differs from "
            f"best_experiment ({config.get('best_experiment')})"
        ),
    }


def _write_config(path, config):
    """Persist the (possibly repaired) config back to disk."""
    with open(path, "w") as f:
        json.dump(config, f, indent=2)


def _auto_fix(client, config, config_path, all_issues):
    """Apply --fix repairs for the known warning types.

    Mutates *config*, rewrites *config_path* after each repair, and
    downgrades each repaired issue's severity to "fixed".  Returns a list
    of human-readable descriptions of the repairs performed.
    """
    fixed = []
    for issue in all_issues:
        field, severity = issue.get("field"), issue.get("severity")
        if field == "dataset_id" and severity == "warning":
            try:
                dataset = client.read_dataset(dataset_name=config["dataset"])
                config["dataset_id"] = str(dataset.id)
                _write_config(config_path, config)
                fixed.append(f"Fixed dataset_id: updated to {dataset.id}")
                issue["severity"] = "fixed"
            except Exception:
                # Best-effort: leave the warning in place if re-read fails.
                pass
        elif field == "history" and severity == "warning":
            history = config.get("history", [])
            if history:
                best = max(history, key=lambda h: h.get("score", 0))
                config["best_experiment"] = best["experiment"]
                config["best_score"] = best["score"]
                _write_config(config_path, config)
                fixed.append(f"Fixed best_experiment: set to {best['experiment']}")
                issue["severity"] = "fixed"
    return fixed


def main():
    """CLI entry point: validate config vs LangSmith, optionally auto-fix.

    Prints a JSON validation report (and writes it to --output if given);
    exits non-zero when any critical issue remains.
    """
    parser = argparse.ArgumentParser(description="Validate .evolver.json against LangSmith")
    parser.add_argument("--config", default=".evolver.json", help="Config path")
    parser.add_argument("--output", default=None, help="Output JSON path")
    parser.add_argument("--fix", action="store_true", help="Auto-fix divergences where possible")
    args = parser.parse_args()

    if not os.path.exists(args.config):
        print(json.dumps({"valid": False, "issues": [{"severity": "critical", "message": f"{args.config} not found"}]}))
        sys.exit(1)

    with open(args.config) as f:
        config = json.load(f)

    ensure_langsmith_api_key()
    from langsmith import Client
    client = Client()

    all_issues = []
    dataset_issues, example_count = validate_dataset(client, config)
    all_issues.extend(dataset_issues)
    all_issues.extend(validate_best_experiment(client, config))
    all_issues.extend(validate_git_state(config))
    history_issue = _history_issue(config)
    if history_issue:
        all_issues.append(history_issue)

    if args.fix:
        fixed = _auto_fix(client, config, args.config, all_issues)
        if fixed:
            print(f"Auto-fixed {len(fixed)} issues:", file=sys.stderr)
            for msg in fixed:
                print(f"  {msg}", file=sys.stderr)

    # Repaired issues are dropped from the report (no-op without --fix).
    all_issues = [i for i in all_issues if i.get("severity") != "fixed"]
    critical = [i for i in all_issues if i.get("severity") == "critical"]
    result = {
        "valid": len(critical) == 0,
        "issues": all_issues,
        "dataset_examples": example_count,
        "config_iterations": config.get("iterations", 0),
        "config_best_score": config.get("best_score", 0),
    }

    report = json.dumps(result, indent=2)
    if args.output:
        with open(args.output, "w") as f:
            f.write(report)
    print(report)

    if critical:
        sys.exit(1)


if __name__ == "__main__":
    main()