npm - harness-evolver - Versions diffs - 4.0.3 → 4.2.0 - Mend

harness-evolver 4.0.3 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

package/.claude-plugin/plugin.json +1 -1
package/README.md +11 -10
package/agents/evolver-proposer.md +45 -47
package/package.json +1 -1
package/skills/evolve/SKILL.md +173 -65
package/tools/__pycache__/adversarial_inject.cpython-313.pyc +0 -0
package/tools/__pycache__/regression_tracker.cpython-313.pyc +0 -0
package/tools/__pycache__/setup.cpython-313.pyc +0 -0
package/tools/adversarial_inject.py +8 -3
package/tools/consolidate.py +7 -15
package/tools/dataset_health.py +385 -0
package/tools/read_results.py +21 -2
package/tools/regression_tracker.py +17 -4
package/tools/setup.py +23 -0
package/tools/synthesize_strategy.py +138 -2
package/tools/trace_insights.py +7 -1

package/tools/synthesize_strategy.py CHANGED Viewed

@@ -1,10 +1,15 @@
 #!/usr/bin/env python3
-"""Synthesize evolution strategy document from trace analysis.
+"""Synthesize evolution strategy document and investigation lenses.
 Reads trace_insights.json, best_results.json, evolution_memory.json,
 and production_seed.json to produce a targeted strategy document with
 specific file paths and concrete change recommendations for proposers.
+When --lenses is specified, also generates a lenses.json file containing
+investigation questions derived from failure clusters, architecture issues,
+production data, and evolution memory. Each lens becomes a focused brief
+for one proposer agent.
 Usage:
     python3 synthesize_strategy.py \
         --config .evolver.json \
@@ -12,7 +17,8 @@ Usage:
         --best-results best_results.json \
         --evolution-memory evolution_memory.json \
         --production-seed production_seed.json \
-        --output strategy.md
+        --output strategy.md \
+        --lenses lenses.json
 """
 import argparse
@@ -120,6 +126,118 @@ def synthesize(config, insights, results, memory, production=None):
     return strategy
+def generate_lenses(strategy, config, insights, results, memory, production, max_lenses=5):
+    """Generate investigation lenses from available data sources.
+
+    A lens is a dict with the keys ``id``, ``question``, ``source``,
+    ``severity`` and ``context``. Lenses are collected from failure
+    clusters (at most 3), one high-severity architecture issue, production
+    data, one winning pattern and one persistent failure from evolution
+    memory, plus a generic "open" lens. They are then ordered by severity
+    (critical, high, medium, low), truncated to ``max_lenses`` and
+    renumbered from 1.
+
+    Args:
+        strategy: Dict; "failure_clusters" and "failing_examples" are read.
+        config: Not read by this function; kept for the caller's signature.
+        insights: Dict or None; "top_issues" is read.
+        results: Not read by this function; kept for the caller's signature.
+        memory: Dict or None; "insights" is read.
+        production: Dict or None; "negative_feedback_inputs",
+            "error_patterns" (or "errors") and "slow_queries" are read.
+        max_lenses: Maximum number of lenses returned. Values below 1
+            yield an empty list.
+
+    Returns:
+        List of lens dicts, most severe first, with sequential ids.
+    """
+    lenses = []
+    failing = strategy.get("failing_examples", [])
+    # Failure cluster lenses (one per distinct cluster, max 3)
+    for cluster in strategy.get("failure_clusters", [])[:3]:
+        cluster_type = cluster.get("type", "")
+        desc = cluster.get("description", cluster_type or "Unclassified failure cluster")
+        examples = []
+        # Match only on a non-empty type: "" is a substring of every string,
+        # so an untyped cluster would otherwise claim every errored example
+        # and never reach the fallback below.
+        if cluster_type:
+            examples = [
+                ex["example_id"]
+                for ex in failing
+                if ex.get("error") and "example_id" in ex and cluster_type in str(ex["error"])
+            ]
+        if not examples:
+            examples = [ex["example_id"] for ex in failing[:3] if "example_id" in ex]
+        # len(lenses) + 1 is a provisional id; ids are renumbered after sorting.
+        lenses.append({
+            "id": len(lenses) + 1,
+            "question": f"{desc} — what code change would fix this?",
+            "source": "failure_cluster",
+            "severity": cluster.get("severity", "medium"),
+            "context": {"examples": examples[:5]},
+        })
+    # Architecture lens from trace insights
+    if insights:
+        for issue in insights.get("top_issues", []):
+            if issue.get("severity") == "high" and issue.get("type") in (
+                "architecture", "routing", "topology", "structure",
+            ):
+                issue_desc = issue.get("description", issue["type"])
+                lenses.append({
+                    "id": len(lenses) + 1,
+                    "question": f"Architectural issue: {issue_desc} — what structural change would help?",
+                    "source": "architecture",
+                    "severity": "high",
+                    "context": {"issue_type": issue["type"]},
+                })
+                break  # at most 1 architecture lens
+    # Production lens
+    if production:
+        prod_issues = []
+        neg = production.get("negative_feedback_inputs", [])
+        if neg:
+            prod_issues.append(f"Users gave negative feedback on {len(neg)} queries")
+        # Fall through to "errors" when "error_patterns" is absent OR empty.
+        errors = production.get("error_patterns") or production.get("errors") or []
+        if isinstance(errors, list) and errors:
+            prod_issues.append(f"Production errors: {str(errors[0])[:100]}")
+        slow = production.get("slow_queries", [])
+        if slow:
+            prod_issues.append(f"{len(slow)} slow queries detected")
+        if prod_issues:
+            lenses.append({
+                "id": len(lenses) + 1,
+                "question": f"Production data shows: {'; '.join(prod_issues)}. How should the agent handle these real-world patterns?",
+                "source": "production",
+                "severity": "high",
+                "context": {},
+            })
+    memory_insights = memory.get("insights", []) if memory else []
+    # Evolution memory lens — winning patterns
+    for insight in memory_insights:
+        if (
+            insight.get("type") == "strategy_effectiveness"
+            and (insight.get("recurrence") or 0) >= 2
+            and insight.get("insight")
+        ):
+            lenses.append({
+                "id": len(lenses) + 1,
+                "question": f"{insight['insight']} — what further improvements in this direction are possible?",
+                "source": "evolution_memory",
+                "severity": "medium",
+                "context": {"recurrence": insight["recurrence"]},
+            })
+            break  # at most 1 memory lens
+    # Evolution memory lens — persistent failures
+    for insight in memory_insights:
+        if (
+            insight.get("type") == "recurring_failure"
+            and (insight.get("recurrence") or 0) >= 3
+            and insight.get("insight")
+        ):
+            lenses.append({
+                "id": len(lenses) + 1,
+                "question": f"{insight['insight']} — this has persisted {insight['recurrence']} iterations. Why?",
+                "source": "persistent_failure",
+                "severity": "critical",
+                "context": {"recurrence": insight["recurrence"]},
+            })
+            break  # at most 1 persistent failure lens
+    # Open lens: always generated, but it is "medium" and sorts last, so the
+    # truncation below drops it when max_lenses more-severe lenses exist.
+    lenses.append({
+        "id": len(lenses) + 1,
+        "question": "Open investigation — read all context and investigate what stands out most to you.",
+        "source": "open",
+        "severity": "medium",
+        "context": {},
+    })
+    # Sort by severity (stable, so generation order breaks ties); unknown
+    # severities rank as "medium". Then take the top max_lenses.
+    severity_order = {"critical": 0, "high": 1, "medium": 2, "low": 3}
+    lenses.sort(key=lambda lens: severity_order.get(lens["severity"], 2))
+    lenses = lenses[:max(max_lenses, 0)]
+    # Reassign sequential IDs after sorting/truncating
+    for position, lens in enumerate(lenses, start=1):
+        lens["id"] = position
+    return lenses
 def format_strategy_md(strategy, config):
     """Format strategy as markdown document."""
     lines = [
@@ -197,6 +315,7 @@ def main():
     parser.add_argument("--evolution-memory", default="evolution_memory.json")
     parser.add_argument("--production-seed", default="production_seed.json")
     parser.add_argument("--output", default="strategy.md")
+    parser.add_argument("--lenses", default=None, help="Output path for lenses JSON")
     args = parser.parse_args()
     with open(args.config) as f:
@@ -217,6 +336,23 @@ def main():
     with open(json_path, "w") as f:
         json.dump(strategy, f, indent=2)
+    # Generate lenses if requested
+    if args.lenses:
+        max_proposers = config.get("max_proposers", 5)
+        lens_list = generate_lenses(
+            strategy, config, insights, results, memory, production,
+            max_lenses=max_proposers,
+        )
+        from datetime import datetime, timezone
+        lenses_output = {
+            "generated_at": datetime.now(timezone.utc).isoformat(),
+            "lens_count": len(lens_list),
+            "lenses": lens_list,
+        }
+        with open(args.lenses, "w") as f:
+            json.dump(lenses_output, f, indent=2)
+        print(f"Generated {len(lens_list)} lenses → {args.lenses}", file=sys.stderr)
     print(md)

package/tools/trace_insights.py CHANGED Viewed

@@ -335,10 +335,16 @@ def fetch_scores_from_experiment(experiment_name):
             limit=200,
         ))
+        all_run_ids = [run.id for run in runs]
+        all_feedbacks = list(client.list_feedback(run_ids=all_run_ids))
+        fb_map = {}
+        for fb in all_feedbacks:
+            fb_map.setdefault(str(fb.run_id), []).append(fb)
         per_task = {}
         for run in runs:
             example_id = str(run.reference_example_id or run.id)
-            feedbacks = list(client.list_feedback(run_ids=[run.id]))
+            feedbacks = fb_map.get(str(run.id), [])
             scores = [fb.score for fb in feedbacks if fb.score is not None]
             avg_score = sum(scores) / len(scores) if scores else 0.0
             per_task[example_id] = {"score": avg_score}