harness-evolver 3.3.1 → 4.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,205 @@
1
+ #!/usr/bin/env python3
2
+ """Inject adversarial examples into LangSmith dataset.
3
+
4
+ Detects potential memorization by checking if agent outputs are suspiciously
5
+ similar to reference outputs, then generates adversarial variations to test
6
+ generalization.
7
+
8
+ Usage:
9
+ python3 adversarial_inject.py \
10
+ --config .evolver.json \
11
+ --experiment v003a \
12
+ --output adversarial_report.json
13
+ """
14
+
15
+ import argparse
16
+ import json
17
+ import os
18
+ import platform
19
+ import sys
20
+ import random
21
+
22
+
23
def _read_key_from_env_file(path):
    """Scan *path* for a LANGSMITH_API_KEY=... line; return the key or None.

    Surrounding single/double quotes are stripped so both plain credentials
    files and quoted .env entries (KEY="value") are accepted.  Unreadable
    files are treated the same as missing ones (best-effort lookup).
    """
    try:
        with open(path) as f:
            for raw in f:
                raw = raw.strip()
                # A commented-out line ("# LANGSMITH_API_KEY=...") cannot
                # start with the key prefix, so no separate '#' check needed.
                if raw.startswith("LANGSMITH_API_KEY="):
                    value = raw.split("=", 1)[1].strip().strip("'\"")
                    if value:
                        return value
    except OSError:
        pass
    return None


def ensure_langsmith_api_key():
    """Load LANGSMITH_API_KEY from credentials file or .env if not in env.

    Returns:
        True if LANGSMITH_API_KEY is present in os.environ afterwards,
        False if no key could be found anywhere.
    """
    if os.environ.get("LANGSMITH_API_KEY"):
        return True
    # Platform-specific location used by langsmith-cli.
    if platform.system() == "Darwin":
        creds_path = os.path.expanduser("~/Library/Application Support/langsmith-cli/credentials")
    else:
        creds_path = os.path.expanduser("~/.config/langsmith-cli/credentials")
    # Credentials file first, then project-local .env as a fallback.
    for candidate in (creds_path, ".env"):
        if os.path.exists(candidate):
            key = _read_key_from_env_file(candidate)
            if key:
                os.environ["LANGSMITH_API_KEY"] = key
                return True
    return False
56
+
57
+
58
def detect_memorization(client, experiment_name, dataset_name):
    """Check if agent outputs are suspiciously similar to reference outputs."""
    flagged = []
    try:
        root_runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=200))
        # Index dataset examples by stringified id for O(1) lookup per run.
        by_id = {}
        for ex in client.list_examples(dataset_name=dataset_name, limit=500):
            by_id[str(ex.id)] = ex

        for run in root_runs:
            ref_id = run.reference_example_id
            if not ref_id:
                continue
            ex = by_id.get(str(ref_id))
            if not ex or not ex.outputs:
                continue

            actual = str(run.outputs or "").lower().strip()
            expected = str(ex.outputs).lower().strip()
            if not actual or not expected:
                continue

            # Classify the match; only exact copies or long outputs that
            # embed the reference verbatim are considered suspicious.
            match_type = None
            if actual == expected:
                match_type = "exact"
            elif len(actual) > 50 and expected in actual:
                match_type = "contains_reference"

            if match_type:
                flagged.append({
                    "example_id": str(ref_id),
                    "match_type": match_type,
                    "input": str(run.inputs)[:200],
                })

    except Exception as e:
        # Best-effort check: report the failure but return what we have.
        print(f"Error checking memorization: {e}", file=sys.stderr)

    return flagged
95
+
96
+
97
def generate_adversarial_inputs(client, dataset_name, num_inputs=5):
    """Generate adversarial variations of existing examples.

    Creates multiple variation types to test generalization:
    - negation: inverts the question to test if the agent distinguishes
    - constraint: adds a constraint that changes the expected answer
    - ambiguous: makes the input ambiguous to test robustness
    - partial: provides incomplete input to test graceful handling

    Returns a list of {"inputs": ..., "metadata": ...} dicts ready to be
    injected into the dataset; empty list if the dataset has no examples.
    """
    examples = list(client.list_examples(dataset_name=dataset_name, limit=100))
    if not examples:
        return []

    adversarial = []
    sampled = random.sample(examples, min(num_inputs, len(examples)))

    variation_types = [
        ("negation", "What is NOT the case: {input}"),
        ("constraint", "{input} Answer in exactly one sentence."),
        ("ambiguous", "Someone asked something like: {input}"),
        ("partial", "{partial_input}"),
    ]

    # enumerate() replaces the original sampled.index(example) lookup, which
    # was O(n) per iteration and returned the wrong index when the dataset
    # contained duplicate example objects.
    for idx, example in enumerate(sampled):
        input_data = example.inputs or {}
        input_text = str(input_data.get("input", input_data))

        # Rotate through the variation types.
        vtype, template = variation_types[idx % len(variation_types)]

        if vtype == "partial":
            # Use roughly the first half of the input, but at least 3 words.
            words = input_text.split()
            partial = " ".join(words[:max(len(words) // 2, 3)])
            varied_input = template.format(partial_input=partial)
        else:
            varied_input = template.format(input=input_text)

        adversarial.append({
            "inputs": {"input": varied_input},
            "metadata": {
                "source": "adversarial",
                "original_example_id": str(example.id),
                "variation_type": vtype,
            },
        })

    return adversarial
146
+
147
+
148
def inject_adversarial(client, dataset_id, adversarial_inputs):
    """Add adversarial examples to dataset."""
    count = 0
    for item in adversarial_inputs:
        try:
            client.create_example(
                inputs=item["inputs"],
                dataset_id=dataset_id,
                metadata=item["metadata"],
            )
        except Exception as e:
            # Skip failed injections but report them; keep going.
            print(f"Failed to inject: {e}", file=sys.stderr)
        else:
            count += 1
    return count
162
+
163
+
164
def main():
    """CLI entry point: detect memorization, then generate (and optionally
    inject) adversarial examples, emitting a JSON report."""
    parser = argparse.ArgumentParser(description="Adversarial injection for evaluators")
    parser.add_argument("--config", default=".evolver.json")
    parser.add_argument("--experiment", required=True, help="Experiment to check for memorization")
    parser.add_argument("--output", default=None, help="Output report path")
    parser.add_argument("--inject", action="store_true", help="Actually inject adversarial examples")
    parser.add_argument("--num-adversarial", type=int, default=5, help="Number of adversarial examples")
    args = parser.parse_args()

    with open(args.config) as f:
        config = json.load(f)

    ensure_langsmith_api_key()
    # Imported lazily so the key is loaded into the environment first.
    from langsmith import Client
    client = Client()

    suspicious = detect_memorization(client, args.experiment, config["dataset"])
    adversarial = generate_adversarial_inputs(client, config["dataset"], args.num_adversarial)

    injected = 0
    if args.inject and adversarial:
        injected = inject_adversarial(client, config["dataset_id"], adversarial)

    report = {
        "memorization_suspects": len(suspicious),
        "suspicious_examples": suspicious,
        "adversarial_generated": len(adversarial),
        "adversarial_injected": injected,
    }

    rendered = json.dumps(report, indent=2)
    if args.output:
        with open(args.output, "w") as f:
            f.write(rendered)
    print(rendered)

    if suspicious:
        print(f"\nWARNING: {len(suspicious)} examples show potential memorization!", file=sys.stderr)


if __name__ == "__main__":
    main()
@@ -0,0 +1,235 @@
1
+ #!/usr/bin/env python3
2
+ """Cross-iteration memory consolidation for Harness Evolver.
3
+
4
+ Inspired by Claude Code's autoDream pattern. Analyzes evolution history
5
+ to identify recurring patterns, successful strategies, and wasted approaches.
6
+ Produces evolution_memory.md for proposer briefings.
7
+
8
+ Usage:
9
+ python3 consolidate.py --config .evolver.json --output evolution_memory.md
10
+ """
11
+
12
+ import argparse
13
+ import json
14
+ import os
15
+ import sys
16
+ from datetime import datetime, timezone
17
+
18
+
19
def orient(config):
    """Phase 1: Scan current state and history."""
    history = config.get("history", [])
    # The first history entry is the pre-evolution baseline.
    baseline = history[0]["score"] if history else 0
    best = config.get("best_score", 0)
    return {
        "iterations": config.get("iterations", 0),
        "best_score": best,
        "baseline_score": baseline,
        "improvement": best - baseline,
        "history": history,
    }
33
+
34
+
35
def gather(config, comparison_files):
    """Phase 2: Extract signals from trace insights and comparisons."""
    signals = {
        "winning_strategies": [],
        "losing_strategies": [],
        "recurring_failures": {},
        "score_deltas": [],
    }

    # Winners/losers from each past comparison report.
    for path in comparison_files:
        if not os.path.exists(path):
            continue
        try:
            with open(path) as fh:
                payload = json.load(fh)
        except (json.JSONDecodeError, OSError):
            continue
        # Reports may nest the comparison under a "comparison" key.
        comparison = payload.get("comparison", payload)

        winner = comparison.get("winner", {})
        if winner:
            signals["winning_strategies"].append(
                {"experiment": winner.get("experiment", ""), "score": winner.get("score", 0)}
            )

        winner_name = winner.get("experiment")
        for candidate in comparison.get("all_candidates", []):
            if candidate.get("experiment") != winner_name:
                signals["losing_strategies"].append(
                    {"experiment": candidate.get("experiment", ""), "score": candidate.get("score", 0)}
                )

    # Score deltas between consecutive history entries.
    history = config.get("history", [])
    for prev, curr in zip(history, history[1:]):
        signals["score_deltas"].append({
            "version": curr["version"],
            "delta": curr["score"] - prev["score"],
            "score": curr["score"],
        })

    # Tally recurring failure patterns from trace insights, if present.
    if os.path.exists("trace_insights.json"):
        try:
            with open("trace_insights.json") as fh:
                insights = json.load(fh)
        except (json.JSONDecodeError, OSError):
            insights = {}
        tally = signals["recurring_failures"]
        for issue in insights.get("top_issues", []):
            pattern = issue.get("pattern", issue.get("description", "unknown"))
            tally[pattern] = tally.get(pattern, 0) + 1

    return signals
91
+
92
+
93
def consolidate(orientation, signals, existing_memory=None):
    """Phase 3: Merge signals into consolidated memory."""
    insights = []

    # Strategy effectiveness: tally wins by the experiment-name suffix (a-e).
    strategy_map = {"a": "exploit", "b": "explore", "c": "crossover", "d": "failure-targeted-1", "e": "failure-targeted-2"}
    win_counts = {}
    for entry in signals.get("winning_strategies", []):
        experiment = entry.get("experiment", "")
        if not experiment:
            continue
        label = strategy_map.get(experiment[-1], experiment[-1])
        win_counts[label] = win_counts.get(label, 0) + 1

    if win_counts:
        top = max(win_counts, key=win_counts.get)
        insights.append({
            "type": "strategy_effectiveness",
            "insight": f"Most winning strategy: {top} ({win_counts[top]} wins)",
            "recurrence": win_counts[top],
            "data": win_counts,
        })

    # Recurring failures (only promote if seen 2+ times)
    failures = signals.get("recurring_failures", {})
    repeated = [(pat, cnt) for pat, cnt in failures.items() if cnt >= 2]
    for pattern, count in sorted(repeated, key=lambda item: -item[1]):
        insights.append({
            "type": "recurring_failure",
            "insight": f"Recurring failure ({count}x): {pattern}",
            "recurrence": count,
        })

    # Score trajectory summary across all recorded deltas.
    deltas = signals.get("score_deltas", [])
    if deltas:
        ups = sum(1 for d in deltas if d["delta"] > 0)
        downs = sum(1 for d in deltas if d["delta"] < 0)
        flat = sum(1 for d in deltas if abs(d["delta"]) < 0.01)
        insights.append({
            "type": "trajectory",
            "insight": f"Score trajectory: {ups} improvements, {downs} regressions, {flat} stagnant",
            "recurrence": len(deltas),
        })

    # Merge with prior memory: bump recurrence on exact repeats, and carry
    # forward old insights that were already promoted (recurrence >= 2).
    if existing_memory:
        for prior in existing_memory.get("insights", []):
            matched = False
            for current in insights:
                if current["type"] == prior["type"] and current["insight"] == prior["insight"]:
                    current["recurrence"] = max(current["recurrence"], prior.get("recurrence", 1)) + 1
                    matched = True
                    break
            if not matched and prior.get("recurrence", 1) >= 2:
                insights.append(prior)

    return insights
151
+
152
+
153
def prune(insights, max_insights=20):
    """Phase 4: Cap size, remove stale entries."""
    # Stable sort keeps the original order for equal recurrence counts.
    ranked = sorted(insights, key=lambda item: item.get("recurrence", 1), reverse=True)
    return ranked[:max_insights]
157
+
158
+
159
def format_memory(orientation, insights):
    """Format consolidated memory as markdown."""
    # Split insights by recurrence: 2+ are promoted, 1 stays an observation.
    promoted = [item for item in insights if item.get("recurrence", 1) >= 2]
    pending = [item for item in insights if item.get("recurrence", 1) < 2]

    out = [
        "# Evolution Memory",
        "",
        f"*Last updated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}*",
        f"*Iterations: {orientation['iterations']} | Best: {orientation['best_score']:.3f} | Baseline: {orientation['baseline_score']:.3f} | Improvement: +{orientation['improvement']:.3f}*",
        "",
        "## Key Insights (promoted after 2+ recurrences)",
        "",
    ]

    if promoted:
        out.extend(
            f"- **[{item['type']}]** {item['insight']} (seen {item['recurrence']}x)"
            for item in promoted
        )
    else:
        out.append("- No insights promoted yet (need 2+ recurrences)")

    if pending:
        out.extend(["", "## Observations (1 recurrence, pending promotion)", ""])
        out.extend(f"- [{item['type']}] {item['insight']}" for item in pending)

    out.append("")
    return "\n".join(out)
189
+
190
+
191
def main():
    """CLI entry point: run the four-phase consolidation and write both the
    markdown briefing and a JSON snapshot for programmatic access."""
    parser = argparse.ArgumentParser(description="Cross-iteration memory consolidation")
    parser.add_argument("--config", default=".evolver.json")
    parser.add_argument("--output", default="evolution_memory.md", help="Output markdown path")
    parser.add_argument("--output-json", default="evolution_memory.json", help="Output JSON path")
    parser.add_argument("--comparison-files", nargs="*", default=[], help="Past comparison.json files")
    args = parser.parse_args()

    with open(args.config) as f:
        config = json.load(f)

    # Load prior memory if present so recurrence counts carry across runs.
    existing = None
    if os.path.exists(args.output_json):
        try:
            with open(args.output_json) as f:
                existing = json.load(f)
        except (json.JSONDecodeError, OSError):
            pass

    # Four-phase consolidation: orient -> gather -> consolidate -> prune.
    orientation = orient(config)
    signals = gather(config, args.comparison_files or ["comparison.json"])
    insights = prune(consolidate(orientation, signals, existing))

    # Write markdown briefing.
    memory_md = format_memory(orientation, insights)
    with open(args.output, "w") as f:
        f.write(memory_md)

    # Write JSON for programmatic access.
    snapshot = {
        "updated_at": datetime.now(timezone.utc).isoformat(),
        "orientation": orientation,
        "insights": insights,
    }
    with open(args.output_json, "w") as f:
        json.dump(snapshot, f, indent=2)

    print(memory_md)


if __name__ == "__main__":
    main()
@@ -0,0 +1,140 @@
1
+ #!/usr/bin/env python3
2
+ """Three-gate iteration trigger for Harness Evolver.
3
+
4
+ Evaluates whether the next evolution iteration should proceed based on:
5
+ 1. Score gate: skip if no meaningful delta or no clustered failures
6
+ 2. Cost gate: estimate token cost, stop if budget exceeded
7
+ 3. Convergence gate: detect statistical plateau
8
+
9
+ Usage:
10
+ python3 iteration_gate.py --config .evolver.json --output gate_result.json
11
+ """
12
+
13
+ import argparse
14
+ import json
15
+ import os
16
+ import sys
17
+ from datetime import datetime, timezone
18
+
19
+
20
def score_gate(config, threshold=0.02):
    """Check if there's meaningful room for improvement."""
    history = config.get("history", [])
    if len(history) < 2:
        return {"pass": True, "reason": "Not enough history to evaluate"}

    best = config.get("best_score", 0)
    target = config.get("target_score")
    if target and best >= target:
        return {"pass": False, "reason": f"Target reached: {best:.3f} >= {target}"}

    # Look at the spread of the last (up to) 3 scores.
    recent = [entry["score"] for entry in history[-3:]]
    spread = max(recent) - min(recent)
    if len(recent) >= 3 and spread < threshold:
        return {
            "pass": False,
            "reason": f"Plateau detected: last 3 scores within {spread:.4f} (threshold: {threshold})",
            "suggest": "architect",
        }

    return {"pass": True, "reason": f"Score delta exists: range={spread:.4f}"}
43
+
44
+
45
def cost_gate(config, budget_tokens=None):
    """Estimate cost of next iteration and check against budget.

    Sums the recorded per-iteration token costs and fails the gate when the
    remaining budget is below half of an average iteration's cost.  The
    *budget_tokens* argument overrides config["iteration_costs"]["budget_tokens"].
    Passes trivially when there is no cost data yet or no budget configured.
    """
    # NOTE: the original also read config["history"] here but never used it.
    iterations = config.get("iterations", 0)
    cost_data = config.get("iteration_costs", {})

    if not cost_data and iterations == 0:
        return {"pass": True, "reason": "First iteration, no cost data yet"}

    total_spent = sum(cost_data.get("per_iteration", [0]))
    budget = budget_tokens or cost_data.get("budget_tokens")

    if not budget:
        return {"pass": True, "reason": "No budget configured"}

    # max() guards against division by zero on iteration 0.
    avg_cost = total_spent / max(iterations, 1)
    remaining = budget - total_spent

    if remaining < avg_cost * 0.5:
        return {
            "pass": False,
            "reason": f"Budget nearly exhausted: {remaining:,} tokens remaining, avg iteration costs {avg_cost:,.0f}",
        }

    return {"pass": True, "reason": f"Budget OK: {remaining:,} tokens remaining"}
70
+
71
+
72
def convergence_gate(config, min_improvement=0.005, lookback=5):
    """Detect statistical convergence using diminishing returns."""
    history = config.get("history", [])
    if len(history) < 3:
        return {"pass": True, "reason": "Not enough iterations for convergence analysis"}

    window = history[-lookback:] if len(history) >= lookback else history
    deltas = [curr["score"] - prev["score"] for prev, curr in zip(window, window[1:])]

    if not deltas:
        return {"pass": True, "reason": "No deltas to analyze"}

    avg_delta = sum(deltas) / len(deltas)
    # Fraction of recent iterations that actually improved the score.
    improvement_rate = sum(1 for d in deltas if d > 0) / len(deltas)

    if avg_delta < min_improvement and improvement_rate < 0.4:
        return {
            "pass": False,
            "reason": f"Converged: avg delta={avg_delta:.4f}, improvement rate={improvement_rate:.0%}",
            "suggest": "architect" if improvement_rate < 0.2 else "continue_cautious",
        }

    return {
        "pass": True,
        "reason": f"Still improving: avg delta={avg_delta:.4f}, improvement rate={improvement_rate:.0%}",
    }
101
+
102
+
103
def main():
    """CLI entry point: evaluate all three gates, emit a JSON result, and
    exit 0 when the next iteration should proceed (1 otherwise)."""
    parser = argparse.ArgumentParser(description="Three-gate iteration trigger")
    parser.add_argument("--config", default=".evolver.json", help="Config path")
    parser.add_argument("--output", default=None, help="Output JSON path")
    parser.add_argument("--score-threshold", type=float, default=0.02, help="Score plateau threshold")
    parser.add_argument("--budget-tokens", type=int, default=None, help="Token budget override")
    args = parser.parse_args()

    with open(args.config) as f:
        config = json.load(f)

    gates = {
        "score": score_gate(config, args.score_threshold),
        "cost": cost_gate(config, args.budget_tokens),
        "convergence": convergence_gate(config),
    }

    proceed = all(gate["pass"] for gate in gates.values())
    result = {
        "proceed": proceed,
        "gates": gates,
        "suggestions": [gate.get("suggest") for gate in gates.values() if gate.get("suggest")],
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }

    rendered = json.dumps(result, indent=2)
    if args.output:
        with open(args.output, "w") as f:
            f.write(rendered)
    print(rendered)

    # Non-zero exit tells the orchestrator to stop iterating.
    sys.exit(0 if proceed else 1)


if __name__ == "__main__":
    main()