harness-evolver 3.3.1 → 4.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,175 @@
1
+ #!/usr/bin/env python3
2
+ """Track regression examples across evolution iterations.
3
+
4
+ Compares per-example scores between consecutive iterations.
5
+ When an example transitions from failing (<0.5) to passing (>0.8),
6
+ adds a variation to the dataset as a regression guard.
7
+
8
+ Usage:
9
+ python3 regression_tracker.py \
10
+ --config .evolver.json \
11
+ --previous-experiment v001a \
12
+ --current-experiment v002c \
13
+ --output regression_report.json
14
+ """
15
+
16
+ import argparse
17
+ import json
18
+ import os
19
+ import platform
20
+ import sys
21
+
22
+
23
+ def ensure_langsmith_api_key():
24
+ """Load LANGSMITH_API_KEY from credentials file or .env if not in env."""
25
+ if os.environ.get("LANGSMITH_API_KEY"):
26
+ return True
27
+ if platform.system() == "Darwin":
28
+ creds_path = os.path.expanduser("~/Library/Application Support/langsmith-cli/credentials")
29
+ else:
30
+ creds_path = os.path.expanduser("~/.config/langsmith-cli/credentials")
31
+ if os.path.exists(creds_path):
32
+ try:
33
+ with open(creds_path) as f:
34
+ for line in f:
35
+ line = line.strip()
36
+ if line.startswith("LANGSMITH_API_KEY="):
37
+ key = line.split("=", 1)[1].strip()
38
+ if key:
39
+ os.environ["LANGSMITH_API_KEY"] = key
40
+ return True
41
+ except OSError:
42
+ pass
43
+ if os.path.exists(".env"):
44
+ try:
45
+ with open(".env") as f:
46
+ for line in f:
47
+ line = line.strip()
48
+ if line.startswith("LANGSMITH_API_KEY=") and not line.startswith("#"):
49
+ key = line.split("=", 1)[1].strip().strip("'\"")
50
+ if key:
51
+ os.environ["LANGSMITH_API_KEY"] = key
52
+ return True
53
+ except OSError:
54
+ pass
55
+ return False
56
+
57
+
58
def get_per_example_scores(client, experiment_name):
    """Collect the average feedback score plus I/O previews per example.

    Reads up to 200 root runs from *experiment_name* and averages every
    numeric feedback score attached to each run.  Runs without numeric
    feedback score 0.0.  On any API error a message goes to stderr and
    whatever was collected so far is returned.
    """
    per_example = {}
    try:
        root_runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=200))
        for run in root_runs:
            key = str(run.reference_example_id or run.id)
            numeric = {
                fb.key: fb.score
                for fb in client.list_feedback(run_ids=[run.id])
                if fb.score is not None
            }
            mean = sum(numeric.values()) / len(numeric) if numeric else 0.0
            per_example[key] = {
                "score": mean,
                "input": str(run.inputs)[:500] if run.inputs else "",
                "output": str(run.outputs)[:500] if run.outputs else "",
            }
    except Exception as e:
        print(f"Error reading {experiment_name}: {e}", file=sys.stderr)
    return per_example
79
+
80
+
81
def find_transitions(prev_scores, curr_scores, fail_threshold=0.5, pass_threshold=0.8):
    """Classify per-example score movement between two experiments.

    Returns ``(fixed, regressed)``: *fixed* holds examples that moved from
    below *fail_threshold* to at least *pass_threshold*; *regressed* holds
    the opposite movement.  Only examples present in both maps count.
    """
    fixed, regressed = [], []
    for example_id in prev_scores.keys() & curr_scores.keys():
        before = prev_scores[example_id]["score"]
        after = curr_scores[example_id]["score"]
        went_up = before < fail_threshold and after >= pass_threshold
        went_down = before >= pass_threshold and after < fail_threshold
        if not (went_up or went_down):
            continue
        record = {
            "example_id": example_id,
            "prev_score": before,
            "curr_score": after,
            "type": "fixed" if went_up else "regressed",
            "input": curr_scores[example_id]["input"],
        }
        (fixed if went_up else regressed).append(record)
    return fixed, regressed
108
+
109
+
110
def add_regression_guards(client, dataset_id, transitions, max_guards=5):
    """Add up to *max_guards* regression-guard examples to the dataset.

    Each transition's recorded input is parsed back into a dict where
    possible.  Inputs were captured upstream with ``str(run.inputs)`` —
    a Python repr with single quotes, NOT JSON — so ``json.loads`` alone
    failed on virtually every dict input and no guard was ever created.
    Parsing now tries JSON first, then a Python literal, and finally
    wraps the raw string.

    Returns:
        int: number of examples successfully created.
    """
    import ast

    added = 0
    for t in transitions[:max_guards]:
        try:
            raw = t["input"]
            if raw.startswith("{"):
                try:
                    input_data = json.loads(raw)
                except ValueError:
                    # str(run.inputs) yields repr syntax; literal_eval
                    # parses it without executing arbitrary code.
                    input_data = ast.literal_eval(raw)
            else:
                input_data = {"input": raw}
            client.create_example(
                inputs=input_data,
                dataset_id=dataset_id,
                metadata={"source": "regression_guard", "original_example_id": t["example_id"]},
            )
            added += 1
        except Exception as e:
            # A truncated repr (inputs were sliced to 500 chars) can still
            # fail to parse; report and move on, as before.
            print(f"Failed to add guard for {t['example_id']}: {e}", file=sys.stderr)
    return added
125
+
126
+
127
def main():
    """CLI entry point: diff two experiments and emit a regression report.

    Exits non-zero when any regression is detected so CI can fail the run.
    """
    parser = argparse.ArgumentParser(description="Track regressions across iterations")
    parser.add_argument("--config", default=".evolver.json")
    parser.add_argument("--previous-experiment", required=True, help="Previous iteration experiment name")
    parser.add_argument("--current-experiment", required=True, help="Current iteration experiment name")
    parser.add_argument("--output", default=None, help="Output JSON report")
    parser.add_argument("--add-guards", action="store_true", help="Add regression guard examples to dataset")
    parser.add_argument("--max-guards", type=int, default=5, help="Max guard examples to add")
    args = parser.parse_args()

    with open(args.config) as f:
        config = json.load(f)

    # Resolve credentials before constructing the client so it can pick up
    # LANGSMITH_API_KEY from the environment.
    ensure_langsmith_api_key()
    from langsmith import Client
    client = Client()

    prev_scores = get_per_example_scores(client, args.previous_experiment)
    curr_scores = get_per_example_scores(client, args.current_experiment)
    transitions, regressions = find_transitions(prev_scores, curr_scores)

    guards_added = 0
    if args.add_guards and transitions:
        guards_added = add_regression_guards(client, config["dataset_id"], transitions, args.max_guards)

    report = json.dumps(
        {
            "previous": args.previous_experiment,
            "current": args.current_experiment,
            "fixed_count": len(transitions),
            "regression_count": len(regressions),
            "guards_added": guards_added,
            "fixed": transitions,
            "regressions": regressions,
        },
        indent=2,
    )
    if args.output:
        with open(args.output, "w") as f:
            f.write(report)
    print(report)

    if regressions:
        print(f"\nWARNING: {len(regressions)} regressions detected!", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,224 @@
1
+ #!/usr/bin/env python3
2
+ """Synthesize evolution strategy document from trace analysis.
3
+
4
+ Reads trace_insights.json, best_results.json, evolution_memory.json,
5
+ and production_seed.json to produce a targeted strategy document with
6
+ specific file paths and concrete change recommendations for proposers.
7
+
8
+ Usage:
9
+ python3 synthesize_strategy.py \
10
+ --config .evolver.json \
11
+ --trace-insights trace_insights.json \
12
+ --best-results best_results.json \
13
+ --evolution-memory evolution_memory.json \
14
+ --production-seed production_seed.json \
15
+ --output strategy.md
16
+ """
17
+
18
+ import argparse
19
+ import json
20
+ import os
21
+ import sys
22
+
23
+
24
def load_json_safe(path):
    """Parse the JSON file at *path*; return None when absent or invalid."""
    if not path:
        return None
    if not os.path.exists(path):
        return None
    try:
        with open(path) as f:
            return json.load(f)
    except (json.JSONDecodeError, OSError):
        return None
33
+
34
+
35
def identify_target_files(config):
    """List the .py files named in the configured entry_point command.

    Flag-like tokens (leading ``-``) are excluded even if they end in .py.
    """
    tokens = config.get("entry_point", "").split()
    return [tok for tok in tokens if tok.endswith(".py") and not tok.startswith("-")]
44
+
45
+
46
def synthesize(config, insights, results, memory, production=None):
    """Merge analysis artifacts into a single strategy dict.

    Args:
        config: Parsed .evolver.json (entry_point is used for targets).
        insights: trace_insights.json content, or None.
        results: best_results.json content (per-example scores), or None.
        memory: evolution_memory.json content, or None.
        production: production_seed.json content, or None.

    Returns:
        dict with primary_targets, failure_clusters, recommended_approaches,
        avoid, plus failing_examples / production sections when available.
    """
    strategy = {
        "primary_targets": identify_target_files(config),
        "failure_clusters": [],
        "recommended_approaches": [],
        "avoid": [],
    }

    # Highest-priority issues observed in traces (top 5 only).
    if insights:
        for issue in insights.get("top_issues", [])[:5]:
            strategy["failure_clusters"].append({
                "type": issue.get("type", "unknown"),
                "severity": issue.get("severity", "medium"),
                "description": issue.get("description", ""),
                "count": issue.get("count", 0),
            })

    # Single pass over evolution memory (previously iterated twice), with
    # .get() lookups so a malformed insight entry cannot raise KeyError.
    if memory:
        for insight in memory.get("insights", []):
            itype = insight.get("type", "")
            text = insight.get("insight", "")
            if insight.get("recurrence", 0) >= 2:
                if itype == "strategy_effectiveness":
                    strategy["recommended_approaches"].append(text)
                elif itype == "recurring_failure":
                    strategy["failure_clusters"].append({
                        "type": "recurring",
                        "severity": "high",
                        "description": text,
                        "count": insight["recurrence"],  # present: checked above
                    })
            # Approaches that previously lost or regressed are avoided
            # regardless of recurrence count.
            if "losing" in itype or "regression" in itype:
                strategy["avoid"].append(text)

    # Worst-scoring examples from the best run so far (up to 10).
    if results:
        per_example = results.get("per_example", {})
        failing = [(eid, data) for eid, data in per_example.items() if data.get("score", 0) < 0.5]
        failing.sort(key=lambda item: item[1].get("score", 0))
        strategy["failing_examples"] = [
            {
                "example_id": eid,
                # .get for consistency with the filter above; a missing
                # score no longer crashes synthesis.
                "score": data.get("score", 0),
                "input_preview": data.get("input_preview", "")[:200],
                "error": data.get("error"),
            }
            for eid, data in failing[:10]
        ]

    if production:
        prod_data = _production_summary(production)
        if prod_data:
            strategy["production"] = prod_data

    return strategy


def _production_summary(production):
    """Condense the production seed into the fields the strategy doc uses."""
    prod_data = {}
    stats = production.get("stats", {})
    if stats:
        prod_data["total_traces"] = stats.get("total_traces", 0)
        prod_data["error_rate"] = stats.get("error_rate", 0)
    categories = production.get("categories", [])
    if categories:
        prod_data["traffic_distribution"] = categories[:10]
    neg = production.get("negative_feedback_inputs", [])
    if neg:
        prod_data["negative_feedback"] = neg[:5]
    # Older seeds used "errors"; newer ones use "error_patterns".
    errors = production.get("error_patterns", production.get("errors", []))
    if errors:
        prod_data["production_errors"] = errors[:5] if isinstance(errors, list) else []
    slow = production.get("slow_queries", [])
    if slow:
        prod_data["slow_queries"] = slow[:5]
    return prod_data
121
+
122
+
123
def format_strategy_md(strategy, config):
    """Render the strategy dict as a markdown document and return it."""
    out = [
        "# Evolution Strategy Document",
        "",
        f"*Framework: {config.get('framework', 'unknown')} | Entry point: {config.get('entry_point', 'N/A')}*",
        "",
        "## Target Files",
    ]
    out.extend(f"- `{path}`" for path in strategy.get("primary_targets", []))
    out.append("")

    clusters = strategy.get("failure_clusters", [])
    if clusters:
        out.append("## Failure Clusters (prioritized)")
        out.extend(
            f"{rank}. **[{c['severity']}]** {c['description']} (count: {c['count']})"
            for rank, c in enumerate(clusters, 1)
        )
        out.append("")

    approaches = strategy.get("recommended_approaches", [])
    if approaches:
        out.append("## Recommended Approaches (from evolution memory)")
        out.extend(f"- {item}" for item in approaches)
        out.append("")

    avoid = strategy.get("avoid", [])
    if avoid:
        out.append("## Avoid (previously unsuccessful)")
        out.extend(f"- {item}" for item in avoid)
        out.append("")

    failing = strategy.get("failing_examples", [])
    if failing:
        out.append(f"## Top Failing Examples ({len(failing)})")
        for ex in failing:
            suffix = f" — Error: {ex['error'][:80]}" if ex.get("error") else ""
            out.append(
                f"- `{ex['example_id']}` (score: {ex['score']:.2f}): {ex['input_preview'][:100]}{suffix}"
            )
        out.append("")

    prod = strategy.get("production", {})
    if prod:
        out.append("## Production Insights")
        if prod.get("total_traces"):
            out.append(f"- **Traces**: {prod['total_traces']} total, {prod.get('error_rate', 0):.1%} error rate")
        if prod.get("traffic_distribution"):
            out.append(f"- **Traffic**: {', '.join(str(c) for c in prod['traffic_distribution'][:5])}")
        # The three list-valued sections share one rendering shape.
        for label, key in (
            ("Negative feedback inputs", "negative_feedback"),
            ("Production errors", "production_errors"),
            ("Slow queries", "slow_queries"),
        ):
            items = prod.get(key)
            if items:
                out.append(f"- **{label}**:")
                out.extend(f"  - {str(item)[:120]}" for item in items)
        out.append("")

    return "\n".join(out)
190
+
191
+
192
def main():
    """CLI entry point: read artifacts, write strategy.md and strategy.json."""
    parser = argparse.ArgumentParser(description="Synthesize evolution strategy")
    parser.add_argument("--config", default=".evolver.json")
    parser.add_argument("--trace-insights", default="trace_insights.json")
    parser.add_argument("--best-results", default="best_results.json")
    parser.add_argument("--evolution-memory", default="evolution_memory.json")
    parser.add_argument("--production-seed", default="production_seed.json")
    parser.add_argument("--output", default="strategy.md")
    args = parser.parse_args()

    with open(args.config) as f:
        config = json.load(f)

    # Every artifact is optional; load_json_safe returns None when absent.
    strategy = synthesize(
        config,
        load_json_safe(args.trace_insights),
        load_json_safe(args.best_results),
        load_json_safe(args.evolution_memory),
        load_json_safe(args.production_seed),
    )

    md = format_strategy_md(strategy, config)
    with open(args.output, "w") as f:
        f.write(md)

    # Machine-readable twin of the markdown document.
    json_path = args.output.replace(".md", ".json")
    with open(json_path, "w") as f:
        json.dump(strategy, f, indent=2)

    print(md)


if __name__ == "__main__":
    main()
@@ -0,0 +1,212 @@
1
+ #!/usr/bin/env python3
2
+ """Validate .evolver.json state against LangSmith reality.
3
+
4
+ Checks that referenced experiments, datasets, and projects still exist.
5
+ Returns JSON with validation results and any divergences found.
6
+
7
+ Usage:
8
+ python3 validate_state.py --config .evolver.json --output validation.json
9
+ """
10
+
11
+ import argparse
12
+ import json
13
+ import os
14
+ import platform
15
+ import sys
16
+
17
+
18
+ def ensure_langsmith_api_key():
19
+ """Load LANGSMITH_API_KEY from credentials file or .env if not in env."""
20
+ if os.environ.get("LANGSMITH_API_KEY"):
21
+ return True
22
+ if platform.system() == "Darwin":
23
+ creds_path = os.path.expanduser("~/Library/Application Support/langsmith-cli/credentials")
24
+ else:
25
+ creds_path = os.path.expanduser("~/.config/langsmith-cli/credentials")
26
+ if os.path.exists(creds_path):
27
+ try:
28
+ with open(creds_path) as f:
29
+ for line in f:
30
+ line = line.strip()
31
+ if line.startswith("LANGSMITH_API_KEY="):
32
+ key = line.split("=", 1)[1].strip()
33
+ if key:
34
+ os.environ["LANGSMITH_API_KEY"] = key
35
+ return True
36
+ except OSError:
37
+ pass
38
+ if os.path.exists(".env"):
39
+ try:
40
+ with open(".env") as f:
41
+ for line in f:
42
+ line = line.strip()
43
+ if line.startswith("LANGSMITH_API_KEY=") and not line.startswith("#"):
44
+ key = line.split("=", 1)[1].strip().strip("'\"")
45
+ if key:
46
+ os.environ["LANGSMITH_API_KEY"] = key
47
+ return True
48
+ except OSError:
49
+ pass
50
+ return False
51
+
52
+
53
def validate_dataset(client, config):
    """Verify the configured dataset exists; return (issues, example_count).

    A missing dataset name is critical; a dataset_id that disagrees with
    LangSmith is a warning.  The example count is capped at 500 by the
    list_examples limit.
    """
    name = config.get("dataset")
    if not name:
        return [{"field": "dataset", "severity": "critical", "message": "No dataset configured"}], 0
    expected_id = config.get("dataset_id")
    issues = []
    try:
        dataset = client.read_dataset(dataset_name=name)
        if expected_id and str(dataset.id) != expected_id:
            issues.append({
                "field": "dataset_id",
                "severity": "warning",
                "message": f"dataset_id mismatch: config has {expected_id}, LangSmith has {dataset.id}",
            })
        examples = client.list_examples(dataset_id=dataset.id, limit=500)
        return issues, sum(1 for _ in examples)
    except Exception as e:
        issues.append({"field": "dataset", "severity": "critical", "message": f"Dataset not found: {e}"})
        return issues, 0
74
+
75
+
76
def validate_best_experiment(client, config):
    """Confirm the recorded best_experiment is still reachable in LangSmith.

    Returns a list with one critical issue when the experiment has no runs
    or cannot be read; empty when fine or when no best_experiment is set.
    """
    name = config.get("best_experiment")
    if not name:
        return []
    problem = None
    try:
        if not list(client.list_runs(project_name=name, is_root=True, limit=1)):
            problem = f"Best experiment '{name}' has no runs in LangSmith"
    except Exception as e:
        problem = f"Best experiment '{name}' not accessible: {e}"
    if problem is None:
        return []
    return [{"field": "best_experiment", "severity": "critical", "message": problem}]
97
+
98
+
99
def validate_git_state(config):
    """Best-effort check that git HEAD is readable; returns warnings only.

    Note: *config* is accepted for signature parity with the other
    validators but is not consulted here.
    """
    import subprocess

    issues = []
    try:
        proc = subprocess.run(
            ["git", "log", "--oneline", "-1"],
            capture_output=True, text=True, timeout=10,
        )
        if not proc.stdout.strip():
            issues.append({"field": "git", "severity": "warning", "message": "Could not read git HEAD"})
    except Exception as e:
        issues.append({"field": "git", "severity": "warning", "message": f"Git check failed: {e}"})
    return issues
114
+
115
+
116
def _history_issue(config):
    """Warn when the newest history entry outscores the recorded best.

    Returns the issue dict, or None when the last history entry matches
    best_experiment or scores below it.  Uses .get() throughout so a
    malformed history entry cannot raise KeyError (the original indexed
    ``last['experiment']`` directly when building the message).
    """
    history = config.get("history", [])
    if not history:
        return None
    last = history[-1]
    if last.get("experiment") == config.get("best_experiment"):
        return None
    if last.get("score", 0) < config.get("best_score", 0):
        return None
    return {
        "field": "history",
        "severity": "warning",
        "message": (
            f"Last history entry ({last.get('experiment')}) differs from "
            f"best_experiment ({config.get('best_experiment')})"
        ),
    }


def _write_config(path, config):
    """Persist the (possibly repaired) config back to disk."""
    with open(path, "w") as f:
        json.dump(config, f, indent=2)


def _auto_fix(client, config, config_path, all_issues):
    """Apply --fix repairs for the known warning types.

    Mutates *config*, rewrites *config_path* after each repair, and
    downgrades each repaired issue's severity to "fixed".  Returns a list
    of human-readable descriptions of the repairs performed.
    """
    fixed = []
    for issue in all_issues:
        field, severity = issue.get("field"), issue.get("severity")
        if field == "dataset_id" and severity == "warning":
            try:
                dataset = client.read_dataset(dataset_name=config["dataset"])
                config["dataset_id"] = str(dataset.id)
                _write_config(config_path, config)
                fixed.append(f"Fixed dataset_id: updated to {dataset.id}")
                issue["severity"] = "fixed"
            except Exception:
                # Best-effort: leave the warning in place if re-read fails.
                pass
        elif field == "history" and severity == "warning":
            history = config.get("history", [])
            if history:
                best = max(history, key=lambda h: h.get("score", 0))
                config["best_experiment"] = best["experiment"]
                config["best_score"] = best["score"]
                _write_config(config_path, config)
                fixed.append(f"Fixed best_experiment: set to {best['experiment']}")
                issue["severity"] = "fixed"
    return fixed


def main():
    """CLI entry point: validate config vs LangSmith, optionally auto-fix.

    Prints a JSON validation report (and writes it to --output if given);
    exits non-zero when any critical issue remains.
    """
    parser = argparse.ArgumentParser(description="Validate .evolver.json against LangSmith")
    parser.add_argument("--config", default=".evolver.json", help="Config path")
    parser.add_argument("--output", default=None, help="Output JSON path")
    parser.add_argument("--fix", action="store_true", help="Auto-fix divergences where possible")
    args = parser.parse_args()

    if not os.path.exists(args.config):
        print(json.dumps({"valid": False, "issues": [{"severity": "critical", "message": f"{args.config} not found"}]}))
        sys.exit(1)

    with open(args.config) as f:
        config = json.load(f)

    ensure_langsmith_api_key()
    from langsmith import Client
    client = Client()

    all_issues = []
    dataset_issues, example_count = validate_dataset(client, config)
    all_issues.extend(dataset_issues)
    all_issues.extend(validate_best_experiment(client, config))
    all_issues.extend(validate_git_state(config))
    history_issue = _history_issue(config)
    if history_issue:
        all_issues.append(history_issue)

    if args.fix:
        fixed = _auto_fix(client, config, args.config, all_issues)
        if fixed:
            print(f"Auto-fixed {len(fixed)} issues:", file=sys.stderr)
            for msg in fixed:
                print(f"  {msg}", file=sys.stderr)

    # Repaired issues are dropped from the report (no-op without --fix).
    all_issues = [i for i in all_issues if i.get("severity") != "fixed"]
    critical = [i for i in all_issues if i.get("severity") == "critical"]
    result = {
        "valid": len(critical) == 0,
        "issues": all_issues,
        "dataset_examples": example_count,
        "config_iterations": config.get("iterations", 0),
        "config_best_score": config.get("best_score", 0),
    }

    report = json.dumps(result, indent=2)
    if args.output:
        with open(args.output, "w") as f:
            f.write(report)
    print(report)

    if critical:
        sys.exit(1)


if __name__ == "__main__":
    main()