harness-evolver 4.0.3 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/README.md +11 -10
- package/agents/evolver-proposer.md +45 -47
- package/package.json +1 -1
- package/skills/evolve/SKILL.md +173 -65
- package/tools/__pycache__/adversarial_inject.cpython-313.pyc +0 -0
- package/tools/__pycache__/regression_tracker.cpython-313.pyc +0 -0
- package/tools/__pycache__/setup.cpython-313.pyc +0 -0
- package/tools/adversarial_inject.py +8 -3
- package/tools/consolidate.py +7 -15
- package/tools/dataset_health.py +385 -0
- package/tools/read_results.py +21 -2
- package/tools/regression_tracker.py +17 -4
- package/tools/setup.py +23 -0
- package/tools/synthesize_strategy.py +138 -2
- package/tools/trace_insights.py +7 -1
|
@@ -1,10 +1,15 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
|
-
"""Synthesize evolution strategy document
|
|
2
|
+
"""Synthesize evolution strategy document and investigation lenses.
|
|
3
3
|
|
|
4
4
|
Reads trace_insights.json, best_results.json, evolution_memory.json,
|
|
5
5
|
and production_seed.json to produce a targeted strategy document with
|
|
6
6
|
specific file paths and concrete change recommendations for proposers.
|
|
7
7
|
|
|
8
|
+
When --lenses is specified, also generates a lenses.json file containing
|
|
9
|
+
investigation questions derived from failure clusters, architecture issues,
|
|
10
|
+
production data, and evolution memory. Each lens becomes a focused brief
|
|
11
|
+
for one proposer agent.
|
|
12
|
+
|
|
8
13
|
Usage:
|
|
9
14
|
python3 synthesize_strategy.py \
|
|
10
15
|
--config .evolver.json \
|
|
@@ -12,7 +17,8 @@ Usage:
|
|
|
12
17
|
--best-results best_results.json \
|
|
13
18
|
--evolution-memory evolution_memory.json \
|
|
14
19
|
--production-seed production_seed.json \
|
|
15
|
-
--output strategy.md
|
|
20
|
+
--output strategy.md \
|
|
21
|
+
--lenses lenses.json
|
|
16
22
|
"""
|
|
17
23
|
|
|
18
24
|
import argparse
|
|
@@ -120,6 +126,118 @@ def synthesize(config, insights, results, memory, production=None):
|
|
|
120
126
|
return strategy
|
|
121
127
|
|
|
122
128
|
|
|
129
|
+
def _make_lens(question, source, severity, context):
    """Build one lens record; the real id is assigned after sorting."""
    return {
        "id": None,
        "question": question,
        "source": source,
        "severity": severity,
        "context": context,
    }


def _first_memory_insight(memory, insight_type, min_recurrence):
    """Return the first memory insight of the given type that recurred often enough, else None."""
    if not memory:
        return None
    for insight in memory.get("insights", []):
        if insight.get("type") == insight_type and insight.get("recurrence", 0) >= min_recurrence:
            return insight
    return None


def generate_lenses(strategy, config, insights, results, memory, production, max_lenses=5):
    """Generate investigation lenses from available data sources.

    A lens is a dict with the keys ``id``, ``question``, ``source``,
    ``severity`` and ``context``. Lenses are collected from, in order:

    * up to three entries of ``strategy["failure_clusters"]``,
    * the first high-severity structural issue in ``insights["top_issues"]``,
    * a single summary of ``production`` signals (negative feedback,
      errors, slow queries),
    * the first ``strategy_effectiveness`` memory insight seen at least twice,
    * the first ``recurring_failure`` memory insight seen at least three times,
    * one generic "open" lens.

    The collected lenses are then stably sorted by severity
    (critical, high, medium, low; unknown severities rank as medium),
    truncated to ``max_lenses`` and renumbered from 1.

    Args:
        strategy: Strategy dict; ``failure_clusters`` and ``failing_examples``
            are read from it.
        config: Accepted for interface compatibility; not read here.
        insights: Trace insights dict, or a falsy value when unavailable.
        results: Accepted for interface compatibility; not read here.
        memory: Evolution memory dict, or a falsy value when unavailable.
        production: Production seed dict, or a falsy value when unavailable.
        max_lenses: Maximum number of lenses to return. ``None`` means no
            limit; zero or a negative value yields an empty list.

    Returns:
        A list of lens dicts with sequential ``id`` values starting at 1.

    Raises:
        KeyError: If a failure cluster lacks ``description`` or ``severity``,
            a selected failing example lacks ``example_id``, or a selected
            issue/insight lacks the field interpolated into its question.
    """
    lenses = []
    failing_examples = strategy.get("failing_examples", [])

    # Failure cluster lenses (one per distinct cluster, max 3)
    for cluster in strategy.get("failure_clusters", [])[:3]:
        cluster_type = str(cluster.get("type") or "")
        examples = []
        # Only match on a real type: the empty string is a substring of every
        # error message and would otherwise attach unrelated examples.
        if cluster_type:
            examples = [
                ex["example_id"]
                for ex in failing_examples
                if ex.get("error") and cluster_type in str(ex["error"])
            ]
        if not examples:
            # Nothing tied to this cluster: fall back to the first few failures.
            examples = [ex["example_id"] for ex in failing_examples[:3]]
        lenses.append(_make_lens(
            f"{cluster['description']} — what code change would fix this?",
            "failure_cluster",
            cluster["severity"],
            {"examples": examples[:5]},
        ))

    # Architecture lens from trace insights (at most 1)
    if insights:
        structural_types = ("architecture", "routing", "topology", "structure")
        for issue in insights.get("top_issues", []):
            if issue.get("severity") == "high" and issue.get("type") in structural_types:
                lenses.append(_make_lens(
                    f"Architectural issue: {issue['description']} — what structural change would help?",
                    "architecture",
                    "high",
                    {"issue_type": issue["type"]},
                ))
                break

    # Production lens: all production signals are folded into one question
    if production:
        prod_issues = []
        neg = production.get("negative_feedback_inputs", [])
        if neg:
            prod_issues.append(f"Users gave negative feedback on {len(neg)} queries")
        errors = production.get("error_patterns", production.get("errors", []))
        if errors and isinstance(errors, list):
            # Only the first error is quoted, clipped to 100 characters.
            prod_issues.append(f"Production errors: {str(errors[0])[:100]}")
        slow = production.get("slow_queries", [])
        if slow:
            prod_issues.append(f"{len(slow)} slow queries detected")
        if prod_issues:
            summary = "; ".join(prod_issues)
            lenses.append(_make_lens(
                f"Production data shows: {summary}. How should the agent handle these real-world patterns?",
                "production",
                "high",
                {},
            ))

    # Evolution memory lens — winning patterns (at most 1)
    winning = _first_memory_insight(memory, "strategy_effectiveness", 2)
    if winning is not None:
        lenses.append(_make_lens(
            f"{winning['insight']} — what further improvements in this direction are possible?",
            "evolution_memory",
            "medium",
            {"recurrence": winning["recurrence"]},
        ))

    # Evolution memory lens — persistent failures (at most 1)
    persistent = _first_memory_insight(memory, "recurring_failure", 3)
    if persistent is not None:
        lenses.append(_make_lens(
            f"{persistent['insight']} — this has persisted {persistent['recurrence']} iterations. Why?",
            "persistent_failure",
            "critical",
            {"recurrence": persistent["recurrence"]},
        ))

    # Open lens.
    # NOTE(review): it is always *added*, but as a medium-severity lens appended
    # last it is the first to be dropped by the truncation below when more than
    # max_lenses lenses exist — confirm whether it should be guaranteed a slot.
    lenses.append(_make_lens(
        "Open investigation — read all context and investigate what stands out most to you.",
        "open",
        "medium",
        {},
    ))

    # Sort by severity (stable, so insertion order breaks ties), take top max_lenses
    severity_order = {"critical": 0, "high": 1, "medium": 2, "low": 3}
    lenses.sort(key=lambda lens: severity_order.get(lens["severity"], 2))
    if max_lenses is not None:
        # Clamp so a negative limit means "none" rather than "all but the last N".
        lenses = lenses[:max(0, max_lenses)]

    # Assign sequential IDs after sorting/truncating
    for position, lens in enumerate(lenses, start=1):
        lens["id"] = position

    return lenses
|
|
239
|
+
|
|
240
|
+
|
|
123
241
|
def format_strategy_md(strategy, config):
|
|
124
242
|
"""Format strategy as markdown document."""
|
|
125
243
|
lines = [
|
|
@@ -197,6 +315,7 @@ def main():
|
|
|
197
315
|
parser.add_argument("--evolution-memory", default="evolution_memory.json")
|
|
198
316
|
parser.add_argument("--production-seed", default="production_seed.json")
|
|
199
317
|
parser.add_argument("--output", default="strategy.md")
|
|
318
|
+
parser.add_argument("--lenses", default=None, help="Output path for lenses JSON")
|
|
200
319
|
args = parser.parse_args()
|
|
201
320
|
|
|
202
321
|
with open(args.config) as f:
|
|
@@ -217,6 +336,23 @@ def main():
|
|
|
217
336
|
with open(json_path, "w") as f:
|
|
218
337
|
json.dump(strategy, f, indent=2)
|
|
219
338
|
|
|
339
|
+
# Generate lenses if requested
|
|
340
|
+
if args.lenses:
|
|
341
|
+
max_proposers = config.get("max_proposers", 5)
|
|
342
|
+
lens_list = generate_lenses(
|
|
343
|
+
strategy, config, insights, results, memory, production,
|
|
344
|
+
max_lenses=max_proposers,
|
|
345
|
+
)
|
|
346
|
+
from datetime import datetime, timezone
|
|
347
|
+
lenses_output = {
|
|
348
|
+
"generated_at": datetime.now(timezone.utc).isoformat(),
|
|
349
|
+
"lens_count": len(lens_list),
|
|
350
|
+
"lenses": lens_list,
|
|
351
|
+
}
|
|
352
|
+
with open(args.lenses, "w") as f:
|
|
353
|
+
json.dump(lenses_output, f, indent=2)
|
|
354
|
+
print(f"Generated {len(lens_list)} lenses → {args.lenses}", file=sys.stderr)
|
|
355
|
+
|
|
220
356
|
print(md)
|
|
221
357
|
|
|
222
358
|
|
package/tools/trace_insights.py
CHANGED
|
@@ -335,10 +335,16 @@ def fetch_scores_from_experiment(experiment_name):
|
|
|
335
335
|
limit=200,
|
|
336
336
|
))
|
|
337
337
|
|
|
338
|
+
all_run_ids = [run.id for run in runs]
|
|
339
|
+
all_feedbacks = list(client.list_feedback(run_ids=all_run_ids))
|
|
340
|
+
fb_map = {}
|
|
341
|
+
for fb in all_feedbacks:
|
|
342
|
+
fb_map.setdefault(str(fb.run_id), []).append(fb)
|
|
343
|
+
|
|
338
344
|
per_task = {}
|
|
339
345
|
for run in runs:
|
|
340
346
|
example_id = str(run.reference_example_id or run.id)
|
|
341
|
-
feedbacks =
|
|
347
|
+
feedbacks = fb_map.get(str(run.id), [])
|
|
342
348
|
scores = [fb.score for fb in feedbacks if fb.score is not None]
|
|
343
349
|
avg_score = sum(scores) / len(scores) if scores else 0.0
|
|
344
350
|
per_task[example_id] = {"score": avg_score}
|