harness-evolver 4.0.3 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,15 @@
1
1
  #!/usr/bin/env python3
2
- """Synthesize evolution strategy document from trace analysis.
2
+ """Synthesize evolution strategy document and investigation lenses.
3
3
 
4
4
  Reads trace_insights.json, best_results.json, evolution_memory.json,
5
5
  and production_seed.json to produce a targeted strategy document with
6
6
  specific file paths and concrete change recommendations for proposers.
7
7
 
8
+ When --lenses is specified, also generates a lenses.json file containing
9
+ investigation questions derived from failure clusters, architecture issues,
10
+ production data, and evolution memory. Each lens becomes a focused brief
11
+ for one proposer agent.
12
+
8
13
  Usage:
9
14
  python3 synthesize_strategy.py \
10
15
  --config .evolver.json \
@@ -12,7 +17,8 @@ Usage:
12
17
  --best-results best_results.json \
13
18
  --evolution-memory evolution_memory.json \
14
19
  --production-seed production_seed.json \
15
- --output strategy.md
20
+ --output strategy.md \
21
+ --lenses lenses.json
16
22
  """
17
23
 
18
24
  import argparse
@@ -120,6 +126,118 @@ def synthesize(config, insights, results, memory, production=None):
120
126
  return strategy
121
127
 
122
128
 
129
def generate_lenses(strategy, config, insights, results, memory, production, max_lenses=5):
    """Generate investigation lenses from the available analysis sources.

    Each lens is a focused investigation brief for one proposer agent.
    Lenses are derived from failure clusters, high-severity architecture
    issues in trace insights, production signals, and evolution memory;
    an open "investigate anything" lens is always appended so at least
    one lens exists even when no other data is available.

    Args:
        strategy: Synthesized strategy dict; reads "failure_clusters"
            and "failing_examples".
        config: Evolver configuration (currently unused; kept for
            interface stability).
        insights: Parsed trace_insights.json, or None.
        results: Parsed best_results.json (currently unused; kept for
            interface stability).
        memory: Parsed evolution_memory.json, or None.
        production: Parsed production_seed.json, or None.
        max_lenses: Maximum number of lenses to return after severity
            ranking.

    Returns:
        List of lens dicts (keys: id, question, source, severity,
        context), sorted by severity (critical > high > medium > low),
        truncated to max_lenses, with sequential 1-based ids.
    """
    lenses = []
    lens_id = 0

    # Failure cluster lenses (one per distinct cluster, max 3)
    failing = strategy.get("failing_examples", [])
    for cluster in strategy.get("failure_clusters", [])[:3]:
        lens_id += 1
        cluster_type = cluster.get("type", "")
        # Attach examples whose error text mentions the cluster type.
        # BUGFIX: only substring-match when the type is non-empty — an
        # empty string is a substring of every string, so a cluster with
        # no "type" used to swallow every erroring example instead of
        # falling back to the generic first-3 sample below.
        examples = []
        if cluster_type:
            examples = [
                ex["example_id"]
                for ex in failing
                if ex.get("error") and cluster_type in str(ex.get("error", ""))
            ]
        if not examples:
            # Generic fallback context: the first few failing examples.
            examples = [ex["example_id"] for ex in failing[:3]]
        lenses.append({
            "id": lens_id,
            "question": f"{cluster['description']} — what code change would fix this?",
            "source": "failure_cluster",
            "severity": cluster["severity"],
            "context": {"examples": examples[:5]},
        })

    # Architecture lens from trace insights (at most one)
    if insights:
        for issue in insights.get("top_issues", []):
            if issue.get("severity") == "high" and issue.get("type") in (
                "architecture", "routing", "topology", "structure",
            ):
                lens_id += 1
                lenses.append({
                    "id": lens_id,
                    "question": (
                        f"Architectural issue: {issue['description']} — "
                        "what structural change would help?"
                    ),
                    "source": "architecture",
                    "severity": "high",
                    "context": {"issue_type": issue["type"]},
                })
                break  # at most 1 architecture lens

    # Production lens: summarize whatever real-world signals are present.
    if production:
        prod_issues = []
        neg = production.get("negative_feedback_inputs", [])
        if neg:
            prod_issues.append(f"Users gave negative feedback on {len(neg)} queries")
        # Accept either key; "error_patterns" takes precedence over "errors".
        errors = production.get("error_patterns", production.get("errors", []))
        if errors and isinstance(errors, list) and len(errors) > 0:
            prod_issues.append(f"Production errors: {str(errors[0])[:100]}")
        slow = production.get("slow_queries", [])
        if slow:
            prod_issues.append(f"{len(slow)} slow queries detected")
        if prod_issues:
            lens_id += 1
            lenses.append({
                "id": lens_id,
                "question": f"Production data shows: {'; '.join(prod_issues)}. How should the agent handle these real-world patterns?",
                "source": "production",
                "severity": "high",
                "context": {},
            })

    # Evolution memory lens — winning patterns (at most one)
    if memory:
        for insight in memory.get("insights", []):
            if insight.get("type") == "strategy_effectiveness" and insight.get("recurrence", 0) >= 2:
                lens_id += 1
                lenses.append({
                    "id": lens_id,
                    "question": f"{insight['insight']} — what further improvements in this direction are possible?",
                    "source": "evolution_memory",
                    "severity": "medium",
                    "context": {"recurrence": insight["recurrence"]},
                })
                break  # at most 1 memory lens

    # Evolution memory lens — persistent failures (at most one)
    if memory:
        for insight in memory.get("insights", []):
            if insight.get("type") == "recurring_failure" and insight.get("recurrence", 0) >= 3:
                lens_id += 1
                lenses.append({
                    "id": lens_id,
                    "question": f"{insight['insight']} — this has persisted {insight['recurrence']} iterations. Why?",
                    "source": "persistent_failure",
                    "severity": "critical",
                    "context": {"recurrence": insight["recurrence"]},
                })
                break  # at most 1 persistent failure lens

    # Open lens (always included) so there is never an empty lens set.
    lens_id += 1
    lenses.append({
        "id": lens_id,
        "question": "Open investigation — read all context and investigate what stands out most to you.",
        "source": "open",
        "severity": "medium",
        "context": {},
    })

    # Rank by severity and keep the top max_lenses.  list.sort is stable,
    # so lenses of equal severity keep their generation order.
    severity_order = {"critical": 0, "high": 1, "medium": 2, "low": 3}
    lenses.sort(key=lambda l: severity_order.get(l["severity"], 2))
    lenses = lenses[:max_lenses]

    # Reassign sequential IDs after sorting/truncating.
    for i, lens in enumerate(lenses):
        lens["id"] = i + 1

    return lenses
239
+
240
+
123
241
  def format_strategy_md(strategy, config):
124
242
  """Format strategy as markdown document."""
125
243
  lines = [
@@ -197,6 +315,7 @@ def main():
197
315
  parser.add_argument("--evolution-memory", default="evolution_memory.json")
198
316
  parser.add_argument("--production-seed", default="production_seed.json")
199
317
  parser.add_argument("--output", default="strategy.md")
318
+ parser.add_argument("--lenses", default=None, help="Output path for lenses JSON")
200
319
  args = parser.parse_args()
201
320
 
202
321
  with open(args.config) as f:
@@ -217,6 +336,23 @@ def main():
217
336
  with open(json_path, "w") as f:
218
337
  json.dump(strategy, f, indent=2)
219
338
 
339
+ # Generate lenses if requested
340
+ if args.lenses:
341
+ max_proposers = config.get("max_proposers", 5)
342
+ lens_list = generate_lenses(
343
+ strategy, config, insights, results, memory, production,
344
+ max_lenses=max_proposers,
345
+ )
346
+ from datetime import datetime, timezone
347
+ lenses_output = {
348
+ "generated_at": datetime.now(timezone.utc).isoformat(),
349
+ "lens_count": len(lens_list),
350
+ "lenses": lens_list,
351
+ }
352
+ with open(args.lenses, "w") as f:
353
+ json.dump(lenses_output, f, indent=2)
354
+ print(f"Generated {len(lens_list)} lenses → {args.lenses}", file=sys.stderr)
355
+
220
356
  print(md)
221
357
 
222
358
 
@@ -335,10 +335,16 @@ def fetch_scores_from_experiment(experiment_name):
335
335
  limit=200,
336
336
  ))
337
337
 
338
+ all_run_ids = [run.id for run in runs]
339
+ all_feedbacks = list(client.list_feedback(run_ids=all_run_ids))
340
+ fb_map = {}
341
+ for fb in all_feedbacks:
342
+ fb_map.setdefault(str(fb.run_id), []).append(fb)
343
+
338
344
  per_task = {}
339
345
  for run in runs:
340
346
  example_id = str(run.reference_example_id or run.id)
341
- feedbacks = list(client.list_feedback(run_ids=[run.id]))
347
+ feedbacks = fb_map.get(str(run.id), [])
342
348
  scores = [fb.score for fb in feedbacks if fb.score is not None]
343
349
  avg_score = sum(scores) / len(scores) if scores else 0.0
344
350
  per_task[example_id] = {"score": avg_score}