claude-turing 2.3.0 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,696 @@
+ #!/usr/bin/env python3
+ """Automated ensemble construction for the autoresearch pipeline.
+
+ Builds ensembles from the top-K experiments automatically. Tries voting,
+ weighted voting, stacking, and blending. Often yields a 1-3% improvement
+ from models that are already trained, at zero additional training cost.
+
+ Usage:
+     python scripts/build_ensemble.py
+     python scripts/build_ensemble.py --top-k 5
+     python scripts/build_ensemble.py --methods voting,stacking
+     python scripts/build_ensemble.py --json
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import sys
+ from datetime import datetime, timezone
+ from pathlib import Path
+
+ import numpy as np
+ import yaml
+
+ from scripts.turing_io import load_config, load_experiments
+
+ DEFAULT_LOG_PATH = "experiments/log.jsonl"
+ DEFAULT_TOP_K = 5
+ DEFAULT_METHODS = ["voting", "weighted_voting", "stacking", "blending"]
+ BLEND_HOLDOUT_RATIO = 0.3
+ MIN_DIVERSITY_THRESHOLD = 0.95  # Correlation above this is treated as too similar
+
+
+ # --- Model Selection ---
+
+
+ def select_top_k(
+     experiments: list[dict],
+     primary_metric: str,
+     k: int,
+     lower_is_better: bool = False,
+ ) -> list[dict]:
+     """Select top-K experiments by primary metric.
+
+     Args:
+         experiments: All experiments.
+         primary_metric: Metric to rank by.
+         k: Number of top experiments.
+         lower_is_better: Whether lower metric is better.
+
+     Returns:
+         Top-K experiments sorted by metric.
+     """
+     kept = [e for e in experiments if e.get("status") == "kept"]
+     if not kept:
+         kept = [e for e in experiments if primary_metric in e.get("metrics", {})]
+
+     with_metric = [
+         e for e in kept
+         if isinstance(e.get("metrics", {}).get(primary_metric), (int, float))
+     ]
+
+     with_metric.sort(
+         key=lambda e: e["metrics"][primary_metric],
+         reverse=not lower_is_better,
+     )
+
+     return with_metric[:k]
+
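+ # Illustrative sketch of the select_top_k contract (toy records assumed
+ # here, not taken from a real experiment log): kept runs are ranked by the
+ # primary metric, highest first by default.
+ #
+ # >>> runs = [
+ # ...     {"status": "kept", "metrics": {"accuracy": 0.81}},
+ # ...     {"status": "kept", "metrics": {"accuracy": 0.90}},
+ # ...     {"status": "pruned", "metrics": {"accuracy": 0.99}},
+ # ... ]
+ # >>> [r["metrics"]["accuracy"] for r in select_top_k(runs, "accuracy", 2)]
+ # [0.9, 0.81]
+
+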
+ def compute_prediction_correlation(predictions: list[np.ndarray]) -> np.ndarray:
+     """Compute pairwise correlation matrix of model predictions.
+
+     Args:
+         predictions: List of prediction arrays, one per model.
+
+     Returns:
+         NxN correlation matrix.
+     """
+     n = len(predictions)
+     if n < 2:
+         return np.eye(n)
+
+     corr = np.eye(n)
+     for i in range(n):
+         for j in range(i + 1, n):
+             if len(predictions[i]) == len(predictions[j]) and len(predictions[i]) > 0:
+                 c = np.corrcoef(predictions[i].ravel(), predictions[j].ravel())[0, 1]
+                 if np.isnan(c):
+                     c = 0.0
+                 corr[i, j] = c
+                 corr[j, i] = c
+
+     return corr
+
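+ # Quick sanity check of compute_prediction_correlation (assumed arrays):
+ # identical predictions correlate at 1.0 and opposed predictions at -1.0,
+ # so off-diagonal entries near 1.0 flag redundant models.
+ #
+ # >>> a = np.array([1.0, 2.0, 3.0])
+ # >>> compute_prediction_correlation([a, -a])
+ # array([[ 1., -1.],
+ #        [-1.,  1.]])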
+
+ def filter_diverse_models(
+     experiments: list[dict],
+     predictions: list[np.ndarray] | None,
+     threshold: float = MIN_DIVERSITY_THRESHOLD,
+ ) -> tuple[list[dict], list[int]]:
+     """Filter out models with highly correlated predictions.
+
+     Args:
+         experiments: Candidate experiments.
+         predictions: Prediction arrays (same order as experiments).
+         threshold: Max correlation to keep both models.
+
+     Returns:
+         (filtered_experiments, kept_indices)
+     """
+     if predictions is None or len(predictions) < 2:
+         return experiments, list(range(len(experiments)))
+
+     corr = compute_prediction_correlation(predictions)
+     n = len(experiments)
+     kept = [True] * n
+
+     for i in range(n):
+         if not kept[i]:
+             continue
+         for j in range(i + 1, n):
+             if not kept[j]:
+                 continue
+             if abs(corr[i, j]) > threshold:
+                 # Drop the worse model, comparing each experiment's first
+                 # recorded metric (assumes higher is better).
+                 metric_i = next(iter(experiments[i].get("metrics", {}).values()), 0)
+                 metric_j = next(iter(experiments[j].get("metrics", {}).values()), 0)
+                 if metric_j >= metric_i:
+                     kept[i] = False
+                 else:
+                     kept[j] = False
+
+     indices = [i for i in range(n) if kept[i]]
+     filtered = [experiments[i] for i in indices]
+     return filtered, indices
+
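+ # Hypothetical illustration of filter_diverse_models: two models whose
+ # predictions are perfectly correlated exceed the 0.95 threshold, so only
+ # the better-scoring one survives (index 0 here).
+ #
+ # >>> exps = [{"metrics": {"accuracy": 0.9}}, {"metrics": {"accuracy": 0.8}}]
+ # >>> preds = [np.array([1.0, 2.0, 3.0]), np.array([2.0, 4.0, 6.0])]
+ # >>> filter_diverse_models(exps, preds)[1]
+ # [0]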
+
+ # --- Ensemble Methods ---
+
+
+ def voting_ensemble(
+     predictions: list[np.ndarray],
+     task_type: str = "classification",
+ ) -> np.ndarray:
+     """Uniform voting ensemble.
+
+     Classification: majority vote.
+     Regression: simple mean.
+     """
+     if not predictions:
+         return np.array([])
+
+     stacked = np.stack(predictions)
+
+     if task_type == "classification":
+         # Majority vote (assumes integer class labels). scipy is only needed
+         # on this branch, so it is imported lazily.
+         from scipy import stats as scipy_stats
+         result, _ = scipy_stats.mode(stacked, axis=0, keepdims=False)
+         return result.ravel()
+     else:
+         return np.mean(stacked, axis=0)
+
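+ # Minimal voting_ensemble sketch (assumed integer class labels): each column
+ # is one sample, and the per-sample majority wins.
+ #
+ # >>> voting_ensemble(
+ # ...     [np.array([1, 0, 1]), np.array([1, 1, 0]), np.array([0, 1, 1])],
+ # ...     task_type="classification")
+ # array([1, 1, 1])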
+
+ def weighted_voting_ensemble(
+     predictions: list[np.ndarray],
+     weights: list[float],
+     task_type: str = "classification",
+ ) -> np.ndarray:
+     """Weighted voting/averaging ensemble.
+
+     Classification: weighted majority vote.
+     Regression: weighted mean.
+     """
+     if not predictions or not weights:
+         return np.array([])
+
+     w = np.array(weights, dtype=float)
+     if w.sum() <= 0:
+         w = np.ones_like(w)  # Degenerate weights: fall back to uniform
+     w = w / w.sum()  # Normalize
+
+     if task_type == "classification":
+         # Weighted vote: accumulate votes per class
+         stacked = np.stack(predictions)
+         n_samples = stacked.shape[1] if stacked.ndim > 1 else len(stacked[0])
+         result = np.zeros(n_samples)
+         for idx in range(n_samples):
+             class_votes = {}
+             for m, pred in enumerate(predictions):
+                 val = pred[idx] if idx < len(pred) else 0
+                 class_votes[val] = class_votes.get(val, 0) + w[m]
+             result[idx] = max(class_votes, key=class_votes.get)
+         return result
+     else:
+         stacked = np.stack(predictions)
+         return np.average(stacked, axis=0, weights=w)
+
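+ # Weighted-vote sketch (assumed weights): the 0.6-weight model outvotes the
+ # two 0.2-weight models even though they agree with each other.
+ #
+ # >>> weighted_voting_ensemble(
+ # ...     [np.array([0, 0]), np.array([1, 1]), np.array([1, 1])],
+ # ...     weights=[0.6, 0.2, 0.2], task_type="classification")
+ # array([0., 0.])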
+
+ def stacking_ensemble(
+     predictions: list[np.ndarray],
+     labels: np.ndarray,
+     task_type: str = "classification",
+     n_folds: int = 5,
+ ) -> dict:
+     """Stacking ensemble with cross-validated meta-learner.
+
+     Fits a closed-form ridge meta-learner on out-of-fold predictions from
+     the base models; for classification, the blended scores are rounded to
+     the nearest class label.
+
+     Returns:
+         Dict with meta_predictions, meta_weights, meta_model_type.
+     """
+     if not predictions or len(labels) == 0:
+         return {"meta_predictions": np.array([]), "meta_weights": [], "meta_model_type": "none"}
+
+     # Build meta-features: NxM matrix (N samples, M models)
+     X_meta = np.column_stack(predictions)
+     y = labels
+
+     n_samples = len(y)
+     if n_samples < n_folds:
+         n_folds = max(2, n_samples)
+
+     fold_size = n_samples // n_folds
+     oof_predictions = np.zeros(n_samples)
+     meta_weights = []
+
+     for fold in range(n_folds):
+         start = fold * fold_size
+         end = start + fold_size if fold < n_folds - 1 else n_samples
+
+         val_idx = list(range(start, end))
+         train_idx = [i for i in range(n_samples) if i not in val_idx]
+
+         X_train, X_val = X_meta[train_idx], X_meta[val_idx]
+         y_train = y[train_idx]
+
+         # The same closed-form ridge fit serves both task types; it keeps
+         # the dependency footprint minimal.
+         weights = _fit_linear_meta(X_train, y_train, regularize=True)
+
+         oof_predictions[val_idx] = X_val @ weights
+         meta_weights.append(weights)
+
+     # Average weights across folds
+     avg_weights = np.mean(meta_weights, axis=0)
+
+     if task_type == "classification":
+         oof_predictions = np.round(oof_predictions).astype(int)
+
+     return {
+         "meta_predictions": oof_predictions,
+         "meta_weights": avg_weights.tolist(),
+         "meta_model_type": "ridge",
+     }
+
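+ # Stacking sketch (assumed inputs, not from the shipped test suite):
+ # out-of-fold meta-predictions come back with one entry per labelled
+ # sample, alongside the fold-averaged weights.
+ #
+ # >>> out = stacking_ensemble(
+ # ...     [np.array([0, 1, 1, 0, 1]), np.array([0, 1, 0, 0, 1])],
+ # ...     labels=np.array([0, 1, 1, 0, 1]))
+ # >>> sorted(out)
+ # ['meta_model_type', 'meta_predictions', 'meta_weights']
+ # >>> len(out["meta_predictions"])
+ # 5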
+
+ def blending_ensemble(
+     predictions: list[np.ndarray],
+     labels: np.ndarray,
+     task_type: str = "classification",
+     holdout_ratio: float = BLEND_HOLDOUT_RATIO,
+ ) -> dict:
+     """Blending ensemble using a holdout set for the meta-learner.
+
+     Simpler than stacking (no cross-validation), but less data-efficient.
+
+     Returns:
+         Dict with meta_predictions, meta_weights, holdout_size, and
+         holdout_labels (the labels the meta-predictions are scored against).
+     """
+     if not predictions or len(labels) == 0:
+         return {"meta_predictions": np.array([]), "meta_weights": [], "holdout_size": 0}
+
+     X_meta = np.column_stack(predictions)
+     y = labels
+
+     n_samples = len(y)
+     split = int(n_samples * (1 - holdout_ratio))
+     if split < 2 or n_samples - split < 2:
+         return {"meta_predictions": np.array([]), "meta_weights": [], "holdout_size": 0}
+
+     X_train, X_val = X_meta[:split], X_meta[split:]
+     y_train, y_val = y[:split], y[split:]
+
+     weights = _fit_linear_meta(X_train, y_train, regularize=True)
+     blend_predictions = X_val @ weights
+
+     if task_type == "classification":
+         blend_predictions = np.round(blend_predictions).astype(int)
+
+     return {
+         "meta_predictions": blend_predictions,
+         "meta_weights": weights.tolist(),
+         "holdout_size": n_samples - split,
+         "holdout_labels": y_val,
+     }
+
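+ # Blending sketch (assumed regression data): with ten samples and the 0.3
+ # holdout ratio, the meta-learner trains on seven and is scored on three.
+ #
+ # >>> y = np.arange(10.0)
+ # >>> out = blending_ensemble([y, y], labels=y, task_type="regression")
+ # >>> out["holdout_size"]
+ # 3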
+
+ def _fit_linear_meta(X: np.ndarray, y: np.ndarray, regularize: bool = True) -> np.ndarray:
+     """Fit a linear meta-learner (ridge regression).
+
+     Returns weight vector of shape (n_models,).
+     """
+     n_features = X.shape[1]
+     alpha = 1.0 if regularize else 0.0
+
+     # Ridge: w = (X^T X + alpha I)^-1 X^T y
+     XtX = X.T @ X + alpha * np.eye(n_features)
+     Xty = X.T @ y
+
+     try:
+         weights = np.linalg.solve(XtX, Xty)
+     except np.linalg.LinAlgError:
+         # Singular system (e.g. alpha = 0 with collinear inputs):
+         # fall back to uniform weights.
+         weights = np.ones(n_features) / n_features
+
+     return weights
+
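+ # Worked example of the closed-form ridge solve above (hypothetical inputs):
+ # with X = I and alpha = 1, XtX + alpha*I = 2I, so each weight is y / 2.
+ #
+ # >>> _fit_linear_meta(np.eye(2), np.array([1.0, 2.0]))
+ # array([0.5, 1. ])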
+
+ # --- Evaluation ---
+
+
+ def evaluate_ensemble(
+     predictions: np.ndarray,
+     labels: np.ndarray,
+     task_type: str = "classification",
+ ) -> dict:
+     """Evaluate ensemble predictions against ground truth.
+
+     Returns dict with accuracy (classification) or mse/rmse (regression).
+     """
+     if len(predictions) == 0 or len(labels) == 0:
+         return {}
+
+     min_len = min(len(predictions), len(labels))
+     predictions = predictions[:min_len]
+     labels = labels[:min_len]
+
+     if task_type == "classification":
+         correct = np.sum(predictions == labels)
+         return {
+             "accuracy": round(float(correct / min_len), 6),
+             "n_samples": min_len,
+         }
+     else:
+         mse = float(np.mean((predictions - labels) ** 2))
+         return {
+             "mse": round(mse, 6),
+             "rmse": round(float(np.sqrt(mse)), 6),
+             "n_samples": min_len,
+         }
+
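+ # Quick check of evaluate_ensemble (assumed arrays): two of three
+ # predictions match the labels, so accuracy is 2/3 rounded to six places.
+ #
+ # >>> evaluate_ensemble(np.array([1, 0, 1]), np.array([1, 1, 1]))
+ # {'accuracy': 0.666667, 'n_samples': 3}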
+
+ # --- Full Ensemble Pipeline ---
+
+
+ def build_ensemble(
+     top_k: int = DEFAULT_TOP_K,
+     methods: list[str] | None = None,
+     config_path: str = "config.yaml",
+     log_path: str = DEFAULT_LOG_PATH,
+     predictions_dir: str = "experiments/predictions",
+ ) -> dict:
+     """Build and evaluate ensembles from top-K experiments.
+
+     Args:
+         top_k: Number of top models to consider.
+         methods: Ensemble methods to try.
+         config_path: Path to config.yaml.
+         log_path: Path to experiment log.
+         predictions_dir: Directory containing saved predictions.
+
+     Returns:
+         Complete ensemble report.
+     """
+     if methods is None:
+         methods = DEFAULT_METHODS
+
+     config = load_config(config_path)
+     eval_cfg = config.get("evaluation", {})
+     primary_metric = eval_cfg.get("primary_metric", "accuracy")
+     lower_is_better = eval_cfg.get("lower_is_better", False)
+     task_type = config.get("task", {}).get("type", "classification")
+
+     experiments = load_experiments(log_path)
+     candidates = select_top_k(experiments, primary_metric, top_k, lower_is_better)
+
+     if not candidates:
+         return {"error": f"No experiments with {primary_metric} found in {log_path}"}
+
+     if len(candidates) < 2:
+         return {"error": "Need at least 2 experiments for ensemble building"}
+
+     # Load predictions if available
+     predictions = _load_predictions(candidates, predictions_dir)
+     labels = _load_labels(predictions_dir)
+
+     # Diversity analysis
+     diversity = {}
+     if predictions:
+         corr_matrix = compute_prediction_correlation(predictions)
+         if len(corr_matrix) > 1:
+             upper = corr_matrix[np.triu_indices_from(corr_matrix, k=1)]
+             mean_corr = float(np.mean(upper))
+         else:
+             mean_corr = 0.0
+         diversity = {
+             "correlation_matrix": corr_matrix.tolist(),
+             "mean_correlation": mean_corr,
+             "model_ids": [e.get("experiment_id", "?") for e in candidates],
+         }
+
+     # Best single model baseline
+     best_single = candidates[0]
+     best_metric = best_single.get("metrics", {}).get(primary_metric, 0)
+
+     # Try each ensemble method
+     results = []
+     results.append({
+         "method": "best_single",
+         "metric_value": best_metric,
+         "delta": 0.0,
+         "experiment_id": best_single.get("experiment_id"),
+     })
+
+     if predictions and labels is not None:
+         weights = [
+             e.get("metrics", {}).get(primary_metric, 0)
+             for e in candidates[:len(predictions)]
+         ]
+
+         for method in methods:
+             result = _try_method(
+                 method, predictions, labels, weights, task_type, primary_metric,
+             )
+             if result:
+                 result["delta"] = round(result.get("metric_value", 0) - best_metric, 6)
+                 results.append(result)
+
+     # Find best ensemble
+     if lower_is_better:
+         best_result = min(results, key=lambda r: r.get("metric_value", float("inf")))
+     else:
+         best_result = max(results, key=lambda r: r.get("metric_value", float("-inf")))
+
+     report = {
+         "generated_at": datetime.now(timezone.utc).isoformat(),
+         "primary_metric": primary_metric,
+         "task_type": task_type,
+         "n_candidates": len(candidates),
+         "base_models": [
+             {
+                 "experiment_id": e.get("experiment_id"),
+                 "model_type": e.get("config", {}).get("model_type", "?"),
+                 primary_metric: e.get("metrics", {}).get(primary_metric),
+             }
+             for e in candidates
+         ],
+         "results": results,
+         "best_method": best_result.get("method"),
+         "best_metric": best_result.get("metric_value"),
+         "improvement": best_result.get("delta", 0),
+         "diversity": diversity,
+     }
+
+     return report
+
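+ # Programmatic usage sketch (paths are the signature defaults; the log and
+ # prediction files are assumed to exist):
+ #
+ #     report = build_ensemble(top_k=3, methods=["voting", "stacking"])
+ #     if "error" not in report:
+ #         print(report["best_method"], report["improvement"])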
+
+ def _try_method(
+     method: str,
+     predictions: list[np.ndarray],
+     labels: np.ndarray,
+     weights: list[float],
+     task_type: str,
+     primary_metric: str,
+ ) -> dict | None:
+     """Try a single ensemble method and return results, or None on failure."""
+     try:
+         if method == "voting":
+             preds = voting_ensemble(predictions, task_type)
+             eval_result = evaluate_ensemble(preds, labels, task_type)
+             metric_val = eval_result.get("accuracy", eval_result.get("rmse", 0))
+             return {"method": "voting", "metric_value": metric_val, "details": eval_result}
+
+         elif method == "weighted_voting":
+             preds = weighted_voting_ensemble(predictions, weights, task_type)
+             eval_result = evaluate_ensemble(preds, labels, task_type)
+             metric_val = eval_result.get("accuracy", eval_result.get("rmse", 0))
+             total = sum(weights) or 1.0  # Avoid division by zero
+             return {
+                 "method": "weighted_voting",
+                 "metric_value": metric_val,
+                 "details": eval_result,
+                 "weights": [round(w / total, 4) for w in weights],
+             }
+
+         elif method == "stacking":
+             result = stacking_ensemble(predictions, labels, task_type)
+             if len(result["meta_predictions"]) > 0:
+                 eval_result = evaluate_ensemble(result["meta_predictions"], labels, task_type)
+                 metric_val = eval_result.get("accuracy", eval_result.get("rmse", 0))
+                 return {
+                     "method": "stacking",
+                     "metric_value": metric_val,
+                     "details": eval_result,
+                     "meta_weights": result["meta_weights"],
+                 }
+
+         elif method == "blending":
+             result = blending_ensemble(predictions, labels, task_type)
+             if len(result["meta_predictions"]) > 0 and result.get("holdout_labels") is not None:
+                 eval_result = evaluate_ensemble(
+                     result["meta_predictions"], result["holdout_labels"], task_type,
+                 )
+                 metric_val = eval_result.get("accuracy", eval_result.get("rmse", 0))
+                 return {
+                     "method": "blending",
+                     "metric_value": metric_val,
+                     "details": eval_result,
+                     "holdout_size": result["holdout_size"],
+                 }
+
+     except Exception:
+         # Ensemble methods are best-effort; a failure in one should not
+         # abort the comparison.
+         pass
+
+     return None
+
+
+ def _load_predictions(
+     experiments: list[dict],
+     predictions_dir: str,
+ ) -> list[np.ndarray]:
+     """Load saved predictions for experiments.
+
+     Experiments without a saved prediction file are skipped, so the returned
+     list can be shorter than ``experiments``.
+     """
+     preds_path = Path(predictions_dir)
+     predictions = []
+
+     for exp in experiments:
+         exp_id = exp.get("experiment_id", "")
+         pred_file = preds_path / f"{exp_id}-predictions.npy"
+         if pred_file.exists():
+             predictions.append(np.load(pred_file))
+         else:
+             # Fall back to CSV if no .npy file was saved
+             csv_file = preds_path / f"{exp_id}-predictions.csv"
+             if csv_file.exists():
+                 predictions.append(np.loadtxt(csv_file, delimiter=","))
+
+     return predictions
+
+
+ def _load_labels(predictions_dir: str) -> np.ndarray | None:
+     """Load ground-truth labels, preferring .npy over .csv."""
+     preds_path = Path(predictions_dir)
+     labels_file = preds_path / "labels.npy"
+     if labels_file.exists():
+         return np.load(labels_file)
+     csv_file = preds_path / "labels.csv"
+     if csv_file.exists():
+         return np.loadtxt(csv_file, delimiter=",")
+     return None
+
+
+ # --- Report Formatting ---
+
+
+ def save_ensemble_report(report: dict, output_dir: str = "experiments/ensembles") -> Path:
+     """Save ensemble report to YAML."""
+     out_path = Path(output_dir)
+     out_path.mkdir(parents=True, exist_ok=True)
+
+     timestamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
+     filepath = out_path / f"ensemble-{timestamp}.yaml"
+
+     # Round-trip through JSON to convert numpy types for YAML serialization
+     clean = json.loads(json.dumps(report, default=str))
+     with open(filepath, "w") as f:
+         yaml.dump(clean, f, default_flow_style=False, sort_keys=False)
+
+     return filepath
+
+
+ def format_ensemble_report(report: dict) -> str:
+     """Format ensemble report as human-readable markdown."""
+     if "error" in report:
+         return f"ERROR: {report['error']}"
+
+     lines = [
+         "# Ensemble Results",
+         "",
+         f"*Generated {report.get('generated_at', 'N/A')[:19]}*",
+         "",
+         f"**Task type:** {report.get('task_type', '?')}",
+         f"**Primary metric:** {report.get('primary_metric', '?')}",
+         f"**Base models:** {report.get('n_candidates', 0)}",
+         "",
+     ]
+
+     # Base models
+     lines.extend(["## Base Models", ""])
+     base = report.get("base_models", [])
+     if base:
+         metric = report.get("primary_metric", "metric")
+         lines.append(f"| Experiment | Model Type | {metric} |")
+         lines.append("|------------|------------|--------|")
+         for m in base:
+             val = m.get(metric, "N/A")
+             val_str = f"{val:.4f}" if isinstance(val, float) else str(val)
+             lines.append(f"| {m.get('experiment_id', '?')} | {m.get('model_type', '?')} | {val_str} |")
+         lines.append("")
+
+     # Results table
+     results = report.get("results", [])
+     if results:
+         metric = report.get("primary_metric", "metric")
+         lines.extend(["## Ensemble Comparison", ""])
+         lines.append(f"| Method | {metric} | Delta vs Best Single |")
+         lines.append("|--------|--------|---------------------|")
+         best_method = report.get("best_method")
+         for r in results:
+             val = r.get("metric_value", "N/A")
+             val_str = f"{val:.4f}" if isinstance(val, (int, float)) else str(val)
+             delta = r.get("delta", 0)
+             delta_str = f"{delta:+.4f}" if isinstance(delta, (int, float)) else "—"
+             is_best = r.get("method") == best_method and r.get("method") != "best_single"
+             marker = " **BEST**" if is_best else ""
+             lines.append(f"| {r.get('method', '?')} | {val_str} | {delta_str}{marker} |")
+         lines.append("")
+
+     # Improvement summary
+     improvement = report.get("improvement", 0)
+     best = report.get("best_method", "best_single")
+     if best != "best_single" and improvement > 0:
+         lines.extend([
+             "## Summary",
+             "",
+             f"**Best ensemble ({best}) improves over best single model by {improvement:+.4f}**",
+             "",
+         ])
+     elif best == "best_single":
+         lines.extend([
+             "## Summary",
+             "",
+             "No ensemble method improved over the best single model.",
+             "Consider training more diverse models before ensembling.",
+             "",
+         ])
+
+     # Diversity
+     diversity = report.get("diversity", {})
+     if diversity.get("mean_correlation") is not None:
+         lines.extend([
+             "## Diversity Analysis",
+             "",
+             f"**Mean prediction correlation:** {diversity['mean_correlation']:.3f}",
+         ])
+         if diversity["mean_correlation"] > 0.9:
+             lines.append("*High correlation: the models are very similar, so more diversity would help.*")
+         elif diversity["mean_correlation"] < 0.5:
+             lines.append("*Good diversity: the models complement each other well.*")
+         lines.append("")
+
+     return "\n".join(lines)
+
+
+ def main() -> None:
+     """CLI entry point."""
+     parser = argparse.ArgumentParser(
+         description="Automated ensemble construction",
+     )
+     parser.add_argument(
+         "--top-k", type=int, default=DEFAULT_TOP_K,
+         help=f"Number of top models to include (default: {DEFAULT_TOP_K})",
+     )
+     parser.add_argument(
+         "--methods", default=",".join(DEFAULT_METHODS),
+         help=f"Comma-separated ensemble methods to try (default: {','.join(DEFAULT_METHODS)})",
+     )
+     parser.add_argument(
+         "--config", default="config.yaml",
+         help="Path to config.yaml",
+     )
+     parser.add_argument(
+         "--log", default=DEFAULT_LOG_PATH,
+         help="Path to experiment log",
+     )
+     parser.add_argument(
+         "--predictions-dir", default="experiments/predictions",
+         help="Directory containing saved predictions",
+     )
+     parser.add_argument(
+         "--json", action="store_true",
+         help="Output raw JSON instead of formatted report",
+     )
+     args = parser.parse_args()
+
+     methods = [m.strip() for m in args.methods.split(",")]
+     report = build_ensemble(
+         top_k=args.top_k,
+         methods=methods,
+         config_path=args.config,
+         log_path=args.log,
+         predictions_dir=args.predictions_dir,
+     )
+
+     if "error" not in report:
+         filepath = save_ensemble_report(report)
+         print(f"Saved to {filepath}", file=sys.stderr)
+
+     if args.json:
+         print(json.dumps(report, indent=2, default=str))
+     else:
+         print(format_ensemble_report(report))
+
+
+ if __name__ == "__main__":
+     main()