PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl - Mend

wisent 0.7.379__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1720) hide show

wisent/core/cli/optimize_steering.py ADDED Viewed

@@ -0,0 +1,3421 @@
+"""Steering optimization command execution logic with full strategy optimization.
+Results are persisted to ~/.wisent/configs/ via WisentConfigManager
+so they can be automatically loaded on subsequent runs.
+Supports two search strategies:
+- grid: Exhaustive search over all configurations (thorough but slow)
+- optuna: TPE sampling with early stopping (fast but may miss optimal)
+"""
+import json
+import sys
+import time
+import numpy as np
+from wisent.core.evaluators.rotator import EvaluatorRotator
+from wisent.core.models.inference_config import get_generate_kwargs
+from wisent.core.config_manager import (
+    get_config_manager,
+    save_steering_config,
+    get_steering_config,
+    get_cached_optimization,
+    store_optimization,
+    SteeringConfig,
+)
+def _run_optuna_search_for_task(
+    model,
+    train_pairs,
+    test_pairs,
+    evaluator,
+    task_name,
+    search_space,
+    args,
+    baseline_results=None,
+):
+    """
+    Run Optuna-based hyperparameter search for a single task.
+    Returns:
+        dict: Best configuration found with score and parameters
+    """
+    import optuna
+    from optuna.samplers import TPESampler
+    from optuna.pruners import MedianPruner
+    from wisent.core.activations.activations_collector import ActivationCollector
+    from wisent.core.activations.core.atoms import ActivationAggregationStrategy
+    from wisent.core.models.core.atoms import SteeringPlan
+    from wisent.core.cli.steering_method_trainer import create_steering_method
+    n_trials = getattr(args, 'n_trials', 50)
+    n_startup_trials = getattr(args, 'n_startup_trials', 10)
+    # Maps for converting string values to enums
+    token_agg_map = {
+        "last_token": ActivationAggregationStrategy.LAST_TOKEN,
+        "mean_pooling": ActivationAggregationStrategy.MEAN_POOLING,
+        "first_token": ActivationAggregationStrategy.FIRST_TOKEN,
+        "max_pooling": ActivationAggregationStrategy.MAX_POOLING,
+    }
+    def objective(trial):
+        """Optuna objective function for steering optimization."""
+        # Sample hyperparameters
+        layer = trial.suggest_int("layer", min(search_space.layers), max(search_space.layers))
+        strength = trial.suggest_float("strength", min(search_space.strengths), max(search_space.strengths), log=True)
+        strategy = trial.suggest_categorical("strategy", search_space.strategies)
+        token_agg_name = trial.suggest_categorical("token_aggregation", search_space.token_aggregations)
+        token_agg = token_agg_map.get(token_agg_name, ActivationAggregationStrategy.LAST_TOKEN)
+        layer_str = str(layer)
+        try:
+            # Collect activations
+            collector = ActivationCollector(model=model, store_device="cpu")
+            pos_acts = []
+            neg_acts = []
+            for pair in train_pairs.pairs:
+                updated_pair = collector.collect_for_pair(
+                    pair,
+                    layers=[layer_str],
+                    aggregation=token_agg,
+                    return_full_sequence=False,
+                    normalize_layers=False,
+                )
+                if (updated_pair.positive_response.layers_activations
+                    and layer_str in updated_pair.positive_response.layers_activations):
+                    act = updated_pair.positive_response.layers_activations[layer_str]
+                    if act is not None:
+                        pos_acts.append(act)
+                if (updated_pair.negative_response.layers_activations
+                    and layer_str in updated_pair.negative_response.layers_activations):
+                    act = updated_pair.negative_response.layers_activations[layer_str]
+                    if act is not None:
+                        neg_acts.append(act)
+            if len(pos_acts) == 0 or len(neg_acts) == 0:
+                return 0.0
+            # Train steering vector
+            method_name = args.methods[0] if args.methods else "CAA"
+            steering_method = create_steering_method(method_name, args)
+            import torch
+            pos_tensor = torch.stack(pos_acts).mean(dim=0)
+            neg_tensor = torch.stack(neg_acts).mean(dim=0)
+            steering_vector = steering_method.train_for_layer(pos_tensor, neg_tensor)
+            # Create steering plan
+            steering_plan = SteeringPlan.from_raw(
+                raw={layer_str: steering_vector},
+                scale=strength,
+                normalize=False
+            )
+            # Evaluate on test set
+            correct = 0
+            total = 0
+            for pair in test_pairs.pairs:
+                try:
+                    choices = [pair.negative_response.model_response, pair.positive_response.model_response]
+                    expected = pair.positive_response.model_response
+                    test_code = pair.metadata.get("test_code") if pair.metadata else None
+                    eval_result = evaluator.evaluate(
+                        response="",
+                        expected=expected,
+                        model=model,
+                        question=pair.prompt,
+                        choices=choices,
+                        steering_plan=steering_plan,
+                        test_code=test_code,
+                        task_name=task_name,
+                    )
+                    if eval_result.ground_truth == "TRUTHFUL":
+                        correct += 1
+                    total += 1
+                except Exception:
+                    total += 1
+            accuracy = correct / total if total > 0 else 0.0
+            return accuracy
+        except Exception as e:
+            print(f"      Trial {trial.number} failed: {e}")
+            return 0.0
+    # Create and run study
+    sampler = TPESampler(seed=42, n_startup_trials=n_startup_trials)
+    pruner = MedianPruner(n_startup_trials=5, n_warmup_steps=3)
+    study = optuna.create_study(
+        direction="maximize",
+        sampler=sampler,
+        pruner=pruner,
+    )
+    print(f"  🔍 Running Optuna optimization ({n_trials} trials)...")
+    # Suppress Optuna logs for cleaner output
+    optuna.logging.set_verbosity(optuna.logging.WARNING)
+    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
+    best_trial = study.best_trial
+    return {
+        "best_score": best_trial.value,
+        "best_layer": best_trial.params["layer"],
+        "best_strength": best_trial.params["strength"],
+        "best_strategy": best_trial.params["strategy"],
+        "best_token_aggregation": best_trial.params["token_aggregation"],
+        "n_trials": len(study.trials),
+        "search_strategy": "optuna",
+    }
+def execute_optimize_steering(args):
+    """
+    Execute the optimize-steering command.
+    Supports multiple subcommands:
+    - comprehensive: Run comprehensive steering optimization
+    - compare-methods: Compare different steering methods
+    - optimize-layer: Find optimal steering layer
+    - optimize-strength: Find optimal steering strength
+    - auto: Automatically optimize based on classification config
+    """
+    from wisent.core.data_loaders.loaders.lm_loader import LMEvalDataLoader
+    from wisent.core.models.wisent_model import WisentModel
+    # Check which subcommand was called
+    if not hasattr(args, "steering_action") or args.steering_action is None:
+        print("\n✗ No steering optimization action specified")
+        print("Available actions: comprehensive, compare-methods, optimize-layer, optimize-strength, auto")
+        sys.exit(1)
+    print(f"\n{'=' * 80}")
+    print(f"🎯 STEERING PARAMETER OPTIMIZATION: {args.steering_action.upper()}")
+    print(f"{'=' * 80}")
+    print(f"   Model: {args.model}")
+    print(f"   Device: {args.device or 'auto'}")
+    print(f"{'=' * 80}\n")
+    # Load model
+    print("📦 Loading model...")
+    model = WisentModel(args.model, device=args.device)
+    print(f"   ✓ Model loaded with {model.num_layers} layers\n")
+    # Initialize data loader
+    loader = LMEvalDataLoader()
+    # Execute based on subcommand and return results
+    if args.steering_action == "comprehensive":
+        return execute_comprehensive(args, model, loader)
+    if args.steering_action == "compare-methods":
+        return execute_compare_methods(args, model, loader)
+    if args.steering_action == "optimize-layer":
+        return execute_optimize_layer(args, model, loader)
+    if args.steering_action == "optimize-strength":
+        return execute_optimize_strength(args, model, loader)
+    if args.steering_action == "auto":
+        return execute_auto(args, model, loader)
+    if args.steering_action == "personalization":
+        return execute_personalization(args, model)
+    if args.steering_action == "multi-personalization":
+        return execute_multi_personalization(args, model)
+    print(f"\n✗ Unknown steering action: {args.steering_action}")
+    sys.exit(1)
+def execute_comprehensive(args, model, loader):
+    """Execute comprehensive steering optimization with generation-based evaluation."""
+    import torch
+    from wisent.core.activations.activations_collector import ActivationCollector
+    from wisent.core.activations.core.atoms import ActivationAggregationStrategy
+    from wisent.core.activations.prompt_construction_strategy import PromptConstructionStrategy
+    from wisent.core.models.core.atoms import SteeringPlan
+    from wisent.core.cli.steering_method_trainer import create_steering_method
+    from wisent.core.cli.steering_search_space import (
+        get_search_space_from_args,
+        print_search_space_summary,
+        CAASearchSpace,
+        PRISMSearchSpace,
+        PULSESearchSpace,
+        TITANSearchSpace,
+    )
+    print("🔍 Running comprehensive steering optimization...")
+    print("   Optimizing: Method-specific search space (layer, strength, strategy, + method params)")
+    # Determine tasks to optimize
+    if args.tasks:
+        task_list = args.tasks
+    else:
+        task_list = ["arc_easy", "hellaswag", "winogrande", "gsm8k"]
+    # Check for cached results if --use-cached is specified
+    use_cached = getattr(args, "use_cached", False)
+    save_as_default = getattr(args, "save_as_default", False)
+    if use_cached:
+        print("\n📦 Checking optimization cache...")
+        cached_results = {}
+        tasks_to_run = []
+        for task_name in task_list:
+            for method in args.methods:
+                cached = get_cached_optimization(args.model, task_name, method)
+                if cached:
+                    print(
+                        f"   ✓ Found cached result for {task_name}/{method}: layer={cached.layer}, strength={cached.strength}, score={cached.score:.3f}"
+                    )
+                    cached_results[f"{task_name}::{method}"] = cached
+                else:
+                    if task_name not in tasks_to_run:
+                        tasks_to_run.append(task_name)
+        if cached_results and not tasks_to_run:
+            print("\n✅ All tasks have cached results. Returning cached configurations.")
+            # Convert cached results to the expected return format
+            all_results = {}
+            for key, cached in cached_results.items():
+                task_name, method = key.split("::")
+                if task_name not in all_results:
+                    all_results[task_name] = {}
+                all_results[task_name][method] = {
+                    "best_layer": cached.layer,
+                    "best_strength": cached.strength,
+                    "best_score": cached.score,
+                    "token_aggregation": cached.token_aggregation,
+                    "prompt_strategy": cached.prompt_strategy,
+                    "from_cache": True,
+                }
+            return all_results
+        if tasks_to_run:
+            print(f"   Tasks needing optimization: {', '.join(tasks_to_run)}")
+            task_list = tasks_to_run
+    print(f"   Tasks: {', '.join(task_list)}")
+    print(f"   Methods: {', '.join(args.methods)}")
+    print(f"   Limit: {args.limit} samples per task")
+    quick_search = getattr(args, 'quick_search', False)
+    print(f"   Quick search: {quick_search}")
+    # Search strategy
+    search_strategy = getattr(args, 'search_strategy', 'grid')
+    n_trials = getattr(args, 'n_trials', 50)
+    print(f"   Search strategy: {search_strategy}" + (f" ({n_trials} trials)" if search_strategy == "optuna" else ""))
+    print("   Time limit: DISABLED (no time limit)\n")
+    all_results = {}
+    # Get search spaces for each method and print summary
+    method_search_spaces = {}
+    total_all_methods = 0
+    for method_name in args.methods:
+        search_space = get_search_space_from_args(method_name, args, model.num_layers)
+        method_search_spaces[method_name] = search_space
+        print_search_space_summary(search_space, method_name)
+        total_all_methods += search_space.get_total_configs()
+    print(f"\n   Total configurations across all methods: {total_all_methods:,}\n")
+    # For backward compatibility, also set up the legacy variables
+    # These are used by some code paths that haven't been fully migrated
+    first_method = args.methods[0] if args.methods else "CAA"
+    first_space = method_search_spaces.get(first_method)
+    if isinstance(first_space, (CAASearchSpace, PRISMSearchSpace)):
+        layers_to_test = first_space.layers
+    else:
+        # PULSE/TITAN don't use direct layers, compute defaults
+        layers_to_test = list(range(model.num_layers // 2, model.num_layers - 2, 2))
+    strengths_to_test = first_space.strengths if first_space else [0.5, 1.0, 1.5, 2.0]
+    strategies_to_test = first_space.strategies if first_space else ["constant", "initial_only", "diminishing"]
+    # Convert string token aggregations to enum
+    token_agg_map = {
+        "last_token": ActivationAggregationStrategy.LAST_TOKEN,
+        "mean_pooling": ActivationAggregationStrategy.MEAN_POOLING,
+        "first_token": ActivationAggregationStrategy.FIRST_TOKEN,
+        "max_pooling": ActivationAggregationStrategy.MAX_POOLING,
+        "choice_token": ActivationAggregationStrategy.CHOICE_TOKEN,
+        "continuation_token": ActivationAggregationStrategy.CONTINUATION_TOKEN,
+    }
+    token_aggregations_to_test = [
+        token_agg_map.get(t, ActivationAggregationStrategy.LAST_TOKEN)
+        for t in (first_space.token_aggregations if first_space else ["last_token", "mean_pooling"])
+    ]
+    # Convert string prompt constructions to enum
+    prompt_const_map = {
+        "chat_template": PromptConstructionStrategy.CHAT_TEMPLATE,
+        "direct_completion": PromptConstructionStrategy.DIRECT_COMPLETION,
+        "multiple_choice": PromptConstructionStrategy.MULTIPLE_CHOICE,
+        "role_playing": PromptConstructionStrategy.ROLE_PLAYING,
+        "instruction_following": PromptConstructionStrategy.INSTRUCTION_FOLLOWING,
+    }
+    prompt_constructions_to_test = [
+        prompt_const_map.get(p, PromptConstructionStrategy.CHAT_TEMPLATE)
+        for p in (first_space.prompt_constructions if first_space else ["chat_template", "direct_completion"])
+    ]
+    # For legacy code paths
+    total_configs = first_space.get_total_configs() if first_space else 100
+    for task_idx, task_name in enumerate(task_list, 1):
+        print(f"\n{'=' * 80}")
+        print(f"Task {task_idx}/{len(task_list)}: {task_name}")
+        print(f"{'=' * 80}")
+        task_start_time = time.time()
+        try:
+            # Load task data
+            print("  📊 Loading task data...")
+            result = loader._load_one_task(
+                task_name=task_name, split_ratio=0.8, seed=42, limit=args.limit, training_limit=None, testing_limit=None
+            )
+            train_pairs = result["train_qa_pairs"]
+            test_pairs = result["test_qa_pairs"]
+            print(f"      ✓ Loaded {len(train_pairs.pairs)} train, {len(test_pairs.pairs)} test pairs")
+            # Initialize evaluator for this task (auto-select based on task_name)
+            EvaluatorRotator.discover_evaluators("wisent.core.evaluators.benchmark_specific")
+            evaluator = EvaluatorRotator(evaluator=None, task_name=task_name)  # None = auto-select
+            print(f"      ✓ Using evaluator: {evaluator._plugin.name} (auto-selected for {task_name})")
+            # Compute baseline (unsteered) results if requested
+            baseline_results = {}
+            if hasattr(args, "compute_baseline") and args.compute_baseline:
+                print("\n  📊 Computing BASELINE (unsteered) accuracy...")
+                baseline_scores = []
+                baseline_per_problem = []
+                for pair_idx, pair in enumerate(test_pairs.pairs):
+                    try:
+                        # Prepare choices for multiple choice evaluation
+                        choices = [pair.negative_response.model_response, pair.positive_response.model_response]
+                        expected = pair.positive_response.model_response
+                        # Evaluate WITHOUT steering
+                        test_code = pair.metadata.get("test_code") if pair.metadata else None
+                        eval_result = evaluator.evaluate(
+                            response="",
+                            expected=expected,
+                            model=model,
+                            question=pair.prompt,
+                            choices=choices,
+                            steering_plan=None,  # No steering for baseline
+                            test_code=test_code,
+                            task_name=task_name,
+                        )
+                        is_correct = eval_result.ground_truth == "TRUTHFUL"
+                        baseline_scores.append(1.0 if is_correct else 0.0)
+                        # Store per-problem baseline result with details
+                        baseline_per_problem.append(
+                            {
+                                "pair_index": pair_idx,
+                                "prompt": pair.prompt,
+                                "expected": expected,
+                                "baseline_correct": is_correct,
+                                "ground_truth": eval_result.ground_truth,
+                                "method_used": eval_result.method_used,
+                                "confidence": eval_result.confidence,
+                            }
+                        )
+                        if (pair_idx + 1) % 10 == 0:
+                            print(
+                                f"      Evaluated {pair_idx + 1}/{len(test_pairs.pairs)} baseline samples...", end="\r"
+                            )
+                    except Exception as e:
+                        print(f"\n❌ Baseline evaluation failed for pair {pair_idx}:")
+                        print(f"   Error: {e}")
+                        raise
+                baseline_accuracy = np.mean(baseline_scores) if baseline_scores else 0.0
+                print(
+                    f"\n      ✓ Baseline accuracy: {baseline_accuracy:.3f} ({sum(baseline_scores):.0f}/{len(baseline_scores)} correct)"
+                )
+                baseline_results = {
+                    "accuracy": baseline_accuracy,
+                    "per_problem": baseline_per_problem,
+                    "num_correct": int(sum(baseline_scores)),
+                    "num_total": len(baseline_scores),
+                }
+            # Dispatch based on search strategy
+            if search_strategy == "optuna":
+                # Use Optuna-based search
+                optuna_result = _run_optuna_search_for_task(
+                    model=model,
+                    train_pairs=train_pairs,
+                    test_pairs=test_pairs,
+                    evaluator=evaluator,
+                    task_name=task_name,
+                    search_space=first_space,
+                    args=args,
+                    baseline_results=baseline_results,
+                )
+                best_score = optuna_result["best_score"]
+                best_config = {
+                    "layer": optuna_result["best_layer"],
+                    "strength": optuna_result["best_strength"],
+                    "strategy": optuna_result["best_strategy"],
+                    "token_aggregation": optuna_result["best_token_aggregation"],
+                }
+                print(f"      Best: layer={best_config['layer']}, strength={best_config['strength']:.2f}, "
+                      f"strategy={best_config['strategy']}, token_agg={best_config['token_aggregation']}")
+                print(f"      Score: {best_score:.4f} (from {optuna_result['n_trials']} trials)")
+                # Store results in format compatible with grid search
+                method_results = {
+                    first_method: {
+                        "best_score": best_score,
+                        "best_layer": best_config["layer"],
+                        "best_strength": best_config["strength"],
+                        "best_strategy": best_config["strategy"],
+                        "token_aggregation": best_config["token_aggregation"],
+                        "search_strategy": "optuna",
+                    }
+                }
+                # Skip the grid search loop - jump to result saving
+                all_results[task_name] = method_results
+                if not args.no_save:
+                    save_steering_config(
+                        model_name=args.model,
+                        task=task_name,
+                        layer=best_config["layer"],
+                        strength=best_config["strength"],
+                        method=first_method,
+                        strategy=best_config["strategy"],
+                        token_aggregation=best_config["token_aggregation"],
+                    )
+                    store_optimization(
+                        model=args.model,
+                        task=task_name,
+                        layer=best_config["layer"],
+                        strength=best_config["strength"],
+                        method=first_method,
+                        strategy=best_config["strategy"],
+                        score=best_score,
+                        metric="accuracy",
+                    )
+                continue  # Skip to next task
+            # Grid search (original behavior)
+            print(
+                "\n  🔍 Testing CAA method across layers, strengths, strategies, token aggregations, prompt constructions..."
+            )
+            print(f"      Total configurations: {total_configs}")
+            best_score = 0
+            best_config = None
+            method_results = {}
+            configs_tested = 0
+            all_generation_examples = []  # Store generation examples for all configs
+            # Prepare test prompts if generating examples for all configs
+            if args.save_all_generation_examples or args.save_generation_examples:
+                num_examples = min(args.num_generation_examples, len(test_pairs.pairs))
+                example_pairs = test_pairs.pairs[:num_examples]
+                print(f"  📝 Will generate {num_examples} example responses per configuration")
+            for layer in layers_to_test:
+                for strength in strengths_to_test:
+                    for strategy in strategies_to_test:
+                        for token_agg in token_aggregations_to_test:
+                            for prompt_const in prompt_constructions_to_test:
+                                # Time limit disabled - run all configurations
+                                try:
+                                    configs_tested += 1
+                                    layer_str = str(layer)
+                                    # Step 1: Generate steering vector using CAA with current token aggregation
+                                    collector = ActivationCollector(model=model, store_device="cpu")
+                                    pos_acts = []
+                                    neg_acts = []
+                                    for pair in train_pairs.pairs:
+                                        updated_pair = collector.collect_for_pair(
+                                            pair,
+                                            layers=[layer_str],
+                                            aggregation=token_agg,  # Use current token aggregation strategy
+                                            return_full_sequence=False,
+                                            normalize_layers=False,
+                                        )
+                                        if (
+                                            updated_pair.positive_response.layers_activations
+                                            and layer_str in updated_pair.positive_response.layers_activations
+                                        ):
+                                            act = updated_pair.positive_response.layers_activations[layer_str]
+                                            if act is not None:
+                                                pos_acts.append(act)
+                                        if (
+                                            updated_pair.negative_response.layers_activations
+                                            and layer_str in updated_pair.negative_response.layers_activations
+                                        ):
+                                            act = updated_pair.negative_response.layers_activations[layer_str]
+                                            if act is not None:
+                                                neg_acts.append(act)
+                                    if len(pos_acts) == 0 or len(neg_acts) == 0:
+                                        continue
+                                    # Create CAA steering vector
+                                    # Use the selected method (first from args.methods or default to CAA)
+                                    method_name = args.methods[0] if args.methods else "CAA"
+                                    steering_method = create_steering_method(method_name, args)
+                                    steering_vector = steering_method.train_for_layer(pos_acts, neg_acts)
+                                    # Step 2: Evaluate with ACTUAL GENERATION and task evaluator
+                                    # Create steering plan
+                                    from wisent.core.models.core.atoms import SteeringPlan, SteeringVector
+                                    steering_vec = SteeringVector(vector=steering_vector, scale=strength)
+                                    steering_plan = SteeringPlan(
+                                        layers={layer_str: steering_vec},
+                                        layers_description=[
+                                            f"CAA L{layer} S{strength} {strategy} T:{token_agg.value} P:{prompt_const.value}"
+                                        ],
+                                    )
+                                    # Apply steering to model
+                                    model.apply_steering(steering_plan)
+                                    test_scores = []
+                                    detailed_results = []  # Store full evaluation details
+                                    delta_tracking = []  # Track improved/regressed/unchanged per problem
+                                    for pair_idx, pair in enumerate(test_pairs.pairs):
+                                        try:
+                                            # Prepare choices for multiple choice evaluation
+                                            # ContrastivePair uses: prompt, positive_response.model_response, negative_response.model_response
+                                            choices = [
+                                                pair.negative_response.model_response,
+                                                pair.positive_response.model_response,
+                                            ]
+                                            expected = pair.positive_response.model_response
+                                            # Use the Wisent evaluator to check correctness
+                                            # The evaluator will use log likelihood if possible,
+                                            # otherwise fall back to generation
+                                            # Pass test_code from metadata for coding tasks
+                                            test_code = pair.metadata.get("test_code") if pair.metadata else None
+                                            eval_result = evaluator.evaluate(
+                                                response="",  # Not used for log likelihood eval
+                                                expected=expected,
+                                                model=model,
+                                                question=pair.prompt,
+                                                choices=choices,
+                                                steering_plan=steering_plan,
+                                                test_code=test_code,
+                                                task_name=task_name,
+                                            )
+                                            # Convert TRUTHFUL/UNTRUTHFUL to 1.0/0.0
+                                            is_correct = eval_result.ground_truth == "TRUTHFUL"
+                                            test_scores.append(1.0 if is_correct else 0.0)
+                                            # Save full evaluation details
+                                            detailed_results.append(
+                                                {
+                                                    "prompt": pair.prompt,
+                                                    "choices": choices,
+                                                    "expected": expected,
+                                                    "ground_truth": eval_result.ground_truth,
+                                                    "method_used": eval_result.method_used,
+                                                    "confidence": eval_result.confidence,
+                                                    "details": eval_result.details,
+                                                    "meta": dict(eval_result.meta) if eval_result.meta else {},
+                                                    "is_correct": is_correct,
+                                                }
+                                            )
+                                            # Track delta if baseline was computed
+                                            if baseline_results and "per_problem" in baseline_results:
+                                                baseline_correct = baseline_results["per_problem"][pair_idx][
+                                                    "baseline_correct"
+                                                ]
+                                                if not baseline_correct and is_correct:
+                                                    delta_status = "improved"
+                                                elif baseline_correct and not is_correct:
+                                                    delta_status = "regressed"
+                                                else:
+                                                    delta_status = "unchanged"
+                                                delta_tracking.append(
+                                                    {
+                                                        "pair_index": pair_idx,
+                                                        "prompt": pair.prompt,
+                                                        "expected": expected,
+                                                        "baseline_correct": baseline_correct,
+                                                        "steered_correct": is_correct,
+                                                        "delta_status": delta_status,
+                                                    }
+                                                )
+                                        except Exception as e:
+                                            # NO FALLBACK - raise the error immediately
+                                            print("\n❌ Evaluation failed for test pair:")
+                                            print(f"   Prompt: {pair.prompt[:100]}")
+                                            print(f"   Error: {e}")
+                                            raise
+                                    # Clear steering
+                                    model.clear_steering()
+                                    if len(test_scores) > 0:
+                                        avg_score = np.mean(test_scores)
+                                        # Generate examples for this configuration if requested
+                                        if args.save_all_generation_examples:
+                                            config_examples = []
+                                            # Get inference config settings
+                                            for idx, pair in enumerate(example_pairs):
+                                                prompt = pair.prompt
+                                                try:
+                                                    # Generate without steering (only once per prompt, reuse if already generated)
+                                                    unsteered_response = model.generate(
+                                                        [[{"role": "user", "content": prompt}]],
+                                                        **get_generate_kwargs(max_new_tokens=100),
+                                                        use_steering=False,
+                                                    )[0]
+                                                    # Create steering plan for this config
+                                                    from wisent.core.models.core.atoms import (
+                                                        SteeringPlan,
+                                                        SteeringVector,
+                                                    )
+                                                    steering_vec = SteeringVector(
+                                                        vector=steering_vector, scale=strength
+                                                    )
+                                                    steering_plan = SteeringPlan(
+                                                        layers={layer_str: steering_vec},
+                                                        layers_description=[
+                                                            f"CAA steering layer={layer}, strength={strength}, strategy={strategy}"
+                                                        ],
+                                                    )
+                                                    # Generate with steering
+                                                    model.apply_steering(steering_plan)
+                                                    steered_response = model.generate(
+                                                        [[{"role": "user", "content": prompt}]],
+                                                        **get_generate_kwargs(max_new_tokens=100),
+                                                        use_steering=True,
+                                                        steering_plan=steering_plan,
+                                                    )[0]
+                                                    model.clear_steering()
+                                                    config_examples.append(
+                                                        {
+                                                            "prompt": prompt,
+                                                            "correct_answer": pair.positive_response.model_response,
+                                                            "incorrect_answer": pair.negative_response.model_response,
+                                                            "unsteered_generation": unsteered_response,
+                                                            "steered_generation": steered_response,
+                                                        }
+                                                    )
+                                                except Exception as e:
+                                                    if args.verbose:
+                                                        print(
+                                                            f"      ⚠️ Failed to generate example for config layer={layer}, strength={strength}, strategy={strategy}: {e}"
+                                                        )
+                                            # Store this config's examples
+                                            all_generation_examples.append(
+                                                {
+                                                    "layer": layer,
+                                                    "strength": strength,
+                                                    "strategy": strategy,
+                                                    "accuracy": avg_score,
+                                                    "examples": config_examples,
+                                                }
+                                            )
+                                        # Compute delta summary if baseline was computed
+                                        delta_summary = {}
+                                        if delta_tracking:
+                                            improved = sum(1 for d in delta_tracking if d["delta_status"] == "improved")
+                                            regressed = sum(
+                                                1 for d in delta_tracking if d["delta_status"] == "regressed"
+                                            )
+                                            unchanged = sum(
+                                                1 for d in delta_tracking if d["delta_status"] == "unchanged"
+                                            )
+                                            delta_summary = {
+                                                "improved": improved,
+                                                "regressed": regressed,
+                                                "unchanged": unchanged,
+                                                "net_change": improved - regressed,
+                                            }
+                                        # Store detailed results for this configuration
+                                        config_key = (
+                                            f"L{layer}_S{strength}_{strategy}_{token_agg.value}_{prompt_const.value}"
+                                        )
+                                        method_results[config_key] = {
+                                            "layer": layer,
+                                            "strength": strength,
+                                            "strategy": strategy,
+                                            "token_aggregation": token_agg.value,
+                                            "prompt_construction": prompt_const.value,
+                                            "accuracy": avg_score,
+                                            "num_test_samples": len(test_scores),
+                                            "detailed_results": detailed_results,  # Save all eval details
+                                            "delta_tracking": delta_tracking if delta_tracking else None,
+                                            "delta_summary": delta_summary if delta_summary else None,
+                                        }
+                                        if avg_score > best_score:
+                                            best_score = avg_score
+                                            best_config = {
+                                                "layer": layer,
+                                                "strength": strength,
+                                                "strategy": strategy,
+                                                "token_aggregation": token_agg.value,
+                                                "prompt_construction": prompt_const.value,
+                                                "accuracy": avg_score,
+                                            }
+                                    if configs_tested % 10 == 0 and args.verbose:
+                                        print(f"      Tested {configs_tested} configurations...", end="\r")
+                                except Exception as e:
+                                    # NO FALLBACK - raise the error immediately
+                                    print("\n❌ Configuration test failed:")
+                                    print(f"   Layer: {layer}")
+                                    print(f"   Strength: {strength}")
+                                    print(f"   Strategy: {strategy}")
+                                    print(f"   Error: {e}")
+                                    raise
+            if best_config:
+                print("\n  ✅ Best configuration found:")
+                print("      Method: CAA")
+                print(f"      Layer: {best_config['layer']}")
+                print(f"      Strength: {best_config['strength']}")
+                print(f"      Strategy: {best_config['strategy']} ⭐")
+                print(f"      Token Aggregation: {best_config['token_aggregation']}")
+                print(f"      Prompt Construction: {best_config['prompt_construction']}")
+                print(f"      Accuracy: {best_config['accuracy']:.3f}")
+                method_results["CAA"] = {
+                    "optimal_layer": best_config["layer"],
+                    "optimal_strength": best_config["strength"],
+                    "optimal_strategy": best_config["strategy"],
+                    "optimal_token_aggregation": best_config["token_aggregation"],
+                    "optimal_prompt_construction": best_config["prompt_construction"],
+                    "accuracy": best_config["accuracy"],
+                    "f1": best_config["accuracy"],
+                }
+                # Save baseline comparison results if computed
+                if hasattr(args, "compute_baseline") and args.compute_baseline and baseline_results:
+                    import os
+                    baseline_dir = (
+                        args.baseline_output_dir if hasattr(args, "baseline_output_dir") else "./baseline_comparison"
+                    )
+                    os.makedirs(baseline_dir, exist_ok=True)
+                    # Get delta tracking for best config
+                    best_config_key = f"L{best_config['layer']}_S{best_config['strength']}_{best_config['strategy']}_{best_config['token_aggregation']}_{best_config['prompt_construction']}"
+                    best_config_results = method_results.get(best_config_key, {})
+                    best_delta_tracking = best_config_results.get("delta_tracking", [])
+                    best_delta_summary = best_config_results.get("delta_summary", {})
+                    # Separate improved, regressed, unchanged for inspection
+                    improved_examples = [d for d in best_delta_tracking if d.get("delta_status") == "improved"]
+                    regressed_examples = [d for d in best_delta_tracking if d.get("delta_status") == "regressed"]
+                    unchanged_examples = [d for d in best_delta_tracking if d.get("delta_status") == "unchanged"]
+                    baseline_comparison_data = {
+                        "task": task_name,
+                        "model": args.model,
+                        "baseline_accuracy": baseline_results["accuracy"],
+                        "best_steered_accuracy": best_config["accuracy"],
+                        "delta": best_config["accuracy"] - baseline_results["accuracy"],
+                        "best_config": best_config,
+                        "summary": best_delta_summary,
+                        "improved_examples": improved_examples,
+                        "regressed_examples": regressed_examples,
+                        "unchanged_examples": unchanged_examples,
+                        "baseline_per_problem": baseline_results["per_problem"],
+                    }
+                    comparison_path = os.path.join(baseline_dir, f"{task_name}_baseline_comparison.json")
+                    with open(comparison_path, "w") as f:
+                        json.dump(baseline_comparison_data, f, indent=2)
+                    print("\n  📊 Baseline Comparison Summary:")
+                    print(f"      Baseline (unsteered) accuracy: {baseline_results['accuracy']:.3f}")
+                    print(f"      Best steered accuracy: {best_config['accuracy']:.3f}")
+                    print(f"      Delta: {(best_config['accuracy'] - baseline_results['accuracy']) * 100:+.1f}%")
+                    if best_delta_summary:
+                        print(f"      Improved: {best_delta_summary.get('improved', 0)} problems")
+                        print(f"      Regressed: {best_delta_summary.get('regressed', 0)} problems")
+                        print(f"      Unchanged: {best_delta_summary.get('unchanged', 0)} problems")
+                        print(f"      Net change: {best_delta_summary.get('net_change', 0)} problems")
+                    print(f"      💾 Saved comparison to: {comparison_path}")
+                # Save best steering vector if requested
+                if args.save_best_vector:
+                    import os
+                    vector_dir = args.save_best_vector
+                    os.makedirs(vector_dir, exist_ok=True)
+                    # Recreate the best steering vector with optimal token aggregation
+                    best_layer_str = str(best_config["layer"])
+                    best_token_agg = ActivationAggregationStrategy(best_config["token_aggregation"])
+                    pos_acts_best = []
+                    neg_acts_best = []
+                    for pair in train_pairs.pairs:
+                        updated_pair = collector.collect_for_pair(
+                            pair,
+                            layers=[best_layer_str],
+                            aggregation=best_token_agg,  # Use optimal token aggregation
+                            return_full_sequence=False,
+                            normalize_layers=False,
+                        )
+                        if (
+                            updated_pair.positive_response.layers_activations
+                            and best_layer_str in updated_pair.positive_response.layers_activations
+                        ):
+                            act = updated_pair.positive_response.layers_activations[best_layer_str]
+                            if act is not None:
+                                pos_acts_best.append(act)
+                        if (
+                            updated_pair.negative_response.layers_activations
+                            and best_layer_str in updated_pair.negative_response.layers_activations
+                        ):
+                            act = updated_pair.negative_response.layers_activations[best_layer_str]
+                            if act is not None:
+                                neg_acts_best.append(act)
+                    # Create and save steering vector
+                    method_name = args.methods[0] if args.methods else "CAA"
+                    steering_method = create_steering_method(method_name, args)
+                    best_steering_vector = steering_method.train_for_layer(pos_acts_best, neg_acts_best)
+                    vector_path = os.path.join(vector_dir, f"{task_name}_layer{best_config['layer']}.pt")
+                    torch.save(
+                        {
+                            "steering_vector": best_steering_vector,
+                            "vector": best_steering_vector,  # Legacy key
+                            "layer": best_config["layer"],
+                            "layer_index": best_config["layer"],  # Legacy key
+                            "strength": best_config["strength"],
+                            "strategy": best_config["strategy"],
+                            "token_aggregation": best_config["token_aggregation"],
+                            "prompt_construction": best_config["prompt_construction"],
+                            "method": "CAA",
+                            "task": task_name,
+                            "model": args.model,
+                            "accuracy": best_config["accuracy"],
+                        },
+                        vector_path,
+                    )
+                    print(f"      💾 Saved steering vector to: {vector_path}")
+                # Save generation examples
+                if args.save_all_generation_examples:
+                    # Save examples for ALL configurations
+                    examples_path = os.path.join(
+                        args.save_best_vector if args.save_best_vector else "./optimization_results",
+                        f"{task_name}_all_generation_examples.json",
+                    )
+                    os.makedirs(os.path.dirname(examples_path), exist_ok=True)
+                    with open(examples_path, "w") as f:
+                        json.dump(
+                            {
+                                "task": task_name,
+                                "model": args.model,
+                                "best_config": best_config,
+                                "configurations": all_generation_examples,
+                            },
+                            f,
+                            indent=2,
+                        )
+                    print(
+                        f"\n  💾 Saved generation examples for {len(all_generation_examples)} configurations to: {examples_path}"
+                    )
+                # Generate examples for --save-generation-examples, --show-comparisons, or --save-comparisons
+                show_comparisons = getattr(args, 'show_comparisons', 0)
+                save_comparisons = getattr(args, 'save_comparisons', None)
+                need_generation = args.save_generation_examples or show_comparisons > 0 or save_comparisons
+                if need_generation:
+                    # Save examples only for the best configuration
+                    print("\n  📝 Generating example responses for best configuration...")
+                    # Get a few test examples to generate from
+                    num_examples = min(args.num_generation_examples, len(test_pairs.pairs))
+                    example_pairs = test_pairs.pairs[:num_examples]
+                    generation_examples = []
+                    # Get inference config settings
+                    gen_kwargs = get_generate_kwargs()
+                    for idx, pair in enumerate(example_pairs):
+                        # Create prompt from the question
+                        prompt = pair.prompt
+                        try:
+                            # Generate without steering
+                            unsteered_response = model.generate(
+                                [[{"role": "user", "content": prompt}]],
+                                **get_generate_kwargs(max_new_tokens=100),
+                                use_steering=False,
+                            )[0]
+                            # Recreate best steering vector for generation
+                            best_layer_str = str(best_config["layer"])
+                            pos_acts_gen = []
+                            neg_acts_gen = []
+                            # Collect activations again for steering
+                            for train_pair in train_pairs.pairs[:20]:  # Use subset for speed
+                                updated_pair = collector.collect_for_pair(
+                                    train_pair,
+                                    layers=[best_layer_str],
+                                    aggregation=ActivationAggregationStrategy.MEAN_POOLING,
+                                    return_full_sequence=False,
+                                    normalize_layers=False,
+                                )
+                                if (
+                                    updated_pair.positive_response.layers_activations
+                                    and best_layer_str in updated_pair.positive_response.layers_activations
+                                ):
+                                    act = updated_pair.positive_response.layers_activations[best_layer_str]
+                                    if act is not None:
+                                        pos_acts_gen.append(act)
+                                if (
+                                    updated_pair.negative_response.layers_activations
+                                    and best_layer_str in updated_pair.negative_response.layers_activations
+                                ):
+                                    act = updated_pair.negative_response.layers_activations[best_layer_str]
+                                    if act is not None:
+                                        neg_acts_gen.append(act)
+                            # Create steering vector
+                            method_name_gen = args.methods[0] if args.methods else "CAA"
+                            steering_method_gen = create_steering_method(method_name_gen, args)
+                            steering_vector_gen = steering_method_gen.train_for_layer(pos_acts_gen, neg_acts_gen)
+                            # Create SteeringPlan
+                            from wisent.core.models.core.atoms import SteeringPlan, SteeringVector
+                            steering_vec = SteeringVector(vector=steering_vector_gen, scale=best_config["strength"])
+                            steering_plan = SteeringPlan(
+                                layers={best_layer_str: steering_vec},
+                                layers_description=[f"CAA steering for {task_name}"],
+                            )
+                            # Generate with steering
+                            model.apply_steering(steering_plan)
+                            steered_response = model.generate(
+                                [[{"role": "user", "content": prompt}]],
+                                **get_generate_kwargs(max_new_tokens=100),
+                                use_steering=True,
+                                steering_plan=steering_plan,
+                            )[0]
+                            model.detach()
+                            generation_examples.append(
+                                {
+                                    "question": prompt,
+                                    "correct_answer": pair.positive_response.model_response,
+                                    "incorrect_answer": pair.negative_response.model_response,
+                                    "unsteered_generation": unsteered_response,
+                                    "steered_generation": steered_response,
+                                }
+                            )
+                            print(f"      Generated example {idx + 1}/{num_examples}")
+                        except Exception as e:
+                            print(f"      ⚠️ Failed to generate example {idx + 1}: {e}")
+                            if args.verbose:
+                                import traceback
+                                traceback.print_exc()
+                    # Save examples to JSON only if --save-generation-examples is set
+                    if args.save_generation_examples:
+                        examples_path = os.path.join(
+                            args.save_best_vector if args.save_best_vector else "./optimization_results",
+                            f"{task_name}_generation_examples.json",
+                        )
+                        os.makedirs(os.path.dirname(examples_path), exist_ok=True)
+                        with open(examples_path, "w") as f:
+                            json.dump(
+                                {
+                                    "task": task_name,
+                                    "model": args.model,
+                                    "best_config": best_config,
+                                    "examples": generation_examples,
+                                },
+                                f,
+                                indent=2,
+                            )
+                        print(f"      💾 Saved {len(generation_examples)} generation examples to: {examples_path}")
+                    # Handle --show-comparisons and --save-comparisons flags
+                    if (show_comparisons > 0 or save_comparisons) and generation_examples:
+                        # Build comparisons list from generation_examples
+                        comparisons = []
+                        for ex in generation_examples:
+                            comparisons.append({
+                                "prompt": ex["question"],
+                                "baseline_response": ex["unsteered_generation"],
+                                "optimized_response": ex["steered_generation"],
+                                "correct_answer": ex.get("correct_answer", ""),
+                                "incorrect_answer": ex.get("incorrect_answer", ""),
+                            })
+                        # Save to JSON if requested
+                        if save_comparisons:
+                            os.makedirs(os.path.dirname(save_comparisons) if os.path.dirname(save_comparisons) else ".", exist_ok=True)
+                            with open(save_comparisons, "w") as f:
+                                json.dump({
+                                    "model": args.model,
+                                    "task": task_name,
+                                    "best_config": best_config,
+                                    "comparisons": comparisons,
+                                }, f, indent=2)
+                            print(f"      💾 Saved comparisons to: {save_comparisons}")
+                        # Display in console if requested
+                        if show_comparisons > 0:
+                            print(f"\n  📊 Top {min(show_comparisons, len(comparisons))} Baseline vs Optimized Comparisons:\n")
+                            for i, comp in enumerate(comparisons[:show_comparisons]):
+                                print(f"{'─'*80}")
+                                print(f"Comparison {i+1}/{min(show_comparisons, len(comparisons))}")
+                                print(f"{'─'*80}")
+                                print(f"PROMPT: {comp['prompt'][:200]}{'...' if len(comp['prompt']) > 200 else ''}")
+                                print()
+                                print(f"BASELINE (unsteered):")
+                                print(f"  {comp['baseline_response'][:300]}{'...' if len(comp['baseline_response']) > 300 else ''}")
+                                print()
+                                print(f"OPTIMIZED (steered):")
+                                print(f"  {comp['optimized_response'][:300]}{'...' if len(comp['optimized_response']) > 300 else ''}")
+                                print()
+            else:
+                print("\n  ⚠️  No valid configuration found")
+                method_results["CAA"] = {
+                    "optimal_layer": 8,
+                    "optimal_strength": 1.0,
+                    "optimal_strategy": "constant",
+                    "optimal_token_aggregation": "last_token",
+                    "optimal_prompt_construction": "chat_template",
+                    "accuracy": 0.5,
+                    "f1": 0.5,
+                }
+            all_results[task_name] = {
+                "methods": method_results,
+                "best_method": "CAA",
+                "best_layer": method_results["CAA"]["optimal_layer"],
+                "best_strength": method_results["CAA"]["optimal_strength"],
+                "best_strategy": method_results["CAA"]["optimal_strategy"],
+                "best_token_aggregation": method_results["CAA"]["optimal_token_aggregation"],
+                "best_prompt_construction": method_results["CAA"]["optimal_prompt_construction"],
+            }
+            task_time = time.time() - task_start_time
+            print(f"\n  ⏱️  Task completed in {task_time:.1f}s (tested {configs_tested} configurations)")
+        except Exception as e:
+            # NO FALLBACK - raise the error immediately
+            print(f"\n❌ Task '{task_name}' optimization failed:")
+            print(f"   Error: {e}")
+            import traceback
+            traceback.print_exc()
+            raise
+    # Save results
+    print(f"\n{'=' * 80}")
+    print("📊 COMPREHENSIVE OPTIMIZATION COMPLETE")
+    print(f"{'=' * 80}\n")
+    results_file = f"./optimization_results/steering_comprehensive_{args.model.replace('/', '_')}.json"
+    import os
+    os.makedirs(os.path.dirname(results_file), exist_ok=True)
+    output_data = {
+        "model": args.model,
+        "tasks": all_results,
+        "methods_tested": args.methods,
+        "limit": args.limit,
+        "optimization_dimensions": ["layer", "strength", "strategy", "token_aggregation", "prompt_construction"],
+    }
+    with open(results_file, "w") as f:
+        json.dump(output_data, f, indent=2)
+    print(f"✅ Results saved to: {results_file}\n")
+    # Print summary
+    print("📋 SUMMARY BY TASK:")
+    print("-" * 140)
+    for task_name, config in all_results.items():
+        print(
+            f"  {task_name:20s} | L{config['best_layer']:2d} S{config['best_strength']:.1f} | {config['best_strategy']:12s} | T:{config['best_token_aggregation']:12s} | P:{config['best_prompt_construction']:18s}"
+        )
+    print("-" * 140 + "\n")
+    # Store results in optimization cache
+    save_as_default = getattr(args, "save_as_default", False)
+    print("💾 Storing results in optimization cache...")
+    for task_name, config in all_results.items():
+        # Skip results that came from cache
+        if config.get("from_cache"):
+            continue
+        # Get best score from methods if available
+        best_score = 0.0
+        if "methods" in config and "CAA" in config["methods"]:
+            best_score = config["methods"]["CAA"].get("accuracy", 0.0)
+        # Get best method name
+        best_method_name = config.get("best_method", "CAA")
+        # Get method-specific parameters from the best config
+        method_config = config.get("methods", {}).get(best_method_name, {})
+        cache_key = store_optimization(
+            model=args.model,
+            task=task_name,
+            layer=config["best_layer"],
+            strength=config["best_strength"],
+            method=best_method_name,
+            token_aggregation=config.get("best_token_aggregation", "last_token"),
+            prompt_strategy=config.get("best_prompt_construction", "chat_template"),
+            strategy=config.get("best_strategy", "constant"),
+            score=best_score,
+            metric="accuracy",
+            metadata={"limit": args.limit},
+            set_as_default=save_as_default,
+            # PRISM parameters
+            num_directions=method_config.get("num_directions", 1),
+            direction_weighting=method_config.get("direction_weighting", "primary_only"),
+            retain_weight=method_config.get("retain_weight", 0.0),
+            independence_weight=method_config.get("independence_weight", 0.05),
+            prism_optimization_steps=method_config.get("optimization_steps", 100),
+            # PULSE parameters
+            sensor_layer=method_config.get("sensor_layer", -1),
+            steering_layers=method_config.get("steering_layers", ""),
+            condition_threshold=method_config.get("condition_threshold", 0.5),
+            gate_temperature=method_config.get("gate_temperature", 0.5),
+            per_layer_scaling=method_config.get("per_layer_scaling", True),
+            use_entropy_scaling=method_config.get("use_entropy_scaling", False),
+            max_alpha=method_config.get("max_alpha", 2.0),
+            # TITAN parameters
+            gate_hidden_dim=method_config.get("gate_hidden_dim", 64),
+            intensity_hidden_dim=method_config.get("intensity_hidden_dim", 32),
+            behavior_weight=method_config.get("behavior_weight", 1.0),
+            sparse_weight=method_config.get("sparse_weight", 0.05),
+            titan_optimization_steps=method_config.get("titan_optimization_steps", 200),
+            titan_learning_rate=method_config.get("titan_learning_rate", 0.005),
+            # Store all method params as generic dict
+            method_params=method_config,
+        )
+        print(f"   ✓ Cached {task_name}: {cache_key}")
+    if save_as_default:
+        print("   ✓ Results set as default configurations")
+    # Return results for programmatic access
+    return {
+        "model": args.model,
+        "action": "comprehensive",
+        "methods_tested": args.methods,
+        "tasks_optimized": list(all_results.keys()),
+        "results": all_results,
+        "results_file": results_file,
+        "optimization_dimensions": ["layer", "strength", "strategy", "token_aggregation", "prompt_construction"],
+    }
+def get_strategy_weight(strategy: str, position: float) -> float:
+    """
+    Calculate steering weight based on strategy and token position.
+    Args:
+        strategy: Steering strategy name
+        position: Token position as fraction (0.0 = start, 1.0 = end)
+    Returns:
+        Weight multiplier for steering vector
+    """
+    if strategy == "last_only":
+        return 1.0 if position >= 0.9 else 0.0
+    if strategy == "first_only":
+        return 1.0 if position <= 0.1 else 0.0
+    if strategy == "all_equal":
+        return 1.0
+    if strategy == "exponential_decay":
+        return np.exp(-3.0 * position)  # Decay rate of 3
+    if strategy == "exponential_growth":
+        return np.exp(3.0 * position)
+    if strategy == "linear_decay":
+        return 1.0 - position
+    if strategy == "linear_growth":
+        return position
+    return 1.0  # Default to all_equal
+def execute_compare_methods(args, model, loader):
+    """Execute method comparison - currently only CAA is implemented."""
+    import matplotlib.pyplot as plt
+    from wisent_plots import LineChart
+    from wisent.core.activations.activations_collector import ActivationCollector
+    from wisent.core.activations.core.atoms import ActivationAggregationStrategy
+    from wisent.core.models.core.atoms import SteeringPlan, SteeringVector
+    from wisent.core.cli.steering_method_trainer import create_steering_method
+    # Check for cached results if --use-cached is specified
+    use_cached = getattr(args, "use_cached", False)
+    save_as_default = getattr(args, "save_as_default", False)
+    if use_cached:
+        print(f"\n📦 Checking optimization cache for {args.task}...")
+        for method in args.methods:
+            cached = get_cached_optimization(args.model, args.task, method)
+            if cached:
+                print(
+                    f"   ✓ Found cached result for {method}: layer={cached.layer}, strength={cached.strength}, score={cached.score:.3f}"
+                )
+                return {
+                    "model": args.model,
+                    "action": "compare-methods",
+                    "task": args.task,
+                    "best_method": method,
+                    "best_layer": cached.layer,
+                    "best_strength": cached.strength,
+                    "best_score": cached.score,
+                    "from_cache": True,
+                }
+        print("   No cached results found. Running optimization...")
+    print(f"🔍 Comparing steering methods for task: {args.task}\n")
+    print(f"   Methods: {', '.join(args.methods)}")
+    print(f"   Limit: {args.limit} samples")
+    print(f"   Layer: {args.layer}")
+    print(f"   Strength: {args.strength}\n")
+    # Load task data
+    print("📊 Loading task data...")
+    result = loader._load_one_task(
+        task_name=args.task, split_ratio=0.8, seed=42, limit=args.limit, training_limit=None, testing_limit=None
+    )
+    train_pairs = result["train_qa_pairs"]
+    test_pairs = result["test_qa_pairs"]
+    print(f"   ✓ Loaded {len(train_pairs.pairs)} train, {len(test_pairs.pairs)} test pairs\n")
+    # Initialize evaluator
+    EvaluatorRotator.discover_evaluators("wisent.core.evaluators.benchmark_specific")
+    evaluator = EvaluatorRotator(evaluator=None, task_name=args.task)
+    print(f"   ✓ Using evaluator: {evaluator._plugin.name}\n")
+    # Collect activations once for all methods
+    layer_str = str(args.layer)
+    collector = ActivationCollector(model=model, store_device="cpu")
+    print("🎯 Collecting training activations (ONCE)...")
+    pos_acts = []
+    neg_acts = []
+    for i, pair in enumerate(train_pairs.pairs):
+        if i % 10 == 0:
+            print(f"   Processing train pair {i + 1}/{len(train_pairs.pairs)}...", end="\r")
+        updated_pair = collector.collect_for_pair(
+            pair,
+            layers=[layer_str],
+            aggregation=ActivationAggregationStrategy.MEAN_POOLING,
+            return_full_sequence=False,
+            normalize_layers=False,
+        )
+        if (
+            updated_pair.positive_response.layers_activations
+            and layer_str in updated_pair.positive_response.layers_activations
+        ):
+            act = updated_pair.positive_response.layers_activations[layer_str]
+            if act is not None:
+                pos_acts.append(act)
+        if (
+            updated_pair.negative_response.layers_activations
+            and layer_str in updated_pair.negative_response.layers_activations
+        ):
+            act = updated_pair.negative_response.layers_activations[layer_str]
+            if act is not None:
+                neg_acts.append(act)
+    print(f"   Processing train pair {len(train_pairs.pairs)}/{len(train_pairs.pairs)}... Done!")
+    print(f"   ✓ Collected {len(pos_acts)} positive, {len(neg_acts)} negative activations\n")
+    # Test each method
+    print("🧪 Testing methods...")
+    method_results = {}
+    # Only CAA is implemented for now
+    if "CAA" in args.methods:
+        print("\n   Testing CAA method...")
+        # Train steering vector using selected method
+        method_name = args.methods[0] if args.methods else "CAA"
+        steering_method = create_steering_method(method_name, args)
+        steering_vector = steering_method.train_for_layer(pos_acts, neg_acts)
+        # Create steering plan
+        steering_vec = SteeringVector(vector=steering_vector, scale=args.strength)
+        steering_plan = SteeringPlan(
+            layers={layer_str: steering_vec},
+            layers_description=[f"CAA steering layer={args.layer}, strength={args.strength}"],
+        )
+        # Apply steering and evaluate
+        model.apply_steering(steering_plan)
+        test_scores = []
+        detailed_results = []
+        for pair in test_pairs.pairs:
+            choices = [pair.negative_response.model_response, pair.positive_response.model_response]
+            expected = pair.positive_response.model_response
+            test_code = pair.metadata.get("test_code") if pair.metadata else None
+            eval_result = evaluator.evaluate(
+                response="",
+                expected=expected,
+                model=model,
+                question=pair.prompt,
+                choices=choices,
+                steering_plan=steering_plan,
+                test_code=test_code,
+                task_name=args.task,
+            )
+            is_correct = eval_result.ground_truth == "TRUTHFUL"
+            test_scores.append(1.0 if is_correct else 0.0)
+            # Save full evaluation details
+            detailed_results.append(
+                {
+                    "question": pair.prompt,
+                    "choices": choices,
+                    "expected": expected,
+                    "ground_truth": eval_result.ground_truth,
+                    "method_used": eval_result.method_used,
+                    "confidence": eval_result.confidence,
+                    "details": eval_result.details,
+                    "meta": dict(eval_result.meta) if eval_result.meta else {},
+                    "is_correct": is_correct,
+                }
+            )
+        model.clear_steering()
+        caa_accuracy = np.mean(test_scores) if len(test_scores) > 0 else 0.0
+        method_results["CAA"] = {
+            "accuracy": caa_accuracy,
+            "num_test_samples": len(test_scores),
+            "detailed_results": detailed_results,
+        }
+        print(f"      ✓ CAA: accuracy={caa_accuracy:.3f}")
+    # Other methods are not yet implemented
+    for method in args.methods:
+        if method not in ["CAA"]:
+            print(f"      ⚠️  {method}: not yet implemented")
+            method_results[method] = {"accuracy": 0.0, "status": "not_implemented"}
+    # Save results
+    print(f"\n{'=' * 80}")
+    print("📊 METHOD COMPARISON COMPLETE")
+    print(f"{'=' * 80}\n")
+    results_file = f"./optimization_results/steering_compare_methods_{args.task}_{args.model.replace('/', '_')}.json"
+    import os
+    os.makedirs(os.path.dirname(results_file), exist_ok=True)
+    output_data = {
+        "model": args.model,
+        "task": args.task,
+        "layer": args.layer,
+        "strength": args.strength,
+        "methods": method_results,
+        "limit": args.limit,
+    }
+    with open(results_file, "w") as f:
+        json.dump(output_data, f, indent=2)
+    print(f"✅ Results saved to: {results_file}\n")
+    # Create comparison plot if we have results
+    implemented_methods = [m for m in method_results if method_results[m].get("accuracy", 0) > 0]
+    if len(implemented_methods) > 1 and args.save_plot:
+        plot_path_svg = f"steering_compare_methods_{args.task}_{args.model.replace('/', '_')}.svg"
+        plot_path_png = f"steering_compare_methods_{args.task}_{args.model.replace('/', '_')}.png"
+        method_names = list(implemented_methods)
+        accuracies = [method_results[m]["accuracy"] for m in method_names]
+        chart = LineChart(style=1, figsize=(10, 6), show_markers=True)
+        fig, ax = plt.subplots(1, 1, figsize=(10, 6))
+        ax.bar(method_names, accuracies, color="#3498db", alpha=0.8)
+        ax.set_xlabel("Steering Method")
+        ax.set_ylabel("Accuracy")
+        ax.set_title(f"Steering Method Comparison\n{args.model} on {args.task}")
+        ax.set_ylim(0, 1)
+        fig.savefig(plot_path_svg, format="svg", bbox_inches="tight")
+        fig.savefig(plot_path_png, dpi=150, bbox_inches="tight")
+        plt.close(fig)
+        print("💾 Comparison plot saved to:")
+        print(f"   SVG: {plot_path_svg}")
+        print(f"   PNG: {plot_path_png}\n")
+    # Store best result in cache
+    save_as_default = getattr(args, "save_as_default", False)
+    best_method = max(method_results.keys(), key=lambda m: method_results[m].get("accuracy", 0))
+    best_accuracy = method_results[best_method].get("accuracy", 0)
+    if best_accuracy > 0:
+        print("💾 Storing best result in optimization cache...")
+        cache_key = store_optimization(
+            model=args.model,
+            task=args.task,
+            layer=args.layer,
+            strength=args.strength,
+            method=best_method,
+            strategy="constant",
+            score=best_accuracy,
+            metric="accuracy",
+            metadata={"limit": args.limit},
+            set_as_default=save_as_default,
+        )
+        print(f"   ✓ Cached: {cache_key}")
+        if save_as_default:
+            print("   ✓ Set as default configuration")
+    return {"action": "compare-methods", "task": args.task, "methods": method_results, "results_file": results_file}
+def execute_optimize_layer(args, model, loader):
+    """Execute layer optimization - find the best layer for steering."""
+    import matplotlib.pyplot as plt
+    from wisent_plots import LineChart
+    from wisent.core.activations.activations_collector import ActivationCollector
+    from wisent.core.activations.core.atoms import ActivationAggregationStrategy
+    from wisent.core.models.core.atoms import SteeringPlan, SteeringVector
+    from wisent.core.cli.steering_method_trainer import create_steering_method
+    # Check for cached results if --use-cached is specified
+    use_cached = getattr(args, "use_cached", False)
+    save_as_default = getattr(args, "save_as_default", False)
+    if use_cached:
+        print(f"\n📦 Checking optimization cache for {args.task}/{args.method}...")
+        cached = get_cached_optimization(args.model, args.task, args.method)
+        if cached:
+            print(
+                f"   ✓ Found cached result: layer={cached.layer}, strength={cached.strength}, score={cached.score:.3f}"
+            )
+            return {
+                "model": args.model,
+                "action": "optimize-layer",
+                "task": args.task,
+                "method": args.method,
+                "best_layer": cached.layer,
+                "best_strength": cached.strength,
+                "best_accuracy": cached.score,
+                "from_cache": True,
+            }
+        print("   No cached results found. Running optimization...")
+    print(f"🎯 Optimizing steering layer for task: {args.task}\n")
+    print(f"   Method: {args.method}")
+    print(f"   Strength: {args.strength}")
+    print(f"   Limit: {args.limit} samples\n")
+    # Load task data
+    print("📊 Loading task data...")
+    result = loader._load_one_task(
+        task_name=args.task, split_ratio=0.8, seed=42, limit=args.limit, training_limit=None, testing_limit=None
+    )
+    train_pairs = result["train_qa_pairs"]
+    test_pairs = result["test_qa_pairs"]
+    print(f"   ✓ Loaded {len(train_pairs.pairs)} train, {len(test_pairs.pairs)} test pairs\n")
+    # Initialize evaluator
+    EvaluatorRotator.discover_evaluators("wisent.core.evaluators.benchmark_specific")
+    evaluator = EvaluatorRotator(evaluator=None, task_name=args.task)
+    print(f"   ✓ Using evaluator: {evaluator._plugin.name}\n")
+    # Determine layers to test
+    if args.layers:
+        layers_to_test = args.layers
+    else:
+        # Test all layers from 0 to num_layers-1
+        layers_to_test = list(range(model.num_layers))
+    print(f"🔍 Testing {len(layers_to_test)} layers: {layers_to_test[:5]}{'...' if len(layers_to_test) > 5 else ''}\n")
+    collector = ActivationCollector(model=model, store_device="cpu")
+    layer_results = {}
+    best_layer = None
+    best_accuracy = 0.0
+    for layer_idx, layer in enumerate(layers_to_test, 1):
+        layer_str = str(layer)
+        print(f"   [{layer_idx}/{len(layers_to_test)}] Testing layer {layer}...", end=" ")
+        try:
+            # Collect activations for this layer
+            pos_acts = []
+            neg_acts = []
+            for pair in train_pairs.pairs:
+                updated_pair = collector.collect_for_pair(
+                    pair,
+                    layers=[layer_str],
+                    aggregation=ActivationAggregationStrategy.MEAN_POOLING,
+                    return_full_sequence=False,
+                    normalize_layers=False,
+                )
+                if (
+                    updated_pair.positive_response.layers_activations
+                    and layer_str in updated_pair.positive_response.layers_activations
+                ):
+                    act = updated_pair.positive_response.layers_activations[layer_str]
+                    if act is not None:
+                        pos_acts.append(act)
+                if (
+                    updated_pair.negative_response.layers_activations
+                    and layer_str in updated_pair.negative_response.layers_activations
+                ):
+                    act = updated_pair.negative_response.layers_activations[layer_str]
+                    if act is not None:
+                        neg_acts.append(act)
+            if len(pos_acts) == 0 or len(neg_acts) == 0:
+                print("⚠️  No activations collected")
+                continue
+            # Train steering vector using selected method
+            steering_method = create_steering_method(args.method, args)
+            steering_vector = steering_method.train_for_layer(pos_acts, neg_acts)
+            if False:  # Compatibility placeholder
+                print(f"⚠️  Method {args.method} not supported")
+                continue
+            # Create steering plan
+            steering_vec = SteeringVector(vector=steering_vector, scale=args.strength)
+            steering_plan = SteeringPlan(
+                layers={layer_str: steering_vec}, layers_description=[f"{args.method} steering layer={layer}"]
+            )
+            # Evaluate
+            model.apply_steering(steering_plan)
+            test_scores = []
+            detailed_results = []
+            for pair in test_pairs.pairs:
+                choices = [pair.negative_response.model_response, pair.positive_response.model_response]
+                expected = pair.positive_response.model_response
+                test_code = pair.metadata.get("test_code") if pair.metadata else None
+                eval_result = evaluator.evaluate(
+                    response="",
+                    expected=expected,
+                    model=model,
+                    question=pair.prompt,
+                    choices=choices,
+                    steering_plan=steering_plan,
+                    test_code=test_code,
+                    task_name=task_name,
+                )
+                is_correct = eval_result.ground_truth == "TRUTHFUL"
+                test_scores.append(1.0 if is_correct else 0.0)
+                # Save full evaluation details
+                detailed_results.append(
+                    {
+                        "question": pair.prompt,
+                        "choices": choices,
+                        "expected": expected,
+                        "ground_truth": eval_result.ground_truth,
+                        "method_used": eval_result.method_used,
+                        "confidence": eval_result.confidence,
+                        "details": eval_result.details,
+                        "meta": dict(eval_result.meta) if eval_result.meta else {},
+                        "is_correct": is_correct,
+                    }
+                )
+            model.clear_steering()
+            accuracy = np.mean(test_scores) if len(test_scores) > 0 else 0.0
+            layer_results[layer] = {
+                "accuracy": accuracy,
+                "num_test_samples": len(test_scores),
+                "detailed_results": detailed_results,
+            }
+            print(f"accuracy={accuracy:.3f}")
+            if accuracy > best_accuracy:
+                best_accuracy = accuracy
+                best_layer = layer
+        except Exception as e:
+            print(f"❌ Error: {e}")
+            if args.verbose:
+                import traceback
+                traceback.print_exc()
+    # Results
+    print(f"\n{'=' * 80}")
+    print("📊 LAYER OPTIMIZATION COMPLETE")
+    print(f"{'=' * 80}")
+    print(f"   Best layer: {best_layer}")
+    print(f"   Best accuracy: {best_accuracy:.4f}")
+    print(f"{'=' * 80}\n")
+    # Save results
+    results_file = f"./optimization_results/steering_optimize_layer_{args.task}_{args.model.replace('/', '_')}.json"
+    import os
+    os.makedirs(os.path.dirname(results_file), exist_ok=True)
+    output_data = {
+        "model": args.model,
+        "task": args.task,
+        "method": args.method,
+        "strength": args.strength,
+        "best_layer": best_layer,
+        "best_accuracy": best_accuracy,
+        "layer_results": {str(k): v for k, v in layer_results.items()},
+        "limit": args.limit,
+    }
+    with open(results_file, "w") as f:
+        json.dump(output_data, f, indent=2)
+    print(f"✅ Results saved to: {results_file}\n")
+    # Create plot
+    if args.save_plot and len(layer_results) > 0:
+        plot_path_svg = f"steering_optimize_layer_{args.task}_{args.model.replace('/', '_')}.svg"
+        plot_path_png = f"steering_optimize_layer_{args.task}_{args.model.replace('/', '_')}.png"
+        layers = sorted(layer_results.keys())
+        accuracies = [layer_results[l]["accuracy"] for l in layers]
+        chart = LineChart(style=1, figsize=(10, 6), show_markers=True)
+        fig, ax = plt.subplots(1, 1, figsize=(10, 6))
+        chart.plot_multiple(
+            x=layers,
+            y_series=[accuracies],
+            labels=["Accuracy"],
+            title=f"Layer Optimization\n{args.model} on {args.task}",
+            xlabel="Layer",
+            ylabel="Accuracy",
+            fig=fig,
+            ax=ax,
+            output_format="png",
+        )
+        # Add vertical line for optimal layer
+        ax.axvline(
+            x=best_layer, color="#2ecc71", linestyle="--", linewidth=2, label=f"Best: Layer {best_layer}", alpha=0.7
+        )
+        ax.legend()
+        fig.savefig(plot_path_svg, format="svg", bbox_inches="tight")
+        fig.savefig(plot_path_png, dpi=150, bbox_inches="tight")
+        plt.close(fig)
+        print("💾 Layer optimization plot saved to:")
+        print(f"   SVG: {plot_path_svg}")
+        print(f"   PNG: {plot_path_png}\n")
+    # Store result in cache
+    save_as_default = getattr(args, "save_as_default", False)
+    if best_layer is not None and best_accuracy > 0:
+        print("💾 Storing result in optimization cache...")
+        cache_key = store_optimization(
+            model=args.model,
+            task=args.task,
+            layer=best_layer,
+            strength=args.strength,
+            method=args.method,
+            strategy="constant",
+            score=best_accuracy,
+            metric="accuracy",
+            metadata={"limit": args.limit},
+            set_as_default=save_as_default,
+        )
+        print(f"   ✓ Cached: {cache_key}")
+        if save_as_default:
+            print("   ✓ Set as default configuration")
+    return {
+        "action": "optimize-layer",
+        "task": args.task,
+        "method": args.method,
+        "best_layer": best_layer,
+        "best_accuracy": best_accuracy,
+        "results_file": results_file,
+    }
+def execute_optimize_strength(args, model, loader):
+    """Execute strength optimization - find the best steering strength."""
+    import matplotlib.pyplot as plt
+    from wisent_plots import LineChart
+    from wisent.core.activations.activations_collector import ActivationCollector
+    from wisent.core.activations.core.atoms import ActivationAggregationStrategy
+    from wisent.core.models.core.atoms import SteeringPlan, SteeringVector
+    from wisent.core.cli.steering_method_trainer import create_steering_method
+    # Check for cached results if --use-cached is specified
+    use_cached = getattr(args, "use_cached", False)
+    save_as_default = getattr(args, "save_as_default", False)
+    if use_cached:
+        print(f"\n📦 Checking optimization cache for {args.task}/{args.method}...")
+        cached = get_cached_optimization(args.model, args.task, args.method)
+        if cached:
+            print(
+                f"   ✓ Found cached result: layer={cached.layer}, strength={cached.strength}, score={cached.score:.3f}"
+            )
+            return {
+                "model": args.model,
+                "action": "optimize-strength",
+                "task": args.task,
+                "method": args.method,
+                "best_layer": cached.layer,
+                "best_strength": cached.strength,
+                "best_accuracy": cached.score,
+                "from_cache": True,
+            }
+        print("   No cached results found. Running optimization...")
+    print(f"💪 Optimizing steering strength for task: {args.task}\n")
+    print(f"   Method: {args.method}")
+    print(f"   Layer: {args.layer}")
+    print(f"   Strength range: {args.strength_range[0]} to {args.strength_range[1]}")
+    print(f"   Num steps: {args.num_strength_steps}")
+    print(f"   Limit: {args.limit} samples\n")
+    # Load task data
+    print("📊 Loading task data...")
+    result = loader._load_one_task(
+        task_name=args.task, split_ratio=0.8, seed=42, limit=args.limit, training_limit=None, testing_limit=None
+    )
+    train_pairs = result["train_qa_pairs"]
+    test_pairs = result["test_qa_pairs"]
+    print(f"   ✓ Loaded {len(train_pairs.pairs)} train, {len(test_pairs.pairs)} test pairs\n")
+    # Initialize evaluator
+    EvaluatorRotator.discover_evaluators("wisent.core.evaluators.benchmark_specific")
+    evaluator = EvaluatorRotator(evaluator=None, task_name=args.task)
+    print(f"   ✓ Using evaluator: {evaluator._plugin.name}\n")
+    # Collect activations ONCE
+    layer_str = str(args.layer)
+    collector = ActivationCollector(model=model, store_device="cpu")
+    print("🎯 Collecting training activations (ONCE)...")
+    pos_acts = []
+    neg_acts = []
+    for i, pair in enumerate(train_pairs.pairs):
+        if i % 10 == 0:
+            print(f"   Processing train pair {i + 1}/{len(train_pairs.pairs)}...", end="\r")
+        updated_pair = collector.collect_for_pair(
+            pair,
+            layers=[layer_str],
+            aggregation=ActivationAggregationStrategy.MEAN_POOLING,
+            return_full_sequence=False,
+            normalize_layers=False,
+        )
+        if (
+            updated_pair.positive_response.layers_activations
+            and layer_str in updated_pair.positive_response.layers_activations
+        ):
+            act = updated_pair.positive_response.layers_activations[layer_str]
+            if act is not None:
+                pos_acts.append(act)
+        if (
+            updated_pair.negative_response.layers_activations
+            and layer_str in updated_pair.negative_response.layers_activations
+        ):
+            act = updated_pair.negative_response.layers_activations[layer_str]
+            if act is not None:
+                neg_acts.append(act)
+    print(f"   Processing train pair {len(train_pairs.pairs)}/{len(train_pairs.pairs)}... Done!")
+    print(f"   ✓ Collected {len(pos_acts)} positive, {len(neg_acts)} negative activations\n")
+    # Train steering vector using selected method
+    steering_method = create_steering_method(args.method, args)
+    steering_vector = steering_method.train_for_layer(pos_acts, neg_acts)
+    if False:  # Compatibility placeholder
+        print(f"❌ Method {args.method} not supported")
+        return {
+            "action": "optimize-strength",
+            "task": args.task,
+            "method": args.method,
+            "status": "method_not_supported",
+        }
+    # Generate strength values to test
+    min_strength, max_strength = args.strength_range
+    strengths_to_test = np.linspace(min_strength, max_strength, args.num_strength_steps)
+    print(
+        f"🔍 Testing {len(strengths_to_test)} strength values: {strengths_to_test[0]:.2f} to {strengths_to_test[-1]:.2f}\n"
+    )
+    strength_results = {}
+    best_strength = None
+    best_accuracy = 0.0
+    for strength_idx, strength in enumerate(strengths_to_test, 1):
+        print(f"   [{strength_idx}/{len(strengths_to_test)}] Testing strength {strength:.2f}...", end=" ")
+        try:
+            # Create steering plan with this strength
+            steering_vec = SteeringVector(vector=steering_vector, scale=float(strength))
+            steering_plan = SteeringPlan(
+                layers={layer_str: steering_vec}, layers_description=[f"{args.method} steering strength={strength:.2f}"]
+            )
+            # Evaluate
+            model.apply_steering(steering_plan)
+            test_scores = []
+            detailed_results = []
+            for pair in test_pairs.pairs:
+                choices = [pair.negative_response.model_response, pair.positive_response.model_response]
+                expected = pair.positive_response.model_response
+                test_code = pair.metadata.get("test_code") if pair.metadata else None
+                eval_result = evaluator.evaluate(
+                    response="",
+                    expected=expected,
+                    model=model,
+                    question=pair.prompt,
+                    choices=choices,
+                    steering_plan=steering_plan,
+                    test_code=test_code,
+                    task_name=task_name,
+                )
+                is_correct = eval_result.ground_truth == "TRUTHFUL"
+                test_scores.append(1.0 if is_correct else 0.0)
+                # Save full evaluation details
+                detailed_results.append(
+                    {
+                        "question": pair.prompt,
+                        "choices": choices,
+                        "expected": expected,
+                        "ground_truth": eval_result.ground_truth,
+                        "method_used": eval_result.method_used,
+                        "confidence": eval_result.confidence,
+                        "details": eval_result.details,
+                        "meta": dict(eval_result.meta) if eval_result.meta else {},
+                        "is_correct": is_correct,
+                    }
+                )
+            model.clear_steering()
+            accuracy = np.mean(test_scores) if len(test_scores) > 0 else 0.0
+            strength_results[float(strength)] = {
+                "accuracy": accuracy,
+                "num_test_samples": len(test_scores),
+                "detailed_results": detailed_results,
+            }
+            print(f"accuracy={accuracy:.3f}")
+            if accuracy > best_accuracy:
+                best_accuracy = accuracy
+                best_strength = float(strength)
+        except Exception as e:
+            print(f"❌ Error: {e}")
+            if args.verbose:
+                import traceback
+                traceback.print_exc()
+    # Results
+    print(f"\n{'=' * 80}")
+    print("📊 STRENGTH OPTIMIZATION COMPLETE")
+    print(f"{'=' * 80}")
+    print(f"   Best strength: {best_strength:.2f}")
+    print(f"   Best accuracy: {best_accuracy:.4f}")
+    print(f"{'=' * 80}\n")
+    # Save results
+    results_file = f"./optimization_results/steering_optimize_strength_{args.task}_{args.model.replace('/', '_')}.json"
+    import os
+    os.makedirs(os.path.dirname(results_file), exist_ok=True)
+    output_data = {
+        "model": args.model,
+        "task": args.task,
+        "method": args.method,
+        "layer": args.layer,
+        "best_strength": best_strength,
+        "best_accuracy": best_accuracy,
+        "strength_results": {str(k): v for k, v in strength_results.items()},
+        "limit": args.limit,
+    }
+    with open(results_file, "w") as f:
+        json.dump(output_data, f, indent=2)
+    print(f"✅ Results saved to: {results_file}\n")
+    # Create plot
+    if args.save_plot and len(strength_results) > 0:
+        plot_path_svg = f"steering_optimize_strength_{args.task}_{args.model.replace('/', '_')}.svg"
+        plot_path_png = f"steering_optimize_strength_{args.task}_{args.model.replace('/', '_')}.png"
+        strengths = sorted(strength_results.keys())
+        accuracies = [strength_results[s]["accuracy"] for s in strengths]
+        chart = LineChart(style=1, figsize=(10, 6), show_markers=True)
+        fig, ax = plt.subplots(1, 1, figsize=(10, 6))
+        chart.plot_multiple(
+            x=strengths,
+            y_series=[accuracies],
+            labels=["Accuracy"],
+            title=f"Strength Optimization\n{args.model} on {args.task}",
+            xlabel="Steering Strength",
+            ylabel="Accuracy",
+            fig=fig,
+            ax=ax,
+            output_format="png",
+        )
+        # Add vertical line for optimal strength
+        ax.axvline(
+            x=best_strength, color="#2ecc71", linestyle="--", linewidth=2, label=f"Best: {best_strength:.2f}", alpha=0.7
+        )
+        ax.legend()
+        fig.savefig(plot_path_svg, format="svg", bbox_inches="tight")
+        fig.savefig(plot_path_png, dpi=150, bbox_inches="tight")
+        plt.close(fig)
+        print("💾 Strength optimization plot saved to:")
+        print(f"   SVG: {plot_path_svg}")
+        print(f"   PNG: {plot_path_png}\n")
+    # Store result in cache
+    save_as_default = getattr(args, "save_as_default", False)
+    if best_strength is not None and best_accuracy > 0:
+        print("💾 Storing result in optimization cache...")
+        cache_key = store_optimization(
+            model=args.model,
+            task=args.task,
+            layer=args.layer,
+            strength=best_strength,
+            method=args.method,
+            strategy="constant",
+            score=best_accuracy,
+            metric="accuracy",
+            metadata={"limit": args.limit, "strength_range": args.strength_range},
+            set_as_default=save_as_default,
+        )
+        print(f"   ✓ Cached: {cache_key}")
+        if save_as_default:
+            print("   ✓ Set as default configuration")
+    return {
+        "action": "optimize-strength",
+        "task": args.task,
+        "method": args.method,
+        "best_strength": best_strength,
+        "best_accuracy": best_accuracy,
+        "results_file": results_file,
+    }
+def execute_auto(args, model, loader):
+    """Execute automatic optimization - optimizes layer AND strength together."""
+    import matplotlib.pyplot as plt
+    from wisent.core.activations.activations_collector import ActivationCollector
+    from wisent.core.activations.core.atoms import ActivationAggregationStrategy
+    from wisent.core.models.core.atoms import SteeringPlan, SteeringVector
+    from wisent.core.cli.steering_method_trainer import create_steering_method
+    # Check for cached results if --use-cached is specified
+    use_cached = getattr(args, "use_cached", False)
+    save_as_default = getattr(args, "save_as_default", False)
+    task_name = args.task or "default"
+    if use_cached:
+        print(f"\n📦 Checking optimization cache for {task_name}...")
+        for method in args.methods:
+            cached = get_cached_optimization(args.model, task_name, method)
+            if cached:
+                print(
+                    f"   ✓ Found cached result for {method}: layer={cached.layer}, strength={cached.strength}, score={cached.score:.3f}"
+                )
+                return {
+                    "model": args.model,
+                    "action": "auto",
+                    "task": task_name,
+                    "best_method": method,
+                    "best_layer": cached.layer,
+                    "best_strength": cached.strength,
+                    "best_accuracy": cached.score,
+                    "from_cache": True,
+                }
+        print("   No cached results found. Running optimization...")
+    print("🤖 Running automatic steering optimization...\n")
+    print(f"   Task: {args.task}")
+    print(f"   Methods: {', '.join(args.methods)}")
+    print(f"   Strength range: {args.strength_range}")
+    print(f"   Limit: {args.limit} samples\n")
+    # Load task data
+    print("📊 Loading task data...")
+    result = loader._load_one_task(
+        task_name=args.task, split_ratio=0.8, seed=42, limit=args.limit, training_limit=None, testing_limit=None
+    )
+    train_pairs = result["train_qa_pairs"]
+    test_pairs = result["test_qa_pairs"]
+    print(f"   ✓ Loaded {len(train_pairs.pairs)} train, {len(test_pairs.pairs)} test pairs\n")
+    # Initialize evaluator
+    EvaluatorRotator.discover_evaluators("wisent.core.evaluators.benchmark_specific")
+    evaluator = EvaluatorRotator(evaluator=None, task_name=args.task)
+    print(f"   ✓ Using evaluator: {evaluator._plugin.name}\n")
+    # Define search space
+    layers_to_test = list(
+        range(max(0, model.num_layers // 2 - 2), min(model.num_layers, model.num_layers // 2 + 3))
+    )  # Test 5 layers around middle
+    min_strength, max_strength = args.strength_range
+    strengths_to_test = np.linspace(min_strength, max_strength, 5)  # 5 strength values
+    print("🔍 Auto-optimizing layer and strength...")
+    print(f"   Testing {len(layers_to_test)} layers: {layers_to_test}")
+    print(f"   Testing {len(strengths_to_test)} strengths: {strengths_to_test[0]:.2f} to {strengths_to_test[-1]:.2f}")
+    print(f"   Total configurations: {len(layers_to_test) * len(strengths_to_test)}\n")
+    collector = ActivationCollector(model=model, store_device="cpu")
+    all_results = {}
+    best_config = None
+    best_accuracy = 0.0
+    config_count = 0
+    total_configs = len(layers_to_test) * len(strengths_to_test)
+    for layer in layers_to_test:
+        layer_str = str(layer)
+        # Collect activations for this layer
+        print(f"   Collecting activations for layer {layer}...")
+        pos_acts = []
+        neg_acts = []
+        for pair in train_pairs.pairs:
+            updated_pair = collector.collect_for_pair(
+                pair,
+                layers=[layer_str],
+                aggregation=ActivationAggregationStrategy.MEAN_POOLING,
+                return_full_sequence=False,
+                normalize_layers=False,
+            )
+            if (
+                updated_pair.positive_response.layers_activations
+                and layer_str in updated_pair.positive_response.layers_activations
+            ):
+                act = updated_pair.positive_response.layers_activations[layer_str]
+                if act is not None:
+                    pos_acts.append(act)
+            if (
+                updated_pair.negative_response.layers_activations
+                and layer_str in updated_pair.negative_response.layers_activations
+            ):
+                act = updated_pair.negative_response.layers_activations[layer_str]
+                if act is not None:
+                    neg_acts.append(act)
+        if len(pos_acts) == 0 or len(neg_acts) == 0:
+            print(f"      ⚠️  No activations collected for layer {layer}")
+            continue
+        # Train steering vector for this layer using selected method
+        method_name = args.methods[0] if args.methods else "CAA"
+        steering_method = create_steering_method(method_name, args)
+        steering_vector = steering_method.train_for_layer(pos_acts, neg_acts)
+        if False:  # Compatibility placeholder
+            print("      ⚠️  Only CAA method is supported")
+            continue
+        # Test different strengths for this layer
+        for strength in strengths_to_test:
+            config_count += 1
+            print(f"      [{config_count}/{total_configs}] Layer {layer}, Strength {strength:.2f}...", end=" ")
+            try:
+                # Create steering plan
+                steering_vec = SteeringVector(vector=steering_vector, scale=float(strength))
+                steering_plan = SteeringPlan(
+                    layers={layer_str: steering_vec}, layers_description=[f"CAA layer={layer}, strength={strength:.2f}"]
+                )
+                # Evaluate
+                model.apply_steering(steering_plan)
+                test_scores = []
+                detailed_results = []
+                for pair in test_pairs.pairs:
+                    choices = [pair.negative_response.model_response, pair.positive_response.model_response]
+                    expected = pair.positive_response.model_response
+                    test_code = pair.metadata.get("test_code") if pair.metadata else None
+                    eval_result = evaluator.evaluate(
+                        response="",
+                        expected=expected,
+                        model=model,
+                        question=pair.prompt,
+                        choices=choices,
+                        steering_plan=steering_plan,
+                        test_code=test_code,
+                        task_name=task_name,
+                    )
+                    is_correct = eval_result.ground_truth == "TRUTHFUL"
+                    test_scores.append(1.0 if is_correct else 0.0)
+                    # Save full evaluation details
+                    detailed_results.append(
+                        {
+                            "question": pair.prompt,
+                            "choices": choices,
+                            "expected": expected,
+                            "ground_truth": eval_result.ground_truth,
+                            "method_used": eval_result.method_used,
+                            "confidence": eval_result.confidence,
+                            "details": eval_result.details,
+                            "meta": dict(eval_result.meta) if eval_result.meta else {},
+                            "is_correct": is_correct,
+                        }
+                    )
+                model.clear_steering()
+                accuracy = np.mean(test_scores) if len(test_scores) > 0 else 0.0
+                all_results[(layer, float(strength))] = {
+                    "accuracy": accuracy,
+                    "num_test_samples": len(test_scores),
+                    "detailed_results": detailed_results,
+                }
+                print(f"accuracy={accuracy:.3f}")
+                if accuracy > best_accuracy:
+                    best_accuracy = accuracy
+                    best_config = {"layer": layer, "strength": float(strength), "accuracy": accuracy}
+            except Exception as e:
+                print(f"❌ Error: {e}")
+                if args.verbose:
+                    import traceback
+                    traceback.print_exc()
+    # Results
+    print(f"\n{'=' * 80}")
+    print("📊 AUTO OPTIMIZATION COMPLETE")
+    print(f"{'=' * 80}")
+    if best_config:
+        print(f"   Best layer: {best_config['layer']}")
+        print(f"   Best strength: {best_config['strength']:.2f}")
+        print(f"   Best accuracy: {best_config['accuracy']:.4f}")
+    else:
+        print("   ⚠️  No valid configuration found")
+    print(f"{'=' * 80}\n")
+    # Save results
+    results_file = f"./optimization_results/steering_auto_{args.task}_{args.model.replace('/', '_')}.json"
+    import os
+    os.makedirs(os.path.dirname(results_file), exist_ok=True)
+    output_data = {
+        "model": args.model,
+        "task": args.task,
+        "methods": args.methods,
+        "best_config": best_config,
+        "all_results": {f"layer{k[0]}_strength{k[1]:.2f}": v for k, v in all_results.items()},
+        "limit": args.limit,
+    }
+    with open(results_file, "w") as f:
+        json.dump(output_data, f, indent=2)
+    print(f"✅ Results saved to: {results_file}\n")
+    # Create heatmap plot
+    if args.save_plot and len(all_results) > 0 and best_config:
+        plot_path_svg = f"steering_auto_{args.task}_{args.model.replace('/', '_')}.svg"
+        plot_path_png = f"steering_auto_{args.task}_{args.model.replace('/', '_')}.png"
+        # Prepare data for heatmap
+        layers = sorted(set(k[0] for k in all_results))
+        strengths = sorted(set(k[1] for k in all_results))
+        # Create accuracy matrix
+        accuracy_matrix = np.zeros((len(strengths), len(layers)))
+        for i, strength in enumerate(strengths):
+            for j, layer in enumerate(layers):
+                if (layer, strength) in all_results:
+                    accuracy_matrix[i, j] = all_results[(layer, strength)]["accuracy"]
+        fig, ax = plt.subplots(1, 1, figsize=(10, 8))
+        im = ax.imshow(accuracy_matrix, cmap="viridis", aspect="auto")
+        # Set ticks and labels
+        ax.set_xticks(np.arange(len(layers)))
+        ax.set_yticks(np.arange(len(strengths)))
+        ax.set_xticklabels(layers)
+        ax.set_yticklabels([f"{s:.2f}" for s in strengths])
+        # Labels
+        ax.set_xlabel("Layer")
+        ax.set_ylabel("Strength")
+        ax.set_title(f"Auto Optimization Heatmap\n{args.model} on {args.task}")
+        # Colorbar
+        cbar = plt.colorbar(im, ax=ax)
+        cbar.set_label("Accuracy", rotation=270, labelpad=15)
+        # Mark best configuration
+        best_layer_idx = layers.index(best_config["layer"])
+        best_strength_idx = strengths.index(best_config["strength"])
+        ax.plot(
+            best_layer_idx,
+            best_strength_idx,
+            "r*",
+            markersize=20,
+            label=f"Best: L{best_config['layer']}, S{best_config['strength']:.2f}",
+        )
+        ax.legend()
+        fig.savefig(plot_path_svg, format="svg", bbox_inches="tight")
+        fig.savefig(plot_path_png, dpi=150, bbox_inches="tight")
+        plt.close(fig)
+        print("💾 Auto optimization heatmap saved to:")
+        print(f"   SVG: {plot_path_svg}")
+        print(f"   PNG: {plot_path_png}\n")
+    # Store result in cache
+    save_as_default = getattr(args, "save_as_default", False)
+    if best_config and best_config.get("accuracy", 0) > 0:
+        print("💾 Storing result in optimization cache...")
+        cache_key = store_optimization(
+            model=args.model,
+            task=args.task or "auto",
+            layer=best_config["layer"],
+            strength=best_config["strength"],
+            method=best_config.get("method", "CAA"),
+            strategy=best_config.get("strategy", "constant"),
+            score=best_config["accuracy"],
+            metric="accuracy",
+            metadata={"limit": args.limit, "strength_range": list(args.strength_range)},
+            set_as_default=save_as_default,
+        )
+        print(f"   ✓ Cached: {cache_key}")
+        if save_as_default:
+            print("   ✓ Set as default configuration")
+    return {
+        "action": "auto",
+        "task": args.task,
+        "methods": args.methods,
+        "best_config": best_config,
+        "results_file": results_file,
+    }
+def execute_personalization(args, model):
+    """
+    Execute personalization optimization - find optimal parameters for trait steering.
+    This optimizes ALL steering parameters for personality/trait vectors by:
+    1. Generating synthetic contrastive pairs for the trait
+    2. Testing all combinations of:
+       - Layers (where to apply steering)
+       - Strengths (how strong the steering signal is)
+       - Token aggregation strategies (LAST_TOKEN, MEAN_POOLING, FIRST_TOKEN)
+       - Prompt construction strategies (CHAT_TEMPLATE, DIRECT_COMPLETION)
+    3. Evaluating each configuration using personalization metrics:
+       - Difference: Is the steered response different from baseline?
+       - Quality: Is the response coherent (not lobotomized)?
+       - Alignment: Does the response match the intended trait?
+    4. Selecting the configuration with the highest overall score
+    """
+    import os
+    import torch
+    from wisent.core.activations.activations_collector import ActivationCollector
+    from wisent.core.activations.core.atoms import ActivationAggregationStrategy
+    from wisent.core.activations.prompt_construction_strategy import PromptConstructionStrategy
+    from wisent.core.evaluators.steering_evaluators import PersonalizationEvaluator
+    from wisent.core.models.core.atoms import SteeringPlan, SteeringVector
+    from wisent.core.cli.steering_method_trainer import create_steering_method
+    from wisent.core.synthetic.cleaners.pairs_cleaner import PairsCleaner
+    from wisent.core.synthetic.db_instructions.mini_dp import Default_DB_Instructions
+    from wisent.core.synthetic.generators.diversities.methods.fast_diversity import FastDiversity
+    from wisent.core.synthetic.generators.pairs_generator import SyntheticContrastivePairsGenerator
+    trait = args.trait
+    trait_name = args.trait_name or trait.split()[0].lower()
+    print(f"\n{'=' * 80}", flush=True)
+    print("🎭 PERSONALIZATION OPTIMIZATION (COMPREHENSIVE)", flush=True)
+    print(f"{'=' * 80}", flush=True)
+    print(f"   Trait: {trait}", flush=True)
+    print(f"   Trait Name: {trait_name}", flush=True)
+    print(f"   Model: {args.model}", flush=True)
+    print(f"   Num Pairs: {args.num_pairs}", flush=True)
+    print(f"   Num Test Prompts: {args.num_test_prompts}", flush=True)
+    print(f"   Output Directory: {args.output_dir}", flush=True)
+    print(f"{'=' * 80}\n", flush=True)
+    # Create output directory
+    os.makedirs(args.output_dir, exist_ok=True)
+    os.makedirs(os.path.join(args.output_dir, "vectors"), exist_ok=True)
+    # Determine layers to test - ALL layers by default
+    if args.layers:
+        layers_to_test = args.layers
+    else:
+        # Test ALL layers (1-indexed, since activation collector uses 1-based indexing)
+        num_layers = model.num_layers
+        layers_to_test = list(range(1, num_layers + 1))
+    # Determine strengths to test
+    min_strength, max_strength = args.strength_range
+    strengths_to_test = np.linspace(min_strength, max_strength, args.num_strength_steps)
+    # Token aggregation strategies to test - ALL strategies
+    token_aggregations_to_test = [
+        ActivationAggregationStrategy.LAST_TOKEN,
+        ActivationAggregationStrategy.MEAN_POOLING,
+        ActivationAggregationStrategy.FIRST_TOKEN,
+        ActivationAggregationStrategy.MAX_POOLING,
+    ]
+    # Prompt construction strategies to test - ALL strategies
+    prompt_constructions_to_test = [
+        PromptConstructionStrategy.CHAT_TEMPLATE,
+        PromptConstructionStrategy.DIRECT_COMPLETION,
+        PromptConstructionStrategy.INSTRUCTION_FOLLOWING,
+        PromptConstructionStrategy.ROLE_PLAYING,
+        PromptConstructionStrategy.MULTIPLE_CHOICE,
+    ]
+    # Steering application strategies to test - ALL strategies
+    steering_strategies_to_test = ["constant", "initial_only", "diminishing", "all_equal"]
+    total_configs = (
+        len(layers_to_test)
+        * len(strengths_to_test)
+        * len(steering_strategies_to_test)
+        * len(token_aggregations_to_test)
+        * len(prompt_constructions_to_test)
+    )
+    print("📊 Search Space:", flush=True)
+    print(f"   Layers: {layers_to_test} ({len(layers_to_test)} total)", flush=True)
+    print(f"   Strengths: {[f'{s:.2f}' for s in strengths_to_test]}", flush=True)
+    print(f"   Steering Strategies: {steering_strategies_to_test}", flush=True)
+    print(f"   Token Aggregations: {[t.value for t in token_aggregations_to_test]}", flush=True)
+    print(f"   Prompt Constructions: {[p.value for p in prompt_constructions_to_test]}", flush=True)
+    print(f"   Total configurations: {total_configs}\n", flush=True)
+    # Step 1: Generate synthetic contrastive pairs
+    print(f"🔧 Step 1: Generating {args.num_pairs} synthetic contrastive pairs...", flush=True)
+    # Create the pair generator with required components
+    cleaner = PairsCleaner(steps=[])  # Minimal cleaning
+    diversity = FastDiversity()
+    db_instructions = Default_DB_Instructions()
+    pair_generator = SyntheticContrastivePairsGenerator(
+        model=model,
+        generation_config=get_generate_kwargs(max_new_tokens=150),
+        contrastive_set_name=f"{trait_name}_pairs",
+        trait_description=trait,
+        trait_label=trait_name,
+        db_instructions=db_instructions,
+        cleaner=cleaner,
+        diversity=diversity,
+    )
+    pair_set, generation_report = pair_generator.generate(num_pairs=args.num_pairs)
+    pairs = pair_set.pairs
+    # Extract positive and negative examples for alignment evaluation
+    positive_examples = [p.positive_response.model_response for p in pairs]
+    negative_examples = [p.negative_response.model_response for p in pairs]
+    print(f"   ✓ Generated {len(pairs)} contrastive pairs\n", flush=True)
+    # Generate test prompts for evaluation
+    test_prompts = [
+        "What's your favorite food?",
+        "How do you spend your weekends?",
+        "What motivates you in life?",
+        "How do you handle setbacks?",
+        "What's your opinion on teamwork?",
+    ][: args.num_test_prompts]
+    print("📝 Test prompts for evaluation:", flush=True)
+    for i, prompt in enumerate(test_prompts, 1):
+        print(f"   {i}. {prompt}", flush=True)
+    print(flush=True)
+    # Initialize activation collector
+    collector = ActivationCollector(model=model, store_device="cpu")
+    # Track results for all configurations
+    all_results = {}
+    best_config = None
+    best_score = -1.0
+    best_steering_vector = None
+    # Cache for steering vectors per (layer, token_agg, prompt_const) combination
+    # to avoid recomputing activations unnecessarily
+    steering_vector_cache = {}
+    # Checkpoint file for resuming interrupted runs
+    checkpoint_file = os.path.join(args.output_dir, f"{trait_name}_checkpoint.json")
+    completed_configs = set()
+    # Load checkpoint if it exists (resume mode) - check local first, then S3
+    if not os.path.exists(checkpoint_file):
+        # Try to download from S3
+        try:
+            import subprocess
+            s3_checkpoint_path = f"s3://wisent-bucket/checkpoints/{trait_name}_checkpoint.json"
+            print(f"\n📂 Checking S3 for checkpoint: {s3_checkpoint_path}", flush=True)
+            result = subprocess.run(
+                ["aws", "s3", "cp", s3_checkpoint_path, checkpoint_file],
+                capture_output=True,
+                timeout=60
+            )
+            if result.returncode == 0:
+                print(f"   ✓ Downloaded checkpoint from S3", flush=True)
+        except Exception:
+            pass  # No S3 checkpoint available
+    if os.path.exists(checkpoint_file):
+        print(f"\n📂 Found checkpoint file: {checkpoint_file}", flush=True)
+        try:
+            with open(checkpoint_file, "r") as f:
+                checkpoint_data = json.load(f)
+            all_results = checkpoint_data.get("all_results", {})
+            completed_configs = set(all_results.keys())
+            best_config = checkpoint_data.get("best_config")
+            best_score = checkpoint_data.get("best_score", -1.0)
+            print(f"   ✓ Loaded {len(completed_configs)} completed configurations", flush=True)
+            print(f"   ✓ Current best score: {best_score:.4f}", flush=True)
+            if best_config:
+                print(f"   ✓ Current best config: L{best_config['layer']} S{best_config['strength']:.2f}", flush=True)
+        except Exception as e:
+            print(f"   ⚠️ Failed to load checkpoint: {e}", flush=True)
+            completed_configs = set()
+    # Step 2: Test all configurations
+    print(f"\n🎯 Step 2: Testing {total_configs} configurations...", flush=True)
+    if completed_configs:
+        print(f"   ℹ️  Resuming from checkpoint - {len(completed_configs)} already done, {total_configs - len(completed_configs)} remaining", flush=True)
+    config_count = 0
+    # Initialize file for saving generation examples if requested
+    examples_file_path = None
+    if args.save_all_generation_examples:
+        os.makedirs(args.output_dir, exist_ok=True)
+        examples_file_path = os.path.join(args.output_dir, f"{trait_name}_all_generation_examples.jsonl")
+        # Write header line with metadata
+        with open(examples_file_path, "w") as f:
+            f.write(json.dumps({"_header": True, "trait": trait, "trait_name": trait_name, "model": args.model}) + "\n")
+        print(f"   📝 Will save generation examples to: {examples_file_path}", flush=True)
+    # Pre-generate baseline responses ONCE (they don't depend on any loop variables)
+    print("   📊 Pre-generating baseline responses for test prompts...", flush=True)
+    baseline_responses_cache = {}
+    for prompt in test_prompts:
+        baseline = model.generate(
+            [[{"role": "user", "content": prompt}]],
+            **get_generate_kwargs(max_new_tokens=args.max_new_tokens),
+            use_steering=False,
+        )[0]
+        baseline_responses_cache[prompt] = baseline
+    print(f"   ✓ Generated {len(baseline_responses_cache)} baseline responses", flush=True)
+    for token_agg in token_aggregations_to_test:
+        for prompt_const in prompt_constructions_to_test:
+            print(
+                f"\n   📊 Token Aggregation: {token_agg.value}, Prompt Construction: {prompt_const.value}", flush=True
+            )
+            for layer in layers_to_test:
+                layer_str = str(layer)
+                # Check if we already have activations for this (layer, token_agg) combo
+                cache_key = (layer, token_agg.value, prompt_const.value)
+                if cache_key not in steering_vector_cache:
+                    print(f"\n      📍 Layer {layer}: Collecting activations...", flush=True)
+                    # Collect activations for this layer with current token_agg and prompt_const
+                    pos_acts = []
+                    neg_acts = []
+                    for pair in pairs:
+                        updated_pair = collector.collect_for_pair(
+                            pair,
+                            layers=[layer_str],
+                            aggregation=token_agg,
+                            prompt_strategy=prompt_const,
+                            return_full_sequence=False,
+                            normalize_layers=False,
+                        )
+                        if (
+                            updated_pair.positive_response.layers_activations
+                            and layer_str in updated_pair.positive_response.layers_activations
+                        ):
+                            act = updated_pair.positive_response.layers_activations[layer_str]
+                            if act is not None:
+                                pos_acts.append(act)
+                        if (
+                            updated_pair.negative_response.layers_activations
+                            and layer_str in updated_pair.negative_response.layers_activations
+                        ):
+                            act = updated_pair.negative_response.layers_activations[layer_str]
+                            if act is not None:
+                                neg_acts.append(act)
+                    if len(pos_acts) == 0 or len(neg_acts) == 0:
+                        print(f"         ⚠️ No activations collected for layer {layer}", flush=True)
+                        steering_vector_cache[cache_key] = None
+                        continue
+                    print(
+                        f"         ✓ Collected {len(pos_acts)} positive, {len(neg_acts)} negative activations",
+                        flush=True,
+                    )
+                    # Create steering vector using selected method
+                    steering_method = create_steering_method("CAA", args)
+                    steering_vector = steering_method.train_for_layer(pos_acts, neg_acts)
+                    steering_vector_cache[cache_key] = steering_vector
+                    print(
+                        f"         ✓ Created steering vector (norm: {torch.norm(steering_vector).item():.4f})",
+                        flush=True,
+                    )
+                else:
+                    steering_vector = steering_vector_cache[cache_key]
+                    if steering_vector is None:
+                        continue
+                # Test different strengths and steering strategies
+                for strength in strengths_to_test:
+                    for steering_strategy in steering_strategies_to_test:
+                        config_count += 1
+                        config_key = f"L{layer}_S{strength:.2f}_St:{steering_strategy}_T:{token_agg.value}_P:{prompt_const.value}"
+                        # Skip if already completed (checkpoint resume)
+                        if config_key in completed_configs:
+                            print(f"      [{config_count}/{total_configs}] Skipping {config_key} (already done)", flush=True)
+                            continue
+                        config_desc = f"L{layer} S{strength:.2f} St:{steering_strategy} T:{token_agg.value} P:{prompt_const.value}"
+                        print(f"      [{config_count}/{total_configs}] Testing {config_desc}...", end=" ")
+                        # Create steering plan
+                        steering_vec = SteeringVector(vector=steering_vector, scale=float(strength))
+                        steering_plan = SteeringPlan(
+                            layers={layer_str: steering_vec}, layers_description=[f"Personalization {config_desc}"]
+                        )
+                        # Get baseline from cache and generate steered responses
+                        baseline_responses = [baseline_responses_cache[prompt] for prompt in test_prompts]
+                        steered_responses = []
+                        for prompt in test_prompts:
+                            # Generate steered response
+                            model.apply_steering(steering_plan)
+                            steered = model.generate(
+                                [[{"role": "user", "content": prompt}]],
+                                **get_generate_kwargs(max_new_tokens=args.max_new_tokens),
+                                use_steering=True,
+                                steering_plan=steering_plan,
+                            )[0]
+                            model.clear_steering()
+                            steered_responses.append(steered)
+                        # Evaluate using personalization metrics (static methods)
+                        # Calculate difference score
+                        difference_score = PersonalizationEvaluator._evaluate_difference(baseline_responses, steered_responses)
+                        # Calculate quality score
+                        quality_score = PersonalizationEvaluator._evaluate_quality(steered_responses)
+                        # Calculate alignment score using contrastive examples
+                        alignment_score = PersonalizationEvaluator.estimate_alignment(
+                            steered_responses, trait, positive_examples, negative_examples
+                        )
+                        # Calculate overall score (weighted average)
+                        # Only count if difference > 0.3 (steering is actually doing something)
+                        if difference_score < 0.3:
+                            overall_score = 0.0
+                        else:
+                            overall_score = 0.2 * difference_score + 0.3 * quality_score + 0.5 * alignment_score
+                        print(
+                            f"diff={difference_score:.2f} qual={quality_score:.2f} align={alignment_score:.2f} overall={overall_score:.2f}"
+                        )
+                        # Store results with full config key (config_key already defined above)
+                        all_results[config_key] = {
+                            "layer": layer,
+                            "strength": float(strength),
+                            "steering_strategy": steering_strategy,
+                            "token_aggregation": token_agg.value,
+                            "prompt_construction": prompt_const.value,
+                            "difference_score": float(difference_score),
+                            "quality_score": float(quality_score),
+                            "alignment_score": float(alignment_score),
+                            "overall_score": float(overall_score),
+                            "sample_baseline": baseline_responses[0][:200] if baseline_responses else "",
+                            "sample_steered": steered_responses[0][:200] if steered_responses else "",
+                        }
+                        # Save generation examples if requested
+                        if args.save_all_generation_examples and examples_file_path:
+                            example_record = {
+                                "layer": layer,
+                                "strength": float(strength),
+                                "steering_strategy": steering_strategy,
+                                "token_aggregation": token_agg.value,
+                                "prompt_construction": prompt_const.value,
+                                "overall_score": float(overall_score),
+                                "difference_score": float(difference_score),
+                                "quality_score": float(quality_score),
+                                "alignment_score": float(alignment_score),
+                                "examples": [
+                                    {
+                                        "prompt": test_prompts[i],
+                                        "baseline_response": baseline_responses[i],
+                                        "steered_response": steered_responses[i],
+                                    }
+                                    for i in range(len(test_prompts))
+                                ],
+                            }
+                            with open(examples_file_path, "a") as f:
+                                f.write(json.dumps(example_record) + "\n")
+                        # Track best configuration
+                        if overall_score > best_score:
+                            best_score = overall_score
+                            best_config = {
+                                "layer": layer,
+                                "strength": float(strength),
+                                "steering_strategy": steering_strategy,
+                                "token_aggregation": token_agg.value,
+                                "prompt_construction": prompt_const.value,
+                                "difference_score": float(difference_score),
+                                "quality_score": float(quality_score),
+                                "alignment_score": float(alignment_score),
+                                "overall_score": float(overall_score),
+                            }
+                            best_steering_vector = steering_vector
+                            print(f"         🏆 New best! L{layer} S{strength:.2f} score={overall_score:.4f}", flush=True)
+                        # Save checkpoint after each configuration (for resume capability)
+                        checkpoint_data = {
+                            "all_results": all_results,
+                            "best_config": best_config,
+                            "best_score": best_score,
+                            "config_count": config_count,
+                            "total_configs": total_configs,
+                            "trait": trait,
+                            "trait_name": trait_name,
+                            "model": args.model,
+                        }
+                        with open(checkpoint_file, "w") as f:
+                            json.dump(checkpoint_data, f)
+                        # Sync checkpoint to S3 every 100 configs for recovery
+                        if config_count % 100 == 0:
+                            try:
+                                import subprocess
+                                s3_checkpoint_path = f"s3://wisent-bucket/checkpoints/{trait_name}_checkpoint.json"
+                                subprocess.run(
+                                    ["aws", "s3", "cp", checkpoint_file, s3_checkpoint_path],
+                                    capture_output=True,
+                                    timeout=30
+                                )
+                            except Exception:
+                                pass  # Don't fail if S3 sync fails
+    # Step 3: Save results
+    print(f"\n{'=' * 80}")
+    print("📊 OPTIMIZATION COMPLETE")
+    print(f"{'=' * 80}")
+    vector_path = None
+    if best_config:
+        print("\n✅ Best Configuration:")
+        print(f"   Layer: {best_config['layer']}")
+        print(f"   Strength: {best_config['strength']:.2f}")
+        print(f"   Steering Strategy: {best_config['steering_strategy']}")
+        print(f"   Token Aggregation: {best_config['token_aggregation']}")
+        print(f"   Prompt Construction: {best_config['prompt_construction']}")
+        print(f"   Difference Score: {best_config['difference_score']:.3f}")
+        print(f"   Quality Score: {best_config['quality_score']:.3f}")
+        print(f"   Alignment Score: {best_config['alignment_score']:.3f}")
+        print(f"   Overall Score: {best_config['overall_score']:.3f}")
+        # Save best steering vector
+        vector_path = os.path.join(args.output_dir, "vectors", f"{trait_name}_optimal.pt")
+        torch.save(
+            {
+                "steering_vector": best_steering_vector,
+                "layer": best_config["layer"],
+                "layer_index": best_config["layer"],
+                "strength": best_config["strength"],
+                "steering_strategy": best_config["steering_strategy"],
+                "token_aggregation": best_config["token_aggregation"],
+                "prompt_construction": best_config["prompt_construction"],
+                "trait": trait,
+                "trait_name": trait_name,
+                "model": args.model,
+                "method": "CAA",
+                "optimization_scores": {
+                    "difference": best_config["difference_score"],
+                    "quality": best_config["quality_score"],
+                    "alignment": best_config["alignment_score"],
+                    "overall": best_config["overall_score"],
+                },
+            },
+            vector_path,
+        )
+        print(f"\n💾 Saved optimal steering vector to: {vector_path}")
+    else:
+        print("\n⚠️ No valid configuration found")
+    # Save full results to JSON
+    results_file = os.path.join(args.output_dir, f"{trait_name}_optimization_results.json")
+    # best_config doesn't have steering_vector anymore (it's in best_steering_vector)
+    best_config_json = best_config
+    output_data = {
+        "model": args.model,
+        "trait": trait,
+        "trait_name": trait_name,
+        "num_pairs": args.num_pairs,
+        "num_test_prompts": args.num_test_prompts,
+        "layers_tested": layers_to_test,
+        "strengths_tested": [float(s) for s in strengths_to_test],
+        "steering_strategies_tested": steering_strategies_to_test,
+        "token_aggregations_tested": [t.value for t in token_aggregations_to_test],
+        "prompt_constructions_tested": [p.value for p in prompt_constructions_to_test],
+        "best_config": best_config_json,
+        "all_results": all_results,
+    }
+    with open(results_file, "w") as f:
+        json.dump(output_data, f, indent=2)
+    print(f"💾 Saved full results to: {results_file}")
+    # Remove checkpoint file after successful completion
+    if os.path.exists(checkpoint_file):
+        os.remove(checkpoint_file)
+        print(f"🧹 Removed checkpoint file: {checkpoint_file}")
+    if args.save_all_generation_examples and examples_file_path:
+        print(f"💾 Generation examples saved iteratively to: {examples_file_path}")
+    # Print usage example
+    print("\n📝 Usage Example:")
+    if best_config:
+        print("   python -m wisent.core.main multi-steer \\")
+        print(f"       --vector {vector_path}:{best_config['strength']:.1f} \\")
+        print(f"       --model {args.model} \\")
+        print(f"       --layer {best_config['layer']} \\")
+        print('       --prompt "Your prompt here"')
+    print(f"\n{'=' * 80}\n")
+    return {
+        "action": "personalization",
+        "trait": trait,
+        "trait_name": trait_name,
+        "best_config": best_config_json,
+        "results_file": results_file,
+        "vector_path": vector_path if best_config else None,
+    }
+def execute_multi_personalization(args, model):
+    """
+    Execute multi-trait joint personalization optimization.
+    This finds a SINGLE optimal configuration (layer, token_aggregation, prompt_construction)
+    that works well for ALL traits, then optimizes strength per-trait individually.
+    The approach:
+    1. Generate synthetic contrastive pairs for each trait
+    2. For each (layer, token_agg, prompt_const) configuration:
+       - Compute steering vectors for ALL traits
+       - Find optimal strength for each trait individually
+       - Compute combined score = mean(trait_scores)
+    3. Select the configuration with highest combined score
+    4. Return: shared (layer, token_agg, prompt_const) + per-trait strength
+    """
+    import os
+    import torch
+    from wisent.core.activations.activations_collector import ActivationCollector
+    from wisent.core.activations.core.atoms import ActivationAggregationStrategy
+    from wisent.core.activations.prompt_construction_strategy import PromptConstructionStrategy
+    from wisent.core.evaluators.steering_evaluators import PersonalizationEvaluator
+    from wisent.core.models.core.atoms import SteeringPlan, SteeringVector
+    from wisent.core.cli.steering_method_trainer import create_steering_method
+    from wisent.core.synthetic.cleaners.pairs_cleaner import PairsCleaner
+    from wisent.core.synthetic.db_instructions.mini_dp import Default_DB_Instructions
+    from wisent.core.synthetic.generators.diversities.methods.fast_diversity import FastDiversity
+    from wisent.core.synthetic.generators.pairs_generator import SyntheticContrastivePairsGenerator
+    traits = args.traits
+    trait_names = args.trait_names or [t.split()[0].lower() for t in traits]
+    if len(trait_names) != len(traits):
+        print(f"Error: Number of --trait-name args ({len(trait_names)}) must match --trait args ({len(traits)})")
+        return None
+    print(f"\n{'=' * 80}", flush=True)
+    print("🎭 MULTI-TRAIT JOINT PERSONALIZATION OPTIMIZATION", flush=True)
+    print(f"{'=' * 80}", flush=True)
+    print(f"   Model: {args.model}", flush=True)
+    print(f"   Traits: {len(traits)}", flush=True)
+    for i, (trait, name) in enumerate(zip(traits, trait_names)):
+        print(f"      {i + 1}. {name}: {trait[:50]}...", flush=True)
+    print(f"   Num Pairs per trait: {args.num_pairs}", flush=True)
+    print(f"   Num Test Prompts: {args.num_test_prompts}", flush=True)
+    print(f"   Output Directory: {args.output_dir}", flush=True)
+    print(f"{'=' * 80}\n", flush=True)
+    # Create output directory
+    os.makedirs(args.output_dir, exist_ok=True)
+    os.makedirs(os.path.join(args.output_dir, "vectors"), exist_ok=True)
+    # Determine layers to test - default to middle 50% of layers where steering works best
+    if args.layers:
+        layers_to_test = args.layers
+    else:
+        num_layers = model.num_layers
+        # Test middle 50% of layers (e.g., layers 8-20 for a 28-layer model)
+        start_layer = max(1, num_layers // 4)
+        end_layer = min(num_layers, 3 * num_layers // 4)
+        layers_to_test = list(range(start_layer, end_layer + 1))
+    # Determine strengths to test
+    min_strength, max_strength = args.strength_range
+    strengths_to_test = np.linspace(min_strength, max_strength, args.num_strength_steps)
+    # Token aggregation strategies to test
+    token_aggregations_to_test = [
+        ActivationAggregationStrategy.LAST_TOKEN,
+        ActivationAggregationStrategy.MEAN_POOLING,
+        ActivationAggregationStrategy.FIRST_TOKEN,
+        ActivationAggregationStrategy.MAX_POOLING,
+    ]
+    # Prompt construction strategies to test
+    prompt_constructions_to_test = [
+        PromptConstructionStrategy.CHAT_TEMPLATE,
+        PromptConstructionStrategy.DIRECT_COMPLETION,
+        PromptConstructionStrategy.INSTRUCTION_FOLLOWING,
+        PromptConstructionStrategy.ROLE_PLAYING,
+        PromptConstructionStrategy.MULTIPLE_CHOICE,
+    ]
+    # Use a fixed steering strategy (initial_only works well for multi-trait)
+    steering_strategy = "initial_only"
+    total_shared_configs = len(layers_to_test) * len(token_aggregations_to_test) * len(prompt_constructions_to_test)
+    print("📊 Search Space:", flush=True)
+    print(f"   Shared configs (layer × token_agg × prompt_const): {total_shared_configs}", flush=True)
+    print(f"   Strengths per trait: {len(strengths_to_test)}", flush=True)
+    print(f"   Steering strategy: {steering_strategy} (fixed)", flush=True)
+    print("\n", flush=True)
+    # Step 1: Generate synthetic contrastive pairs for each trait
+    print(f"🔧 Step 1: Generating synthetic pairs for {len(traits)} traits...", flush=True)
+    trait_pairs = {}
+    for trait, name in zip(traits, trait_names):
+        print(f"\n   Generating pairs for '{name}'...", flush=True)
+        cleaner = PairsCleaner(steps=[])
+        diversity = FastDiversity()
+        db_instructions = Default_DB_Instructions()
+        pair_generator = SyntheticContrastivePairsGenerator(
+            model=model,
+            generation_config=get_generate_kwargs(max_new_tokens=150),
+            contrastive_set_name=f"{name}_pairs",
+            trait_description=trait,
+            trait_label=name,
+            db_instructions=db_instructions,
+            cleaner=cleaner,
+            diversity=diversity,
+        )
+        pair_set, _ = pair_generator.generate(num_pairs=args.num_pairs)
+        trait_pairs[name] = {"trait": trait, "pairs": pair_set.pairs}
+        print(f"      ✓ Generated {len(pair_set.pairs)} pairs for '{name}'", flush=True)
+    # Test prompts for evaluation
+    test_prompts = [
+        "What's your favorite food?",
+        "How do you spend your weekends?",
+        "What motivates you in life?",
+        "How do you handle setbacks?",
+        "What's your opinion on teamwork?",
+    ][: args.num_test_prompts]
+    print(f"\n📝 Test prompts: {test_prompts}", flush=True)
+    # Initialize collector
+    collector = ActivationCollector(model=model, store_device="cpu")
+    # Track results
+    all_results = {}
+    best_shared_config = None
+    best_combined_score = -1.0
+    best_per_trait_strengths = {}
+    best_steering_vectors = {}
+    best_overall_sample_responses = []
+    # Step 2: Test each shared configuration
+    print(f"\n🎯 Step 2: Testing {total_shared_configs} shared configurations...", flush=True)
+    config_count = 0
+    for token_agg in token_aggregations_to_test:
+        for prompt_const in prompt_constructions_to_test:
+            for layer in layers_to_test:
+                config_count += 1
+                layer_str = str(layer)
+                shared_config_key = f"L{layer}_T:{token_agg.value}_P:{prompt_const.value}"
+                print(f"\n[{config_count}/{total_shared_configs}] {shared_config_key}", flush=True)
+                # Compute steering vectors for each trait with this config
+                trait_vectors = {}
+                for name, data in trait_pairs.items():
+                    pairs = data["pairs"]
+                    pos_acts = []
+                    neg_acts = []
+                    for pair in pairs:
+                        updated_pair = collector.collect_for_pair(
+                            pair,
+                            layers=[layer_str],
+                            aggregation=token_agg,
+                            prompt_strategy=prompt_const,
+                            return_full_sequence=False,
+                            normalize_layers=False,
+                        )
+                        if (
+                            updated_pair.positive_response.layers_activations
+                            and layer_str in updated_pair.positive_response.layers_activations
+                        ):
+                            act = updated_pair.positive_response.layers_activations[layer_str]
+                            if act is not None:
+                                pos_acts.append(act)
+                        if (
+                            updated_pair.negative_response.layers_activations
+                            and layer_str in updated_pair.negative_response.layers_activations
+                        ):
+                            act = updated_pair.negative_response.layers_activations[layer_str]
+                            if act is not None:
+                                neg_acts.append(act)
+                    if len(pos_acts) == 0 or len(neg_acts) == 0:
+                        print(f"      ⚠️ No activations for '{name}' - skipping config", flush=True)
+                        trait_vectors = None
+                        break
+                    steering_method = create_steering_method("CAA", args)
+                    steering_vector = steering_method.train_for_layer(pos_acts, neg_acts)
+                    trait_vectors[name] = steering_vector
+                if trait_vectors is None:
+                    continue
+                # Use Latin Hypercube Sampling to efficiently explore strength space
+                # Instead of testing all N^T combinations, sample ~20 representative points
+                import random
+                from itertools import product
+                best_combined_score_for_config = -1.0
+                best_strengths_for_config = dict.fromkeys(trait_names, strengths_to_test[0])
+                best_sample_responses = []
+                # Generate strength combinations - use sampling to reduce search space
+                all_strength_combos = list(product(strengths_to_test, repeat=len(trait_names)))
+                # If too many combinations, sample a subset that includes edges and random middle points
+                max_samples = 25  # Test at most 25 combinations per config
+                if len(all_strength_combos) > max_samples:
+                    # Always include corner cases (min, max for each trait)
+                    corners = [
+                        tuple([strengths_to_test[0]] * len(trait_names)),  # All min
+                        tuple([strengths_to_test[-1]] * len(trait_names)),  # All max
+                        tuple([strengths_to_test[len(strengths_to_test) // 2]] * len(trait_names)),  # All mid
+                    ]
+                    # Add some diagonal samples
+                    for i in range(len(strengths_to_test)):
+                        corners.append(tuple([strengths_to_test[i]] * len(trait_names)))
+                    # Randomly sample the rest
+                    remaining = max_samples - len(set(corners))
+                    random.seed(42)  # For reproducibility
+                    other_combos = [c for c in all_strength_combos if c not in corners]
+                    sampled = random.sample(other_combos, min(remaining, len(other_combos)))
+                    strength_combos = list(set(corners)) + sampled
+                else:
+                    strength_combos = all_strength_combos
+                # Pre-generate baseline responses ONCE per config (they don't depend on steering)
+                baseline_responses = []
+                for prompt in test_prompts:
+                    baseline = model.generate(
+                        [[{"role": "user", "content": prompt}]],
+                        **get_generate_kwargs(max_new_tokens=args.max_new_tokens),
+                        use_steering=False,
+                    )[0]
+                    baseline_responses.append(baseline)
+                num_strength_combos = len(strength_combos)
+                for combo_idx, strength_combo in enumerate(strength_combos):
+                    if args.verbose and combo_idx % 5 == 0:
+                        import sys
+                        sys.stdout.write(f"\r      Testing strength {combo_idx + 1}/{num_strength_combos}...")
+                        sys.stdout.flush()
+                    current_strengths = dict(zip(trait_names, strength_combo))
+                    # Create COMBINED steering plan with ALL vectors at once
+                    combined_vector = None
+                    for name, strength in current_strengths.items():
+                        scaled_vector = trait_vectors[name] * float(strength)
+                        if combined_vector is None:
+                            combined_vector = scaled_vector.clone()
+                        else:
+                            combined_vector = combined_vector + scaled_vector
+                    steering_vec = SteeringVector(vector=combined_vector, scale=1.0)
+                    steering_plan = SteeringPlan(
+                        layers={layer_str: steering_vec},
+                        layers_description=[f"Multi-trait combined: {'+'.join(trait_names)}"],
+                    )
+                    # Generate only steered responses (baselines were pre-generated)
+                    steered_responses = []
+                    for prompt in test_prompts:
+                        model.apply_steering(steering_plan)
+                        steered = model.generate(
+                            [[{"role": "user", "content": prompt}]],
+                            **get_generate_kwargs(max_new_tokens=args.max_new_tokens),
+                            use_steering=True,
+                            steering_plan=steering_plan,
+                        )[0]
+                        model.clear_steering()
+                        steered_responses.append(steered)
+                    # Evaluate combined output against ALL traits together (static methods)
+                    difference_score = PersonalizationEvaluator._evaluate_difference(baseline_responses, steered_responses)
+                    quality_score = PersonalizationEvaluator._evaluate_quality(steered_responses)
+                    # Compute alignment score against COMBINED trait description
+                    # For multi-trait, combine positive/negative examples from all traits
+                    combined_trait_description = " AND ".join([trait_pairs[name]["trait"] for name in trait_names])
+                    all_positive_examples = []
+                    all_negative_examples = []
+                    for name in trait_names:
+                        all_positive_examples.extend([p.positive_response.model_response for p in trait_pairs[name]["pairs"]])
+                        all_negative_examples.extend([p.negative_response.model_response for p in trait_pairs[name]["pairs"]])
+                    alignment_score = PersonalizationEvaluator.estimate_alignment(
+                        steered_responses, combined_trait_description, all_positive_examples, all_negative_examples
+                    )
+                    if difference_score < 0.3:
+                        overall_score = 0.0
+                    else:
+                        overall_score = 0.2 * difference_score + 0.3 * quality_score + 0.5 * alignment_score
+                    if overall_score > best_combined_score_for_config:
+                        best_combined_score_for_config = overall_score
+                        best_strengths_for_config = current_strengths.copy()
+                        best_sample_responses = list(zip(test_prompts, baseline_responses, steered_responses))
+                # Store per-trait strengths from best combo
+                trait_best_strengths = best_strengths_for_config
+                combined_score = best_combined_score_for_config
+                print(
+                    f"      Strengths: {', '.join([f'{n}={s:.2f}' for n, s in trait_best_strengths.items()])}",
+                    flush=True,
+                )
+                print(f"      → Combined score (all traits at once): {combined_score:.3f}", flush=True)
+                # Show sample responses for this config
+                if best_sample_responses and args.verbose:
+                    print("\n      📝 Sample responses:", flush=True)
+                    for prompt, baseline, steered in best_sample_responses[:2]:
+                        print(f"         Prompt: {prompt}", flush=True)
+                        print(f"         Baseline: {baseline[:100]}...", flush=True)
+                        print(f"         Steered:  {steered[:100]}...", flush=True)
+                # Store result
+                all_results[shared_config_key] = {
+                    "layer": layer,
+                    "token_aggregation": token_agg.value,
+                    "prompt_construction": prompt_const.value,
+                    "steering_strategy": steering_strategy,
+                    "per_trait_strengths": trait_best_strengths,
+                    "combined_score": float(combined_score),
+                    "sample_responses": [
+                        {"prompt": p, "baseline": b, "steered": s} for p, b, s in best_sample_responses
+                    ]
+                    if best_sample_responses
+                    else [],
+                }
+                if combined_score > best_combined_score:
+                    best_combined_score = combined_score
+                    best_shared_config = {
+                        "layer": layer,
+                        "token_aggregation": token_agg.value,
+                        "prompt_construction": prompt_const.value,
+                        "steering_strategy": steering_strategy,
+                    }
+                    best_per_trait_strengths = trait_best_strengths.copy()
+                    best_steering_vectors = {name: v.clone() for name, v in trait_vectors.items()}
+                    best_overall_sample_responses = best_sample_responses.copy() if best_sample_responses else []
+    # Step 3: Save results
+    print(f"\n{'=' * 80}", flush=True)
+    print("📊 MULTI-TRAIT OPTIMIZATION COMPLETE", flush=True)
+    print(f"{'=' * 80}", flush=True)
+    vector_paths = {}
+    if best_shared_config:
+        print("\n✅ Best Shared Configuration:", flush=True)
+        print(f"   Layer: {best_shared_config['layer']}", flush=True)
+        print(f"   Token Aggregation: {best_shared_config['token_aggregation']}", flush=True)
+        print(f"   Prompt Construction: {best_shared_config['prompt_construction']}", flush=True)
+        print(f"   Steering Strategy: {best_shared_config['steering_strategy']}", flush=True)
+        print("\n✅ Per-Trait Optimal Strengths:", flush=True)
+        for name, strength in best_per_trait_strengths.items():
+            print(f"      {name}: {strength:.2f}", flush=True)
+        print(f"\n   Combined Score: {best_combined_score:.3f}", flush=True)
+        # Print sample responses from the best configuration
+        if best_overall_sample_responses:
+            print(f"\n{'=' * 80}", flush=True)
+            print("📝 SAMPLE RESPONSES (Best Configuration)", flush=True)
+            print(f"{'=' * 80}", flush=True)
+            for prompt, baseline, steered in best_overall_sample_responses:
+                print(f"\n   🗣️ Prompt: {prompt}", flush=True)
+                print("\n   📄 Baseline Response:", flush=True)
+                print(f"      {baseline}", flush=True)
+                print("\n   🎯 Steered Response (evil + italian):", flush=True)
+                print(f"      {steered}", flush=True)
+                print(f"\n   {'-' * 70}", flush=True)
+        # Save steering vectors for each trait
+        for name in trait_names:
+            vector_path = os.path.join(args.output_dir, "vectors", f"{name}_optimal.pt")
+            torch.save(
+                {
+                    "steering_vector": best_steering_vectors[name],
+                    "layer": best_shared_config["layer"],
+                    "layer_index": best_shared_config["layer"],
+                    "strength": best_per_trait_strengths[name],
+                    "steering_strategy": best_shared_config["steering_strategy"],
+                    "token_aggregation": best_shared_config["token_aggregation"],
+                    "prompt_construction": best_shared_config["prompt_construction"],
+                    "trait": trait_pairs[name]["trait"],
+                    "trait_name": name,
+                    "model": args.model,
+                    "method": "CAA",
+                    "multi_trait_optimization": True,
+                },
+                vector_path,
+            )
+            vector_paths[name] = vector_path
+            print(f"\n💾 Saved {name} vector to: {vector_path}", flush=True)
+    else:
+        print("\n⚠️ No valid configuration found", flush=True)
+    # Save full results
+    results_file = os.path.join(args.output_dir, "multi_trait_optimization_results.json")
+    output_data = {
+        "model": args.model,
+        "traits": {name: trait_pairs[name]["trait"] for name in trait_names},
+        "num_pairs_per_trait": args.num_pairs,
+        "num_test_prompts": args.num_test_prompts,
+        "layers_tested": layers_to_test,
+        "strengths_tested": [float(s) for s in strengths_to_test],
+        "token_aggregations_tested": [t.value for t in token_aggregations_to_test],
+        "prompt_constructions_tested": [p.value for p in prompt_constructions_to_test],
+        "best_shared_config": best_shared_config,
+        "best_per_trait_strengths": best_per_trait_strengths,
+        "best_combined_score": best_combined_score,
+        "best_sample_responses": [
+            {"prompt": p, "baseline": b, "steered": s} for p, b, s in best_overall_sample_responses
+        ]
+        if best_overall_sample_responses
+        else [],
+        "all_results": all_results,
+    }
+    with open(results_file, "w") as f:
+        json.dump(output_data, f, indent=2)
+    print(f"\n💾 Saved full results to: {results_file}", flush=True)
+    # Print usage example
+    if best_shared_config and vector_paths:
+        print("\n📝 Usage Example:", flush=True)
+        print("   python -m wisent.core.main multi-steer \\", flush=True)
+        for name in trait_names:
+            print(f"       --vector {vector_paths[name]}:{best_per_trait_strengths[name]:.1f} \\", flush=True)
+        print(f"       --model {args.model} \\", flush=True)
+        print(f"       --layer {best_shared_config['layer']} \\", flush=True)
+        print('       --prompt "Your prompt here"', flush=True)
+    print(f"\n{'=' * 80}\n", flush=True)
+    return {
+        "action": "multi-personalization",
+        "traits": trait_names,
+        "best_shared_config": best_shared_config,
+        "best_per_trait_strengths": best_per_trait_strengths,
+        "best_combined_score": best_combined_score,
+        "results_file": results_file,
+        "vector_paths": vector_paths,
+    }