wisent 0.1.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (237)
  1. wisent/__init__.py +1 -8
  2. wisent/benchmarks/__init__.py +0 -0
  3. wisent/benchmarks/coding/__init__.py +0 -0
  4. wisent/benchmarks/coding/metrics/__init__.py +0 -0
  5. wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
  6. wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
  7. wisent/benchmarks/coding/metrics/evaluator.py +275 -0
  8. wisent/benchmarks/coding/metrics/passk.py +66 -0
  9. wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
  10. wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
  11. wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
  12. wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  13. wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
  14. wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
  15. wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
  16. wisent/benchmarks/coding/providers/__init__.py +18 -0
  17. wisent/benchmarks/coding/providers/core/__init__.py +0 -0
  18. wisent/benchmarks/coding/providers/core/atoms.py +31 -0
  19. wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
  20. wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
  21. wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
  22. wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
  23. wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
  24. wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
  25. wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
  26. wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
  27. wisent/classifiers/__init__.py +0 -0
  28. wisent/classifiers/core/__init__.py +0 -0
  29. wisent/classifiers/core/atoms.py +747 -0
  30. wisent/classifiers/models/__init__.py +0 -0
  31. wisent/classifiers/models/logistic.py +29 -0
  32. wisent/classifiers/models/mlp.py +47 -0
  33. wisent/cli/__init__.py +0 -0
  34. wisent/cli/classifiers/__init__.py +0 -0
  35. wisent/cli/classifiers/classifier_rotator.py +137 -0
  36. wisent/cli/cli_logger.py +142 -0
  37. wisent/cli/data_loaders/__init__.py +0 -0
  38. wisent/cli/data_loaders/data_loader_rotator.py +96 -0
  39. wisent/cli/evaluators/__init__.py +0 -0
  40. wisent/cli/evaluators/evaluator_rotator.py +148 -0
  41. wisent/cli/steering_methods/__init__.py +0 -0
  42. wisent/cli/steering_methods/steering_rotator.py +110 -0
  43. wisent/cli/wisent_cli/__init__.py +0 -0
  44. wisent/cli/wisent_cli/commands/__init__.py +0 -0
  45. wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
  46. wisent/cli/wisent_cli/commands/listing.py +154 -0
  47. wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
  48. wisent/cli/wisent_cli/main.py +93 -0
  49. wisent/cli/wisent_cli/shell.py +80 -0
  50. wisent/cli/wisent_cli/ui.py +69 -0
  51. wisent/cli/wisent_cli/util/__init__.py +0 -0
  52. wisent/cli/wisent_cli/util/aggregations.py +43 -0
  53. wisent/cli/wisent_cli/util/parsing.py +126 -0
  54. wisent/cli/wisent_cli/version.py +4 -0
  55. wisent/core/__init__.py +27 -0
  56. wisent/core/activations/__init__.py +0 -0
  57. wisent/core/activations/activations_collector.py +338 -0
  58. wisent/core/activations/core/__init__.py +0 -0
  59. wisent/core/activations/core/atoms.py +216 -0
  60. wisent/core/agent/__init__.py +18 -0
  61. wisent/core/agent/budget.py +638 -0
  62. wisent/core/agent/device_benchmarks.py +685 -0
  63. wisent/core/agent/diagnose/__init__.py +55 -0
  64. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  65. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  66. wisent/core/agent/diagnose/create_classifier.py +1154 -0
  67. wisent/core/agent/diagnose/response_diagnostics.py +268 -0
  68. wisent/core/agent/diagnose/select_classifiers.py +506 -0
  69. wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
  70. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  71. wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
  72. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  73. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  74. wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
  75. wisent/core/agent/diagnose.py +242 -0
  76. wisent/core/agent/steer.py +212 -0
  77. wisent/core/agent/timeout.py +134 -0
  78. wisent/core/autonomous_agent.py +1234 -0
  79. wisent/core/bigcode_integration.py +583 -0
  80. wisent/core/contrastive_pairs/__init__.py +15 -0
  81. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  82. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  83. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  84. wisent/core/contrastive_pairs/core/pair.py +178 -0
  85. wisent/core/contrastive_pairs/core/response.py +152 -0
  86. wisent/core/contrastive_pairs/core/serialization.py +300 -0
  87. wisent/core/contrastive_pairs/core/set.py +133 -0
  88. wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
  89. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  90. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  91. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
  92. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  93. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  94. wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
  95. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  96. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
  102. wisent/core/data_loaders/__init__.py +0 -0
  103. wisent/core/data_loaders/core/__init__.py +0 -0
  104. wisent/core/data_loaders/core/atoms.py +98 -0
  105. wisent/core/data_loaders/loaders/__init__.py +0 -0
  106. wisent/core/data_loaders/loaders/custom.py +120 -0
  107. wisent/core/data_loaders/loaders/lm_loader.py +218 -0
  108. wisent/core/detection_handling.py +257 -0
  109. wisent/core/download_full_benchmarks.py +1386 -0
  110. wisent/core/evaluators/__init__.py +0 -0
  111. wisent/core/evaluators/oracles/__init__.py +0 -0
  112. wisent/core/evaluators/oracles/interactive.py +73 -0
  113. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  114. wisent/core/evaluators/oracles/user_specified.py +67 -0
  115. wisent/core/hyperparameter_optimizer.py +429 -0
  116. wisent/core/lm_eval_harness_ground_truth.py +1396 -0
  117. wisent/core/log_likelihoods_evaluator.py +321 -0
  118. wisent/core/managed_cached_benchmarks.py +595 -0
  119. wisent/core/mixed_benchmark_sampler.py +364 -0
  120. wisent/core/model_config_manager.py +330 -0
  121. wisent/core/model_persistence.py +317 -0
  122. wisent/core/models/__init__.py +0 -0
  123. wisent/core/models/core/__init__.py +0 -0
  124. wisent/core/models/core/atoms.py +460 -0
  125. wisent/core/models/wisent_model.py +727 -0
  126. wisent/core/multi_steering.py +316 -0
  127. wisent/core/optuna/__init__.py +57 -0
  128. wisent/core/optuna/classifier/__init__.py +25 -0
  129. wisent/core/optuna/classifier/activation_generator.py +349 -0
  130. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  131. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
  132. wisent/core/optuna/steering/__init__.py +0 -0
  133. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
  134. wisent/core/optuna/steering/data_utils.py +342 -0
  135. wisent/core/optuna/steering/metrics.py +474 -0
  136. wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
  137. wisent/core/optuna/steering/steering_optimization.py +1111 -0
  138. wisent/core/parser.py +1668 -0
  139. wisent/core/prompts/__init__.py +0 -0
  140. wisent/core/prompts/core/__init__.py +0 -0
  141. wisent/core/prompts/core/atom.py +57 -0
  142. wisent/core/prompts/core/prompt_formater.py +157 -0
  143. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  144. wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
  145. wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
  146. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
  147. wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
  148. wisent/core/representation.py +5 -0
  149. wisent/core/sample_size_optimizer.py +648 -0
  150. wisent/core/sample_size_optimizer_v2.py +355 -0
  151. wisent/core/save_results.py +277 -0
  152. wisent/core/steering.py +652 -0
  153. wisent/core/steering_method.py +26 -0
  154. wisent/core/steering_methods/__init__.py +0 -0
  155. wisent/core/steering_methods/core/__init__.py +0 -0
  156. wisent/core/steering_methods/core/atoms.py +153 -0
  157. wisent/core/steering_methods/methods/__init__.py +0 -0
  158. wisent/core/steering_methods/methods/caa.py +44 -0
  159. wisent/core/steering_optimizer.py +1297 -0
  160. wisent/core/task_interface.py +132 -0
  161. wisent/core/task_selector.py +189 -0
  162. wisent/core/tasks/__init__.py +175 -0
  163. wisent/core/tasks/aime_task.py +141 -0
  164. wisent/core/tasks/file_task.py +211 -0
  165. wisent/core/tasks/hle_task.py +180 -0
  166. wisent/core/tasks/hmmt_task.py +119 -0
  167. wisent/core/tasks/livecodebench_task.py +201 -0
  168. wisent/core/tasks/livemathbench_task.py +158 -0
  169. wisent/core/tasks/lm_eval_task.py +455 -0
  170. wisent/core/tasks/math500_task.py +84 -0
  171. wisent/core/tasks/polymath_task.py +146 -0
  172. wisent/core/tasks/supergpqa_task.py +220 -0
  173. wisent/core/time_estimator.py +149 -0
  174. wisent/core/timing_calibration.py +174 -0
  175. wisent/core/tracking/__init__.py +54 -0
  176. wisent/core/tracking/latency.py +618 -0
  177. wisent/core/tracking/memory.py +359 -0
  178. wisent/core/trainers/__init__.py +0 -0
  179. wisent/core/trainers/core/__init__.py +11 -0
  180. wisent/core/trainers/core/atoms.py +45 -0
  181. wisent/core/trainers/steering_trainer.py +271 -0
  182. wisent/core/user_model_config.py +158 -0
  183. wisent/opti/__init__.py +0 -0
  184. wisent/opti/core/__init__.py +0 -0
  185. wisent/opti/core/atoms.py +175 -0
  186. wisent/opti/methods/__init__.py +0 -0
  187. wisent/opti/methods/opti_classificator.py +172 -0
  188. wisent/opti/methods/opti_steering.py +138 -0
  189. wisent/synthetic/__init__.py +0 -0
  190. wisent/synthetic/cleaners/__init__.py +0 -0
  191. wisent/synthetic/cleaners/core/__init__.py +0 -0
  192. wisent/synthetic/cleaners/core/atoms.py +58 -0
  193. wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
  194. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  195. wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
  196. wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
  197. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  198. wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
  199. wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
  200. wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
  201. wisent/synthetic/db_instructions/__init__.py +0 -0
  202. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  203. wisent/synthetic/db_instructions/core/atoms.py +25 -0
  204. wisent/synthetic/db_instructions/mini_dp.py +37 -0
  205. wisent/synthetic/generators/__init__.py +0 -0
  206. wisent/synthetic/generators/core/__init__.py +0 -0
  207. wisent/synthetic/generators/core/atoms.py +73 -0
  208. wisent/synthetic/generators/diversities/__init__.py +0 -0
  209. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  210. wisent/synthetic/generators/diversities/core/core.py +68 -0
  211. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  212. wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  213. wisent/synthetic/generators/pairs_generator.py +179 -0
  214. wisent-0.5.1.dist-info/METADATA +67 -0
  215. wisent-0.5.1.dist-info/RECORD +218 -0
  216. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/WHEEL +1 -1
  217. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info/licenses}/LICENSE +2 -2
  218. wisent/activations/__init__.py +0 -9
  219. wisent/activations/client.py +0 -97
  220. wisent/activations/extractor.py +0 -251
  221. wisent/activations/models.py +0 -95
  222. wisent/client.py +0 -45
  223. wisent/control_vector/__init__.py +0 -9
  224. wisent/control_vector/client.py +0 -85
  225. wisent/control_vector/manager.py +0 -168
  226. wisent/control_vector/models.py +0 -70
  227. wisent/inference/__init__.py +0 -9
  228. wisent/inference/client.py +0 -103
  229. wisent/inference/inferencer.py +0 -250
  230. wisent/inference/models.py +0 -66
  231. wisent/utils/__init__.py +0 -3
  232. wisent/utils/auth.py +0 -30
  233. wisent/utils/http.py +0 -228
  234. wisent/version.py +0 -3
  235. wisent-0.1.1.dist-info/METADATA +0 -142
  236. wisent-0.1.1.dist-info/RECORD +0 -23
  237. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/top_level.txt +0 -0
wisent/core/optuna/steering/steering_optimization.py
@@ -0,0 +1,1111 @@
+"""
+Steering optimization module for improving benchmark performance.
+
+This module handles training and optimizing different steering methods that can
+improve model performance on benchmarks by steering internal activations.
+"""
+
+import logging
+import traceback
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+from tqdm import tqdm
+
+from wisent_guard.core.activations.core import ActivationAggregationStrategy
+from wisent_guard.core.classifier.classifier import Classifier
+from wisent_guard.core.contrastive_pairs.contrastive_pair import ContrastivePair
+from wisent_guard.core.contrastive_pairs.contrastive_pair_set import ContrastivePairSet
+from wisent_guard.core.optuna.classifier import (
+    CacheConfig,
+    ClassifierCache,
+    ClassifierOptimizationConfig,
+    GenerationConfig,
+    OptunaClassifierOptimizer,
+)
+from wisent_guard.core.optuna.steering import data_utils, metrics
+from wisent_guard.core.response import Response
+from wisent_guard.core.steering_methods.dac import DAC
+from wisent_guard.core.task_interface import get_task
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class SteeringMethodConfig(ABC):
+    """Base configuration for steering methods."""
+
+    method_name: str = "base"
+    layers: List[int] = None
+    strengths: List[float] = None
+
+    def __post_init__(self):
+        if self.layers is None:
+            self.layers = []
+        if self.strengths is None:
+            self.strengths = [1.0]
+
+
+@dataclass
+class DACConfig(SteeringMethodConfig):
+    """Configuration for DAC (Dynamic Activation Composition) steering method."""
+
+    method_name: str = "dac"
+    entropy_thresholds: List[float] = None
+    ptop_values: List[float] = None
+    max_alpha_values: List[float] = None
+
+    def __post_init__(self):
+        super().__post_init__()
+        if self.entropy_thresholds is None:
+            self.entropy_thresholds = [1.0]
+        if self.ptop_values is None:
+            self.ptop_values = [0.4]
+        if self.max_alpha_values is None:
+            self.max_alpha_values = [2.0]
+
+
+@dataclass
+class SteeringResult:
+    """Results from training and evaluating a steering method configuration."""
+
+    method_name: str
+    layer: int
+    hyperparameters: Dict[str, Any]
+    benchmark_metrics: Dict[str, float]
+    training_success: bool
+    training_stats: Dict[str, Any] = None
+    baseline_metrics: Dict[str, float] = None
+    comparative_metrics: Dict[str, Any] = None
+
+
+class SteeringMethodTrainer(ABC):
+    """Abstract base class for training different steering methods."""
+
+    @abstractmethod
+    def create_method_instance(self, hyperparams: Dict[str, Any], device: str) -> Any:
+        """Create an instance of the steering method with given hyperparameters."""
+
+    @abstractmethod
+    def train_method(
+        self,
+        method_instance: Any,
+        train_samples: List[Dict],
+        layer: int,
+        model,
+        tokenizer,
+        device: str,
+        task_name: str = "gsm8k",
+        max_new_tokens: int = 200,
+    ) -> Tuple[bool, Dict[str, Any]]:
+        """Train the steering method on training data."""
+
+    @abstractmethod
+    def apply_steering_and_evaluate(
+        self,
+        method_instance: Any,
+        evaluation_samples: List[Dict],
+        layer: int,
+        strength: float,
+        model,
+        tokenizer,
+        device: str,
+        batch_size: int,
+        max_length: int,
+        task_name: str = "gsm8k",
+        max_new_tokens: int = 200,
+    ) -> Tuple[List[str], List[str]]:
+        """Apply steering and generate predictions for evaluation."""
+
+
+class DACTrainer(SteeringMethodTrainer):
+    """Trainer for DAC (Dynamic Activation Composition) steering method."""
+
+    def create_method_instance(self, hyperparams: Dict[str, Any], device: str) -> DAC:
+        """Create DAC instance with specified hyperparameters."""
+        return DAC(
+            device=device,
+            dynamic_control=True,
+            entropy_threshold=hyperparams.get("entropy_threshold", 1.0),
+            ptop=hyperparams.get("ptop", 0.4),
+            max_alpha=hyperparams.get("max_alpha", 2.0),
+        )
+
+    def train_method(
+        self,
+        dac_instance: DAC,
+        train_samples: List[Dict],
+        layer: int,
+        model,
+        tokenizer,
+        device: str,
+        task_name: str = "gsm8k",
+        max_new_tokens: int = 200,
+    ) -> Tuple[bool, Dict[str, Any]]:
+        """Train DAC on training data to create steering vectors."""
+        try:
+            # Set model reference for KL computation
+            dac_instance.set_model_reference(model)
+
+            # Extract contrastive pairs from training data using task's extractor
+            contrastive_pairs = data_utils.get_task_contrastive_pairs(train_samples, task_name)
+
+            if not contrastive_pairs:
+                logger.warning(f"No contrastive pairs extracted from {task_name} training data")
+                return False, {"error": "No contrastive pairs"}
+
+            # Convert to ContrastivePairSet format
+            pair_set = self._create_pair_set_from_extracted_pairs(contrastive_pairs, layer, model, tokenizer, device)
+
+            # Train DAC
+            training_result = dac_instance.train(pair_set, layer)
+
+            success = training_result.get("success", False)
+            logger.debug(f"DAC training on layer {layer}: {'Success' if success else 'Failed'}")
+
+            return success, training_result
+
+        except Exception as e:
+            logger.error(f"DAC training failed on layer {layer}: {e}")
+            return False, {"error": str(e)}
+
+    def apply_steering_and_evaluate(
+        self,
+        dac_instance: DAC,
+        evaluation_samples: List[Dict],
+        layer: int,
+        strength: float,
+        model,
+        tokenizer,
+        device: str,
+        batch_size: int,
+        max_length: int,
+        task_name: str = "gsm8k",
+        max_new_tokens: int = 200,
+    ) -> Tuple[List[str], List[str]]:
+        """Apply DAC steering and generate predictions using task extractor."""
+
+        predictions = []
+        ground_truths = []
+
+        # Get the task and its extractor
+        task = get_task(task_name)
+        extractor = task.get_extractor()
+
+        # Pre-extract all questions and answers (optimization)
+        questions = []
+        answers = []
+
+        for sample in evaluation_samples:
+            qa_pair = extractor.extract_qa_pair(sample, task)
+            if not qa_pair:
+                logger.warning(f"Skipping sample - extractor couldn't extract QA pair: {sample.keys()}")
+                continue
+            questions.append(qa_pair["formatted_question"])
+            answers.append(qa_pair["correct_answer"])
+
+        # Process questions with steering in batches (optimized approach)
+        ground_truths.extend(answers)
+
+        # Handle different model architectures
+        if hasattr(model, "model") and hasattr(model.model, "layers"):
+            # LLaMA-style models
+            layer_module = model.model.layers[layer]
+        elif hasattr(model, "transformer") and hasattr(model.transformer, "h"):
+            # GPT2-style models
+            layer_module = model.transformer.h[layer]
+        else:
+            raise ValueError("Unsupported model architecture for DAC steering")
+
+        # Process in batches with steering
+        for i in tqdm(range(0, len(questions), batch_size), desc="Generating predictions with steering"):
+            batch_questions = questions[i : i + batch_size]
+
+            # First, get actual lengths (before padding) for proper steering
+            actual_lengths = []
+            for question in batch_questions:
+                tokens = tokenizer(question, return_tensors="pt")
+                actual_lengths.append(tokens["input_ids"].shape[1])
+
+            # Create batched steering hook that handles variable lengths
+            def create_batched_steering_hook(actual_lengths):
+                def steering_hook(module, input, output):
+                    hidden_states = output[0]  # [batch_size, seq_len, hidden_dim]
+
+                    # Apply steering to each sample's actual last token
+                    for j, actual_length in enumerate(actual_lengths):
+                        if j < hidden_states.shape[0]:  # Safety check for batch size
+                            # Get the actual last token (before padding)
+                            last_token = hidden_states[j : j + 1, actual_length - 1 : actual_length, :]
+                            steered = dac_instance.apply_steering(last_token, strength=strength)
+                            hidden_states[j : j + 1, actual_length - 1 : actual_length, :] = steered
+
+                    return (hidden_states,) + output[1:]
+
+                return steering_hook
+
+            # Register the batched hook
+            batched_hook = create_batched_steering_hook(actual_lengths)
+            handle = layer_module.register_forward_hook(batched_hook)
+
+            try:
+                # Tokenize batch with padding for generation
+                inputs = tokenizer(
+                    batch_questions, return_tensors="pt", padding=True, truncation=True, max_length=max_length
+                ).to(device)
+
+                with torch.no_grad():
+                    outputs = model.generate(
+                        **inputs,
+                        max_new_tokens=max_new_tokens,
+                        do_sample=True,
+                        temperature=0.7,
+                        pad_token_id=tokenizer.eos_token_id,
+                        use_cache=False,  # Disable cache to avoid cache_position errors
+                    )
+
+                # Decode responses for each item in batch
+                for j, (output, question) in enumerate(zip(outputs, batch_questions)):
+                    response = tokenizer.decode(output, skip_special_tokens=True)
+                    prediction = response[len(question) :].strip()
+                    predictions.append(prediction)
+
+            finally:
+                handle.remove()
+
+        return predictions, ground_truths
+
+    def _create_pair_set_from_extracted_pairs(
+        self, extracted_pairs: List[Dict], layer_index: int, model, tokenizer, device: str
+    ) -> ContrastivePairSet:
+        """Convert extracted pairs to ContrastivePairSet format with proper activation extraction."""
+        pair_set = ContrastivePairSet(name="dac_training", task_type="mathematical_reasoning")
+
+        logger.info(f"Creating {len(extracted_pairs)} contrastive pairs for layer {layer_index}")
+
+        for pair_data in tqdm(extracted_pairs, desc="Creating contrastive pairs"):
+            # Extract data from GSM8K format
+            try:
+                question = pair_data["question"]
+                correct_answer = pair_data["correct_answer"]
+                incorrect_answer = pair_data["incorrect_answer"]
+
+                # Extract activations for correct and incorrect responses
+                correct_activations = self._extract_activations_for_text(
+                    f"{question} {correct_answer}", layer_index, model, tokenizer, device
+                )
+                incorrect_activations = self._extract_activations_for_text(
+                    f"{question} {incorrect_answer}", layer_index, model, tokenizer, device
+                )
+
+                # Create Response objects
+                positive_response = Response(text=correct_answer, activations=correct_activations)
+                negative_response = Response(text=incorrect_answer, activations=incorrect_activations)
+
+                # Create ContrastivePair
+                contrastive_pair = ContrastivePair(
+                    prompt=question, positive_response=positive_response, negative_response=negative_response
+                )
+
+                pair_set.pairs.append(contrastive_pair)
+
+            except Exception as e:
+                logger.warning(f"Failed to create contrastive pair: {e}")
+                continue
+
+        logger.info(f"Successfully created ContrastivePairSet with {len(pair_set.pairs)} pairs")
+        return pair_set
+
+    def _extract_activations_for_text(self, text: str, layer_index: int, model, tokenizer, device: str) -> torch.Tensor:
+        """Extract activations from a specific layer for given text."""
+        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128).to(device)
+
+        activations = []
+
+        def hook(module, input, output):
+            # Extract the last token's activations
+            hidden_states = output[0]
+            last_token_activations = hidden_states[:, -1, :]
+            activations.append(last_token_activations.detach().cpu())
+
+        # Handle different model architectures
+        if hasattr(model, "model") and hasattr(model.model, "layers"):
+            # LLaMA-style models
+            layer_module = model.model.layers[layer_index]
+        elif hasattr(model, "transformer") and hasattr(model.transformer, "h"):
+            # GPT2-style models
+            layer_module = model.transformer.h[layer_index]
+        else:
+            raise ValueError("Unsupported model architecture for activation extraction")
+
+        handle = layer_module.register_forward_hook(hook)
+
+        with torch.no_grad():
+            model(**inputs)
+
+        handle.remove()
+        return activations[0].squeeze(0)
+
+
+class SteeringOptimizer:
+    """
+    Optimizes steering methods for improving benchmark performance.
+
+    The steering optimization process:
+    1. Train steering methods on training data
+    2. Evaluate steering performance on validation data using benchmark metrics
+    3. Select best configuration based on benchmark performance
+    4. Test final steering method on test data
+    """
+
+    def __init__(self, cache_config: Optional[CacheConfig] = None):
+        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
+        self.trainers = {"dac": DACTrainer()}
+
+        # Initialize classifier cache for reusing trained classifiers
+        if cache_config is None:
+            cache_config = CacheConfig(cache_dir="./steering_classifier_cache")
+        self.classifier_cache = ClassifierCache(cache_config)
+
+        # Session-level classifier caching for current optimization run
+        self._session_classifier = None  # Best classifier for current session
+        self._session_classifier_metadata = {}  # Layer, model_type, performance, etc.
+        self._session_cache_key = None  # Track current session
+
+    def register_trainer(self, method_name: str, trainer: SteeringMethodTrainer):
+        """Register a new steering method trainer."""
+        self.trainers[method_name] = trainer
+        self.logger.info(f"Registered trainer for steering method: {method_name}")
+
+    def optimize_steering_hyperparameters(
+        self,
+        config: SteeringMethodConfig,
+        classifier_optimization_config: ClassifierOptimizationConfig,
+        train_samples: List[Dict],
+        validation_samples: List[Dict],
+        model,
+        tokenizer,
+        device: str,
+        batch_size: int = 32,
+        max_length: int = 512,
+        task_name: str = "gsm8k",
+        max_new_tokens: int = 200,
+    ) -> Tuple[Dict[str, Any], List[SteeringResult]]:
+        """
+        Optimize hyperparameters for a steering method using grid search.
+
+        Args:
+            config: Steering method configuration with hyperparameter ranges
+            classifier_optimization_config: Configuration for classifier optimization
+            train_samples: Training samples for method training
+            validation_samples: Validation samples for evaluation
+            model: Language model
+            tokenizer: Model tokenizer
+            device: Device to run on
+            batch_size: Batch size for processing
+            max_length: Maximum sequence length
+            task_name: Task name for evaluation
+            max_new_tokens: Maximum tokens to generate
+
+        Returns:
+            Tuple of (best_config, all_results)
+        """
+        method_name = config.method_name
+
+        if method_name not in self.trainers:
+            raise ValueError(f"No trainer registered for method: {method_name}")
+
+        trainer = self.trainers[method_name]
+
+        # Load best classifier once at the start of optimization
+        self.logger.info("Loading/training classifier for evaluation...")
+        contrastive_pairs = data_utils.get_task_contrastive_pairs(train_samples, task_name)
+
+        classifier = self.load_or_find_best_classifier(
+            model=model, optimization_config=classifier_optimization_config, contrastive_pairs=contrastive_pairs
+        )
+
+        if classifier is None:
+            raise ValueError(
+                f"Could not load or train classifier for {classifier_optimization_config.model_name}/{task_name}"
+            )
+
+        self.logger.info(f"Using classifier: {self._session_classifier_metadata}")
+
+        # Collect baseline predictions once for all trials
+        self.logger.info("Collecting baseline predictions for comparison...")
+        baseline_predictions, ground_truths = self.collect_baseline_predictions(
+            validation_samples, model, tokenizer, classifier, device, batch_size, max_length, task_name, max_new_tokens
+        )
+
+        # Calculate baseline metrics with integrated classifier scoring
+        classifier_scorer = lambda predictions, description: self.score_predictions_with_classifier(
+            predictions, model, tokenizer, device, max_length, description
+        )
+        baseline_benchmark_metrics = metrics.evaluate_benchmark_performance(
+            baseline_predictions, ground_truths, task_name, classifier_scorer=classifier_scorer
+        )
+        self.logger.info(f"Baseline performance: {baseline_benchmark_metrics}")
+
+        # Generate all hyperparameter combinations
+        hyperparameter_combinations = self._generate_hyperparameter_combinations(config)
+
+        self.logger.info(f"Starting {method_name} optimization with {len(hyperparameter_combinations)} configurations")
+
+        best_config = None
+        best_score = -1
+        all_results = []
+
+        for i, (layer, strength, hyperparams) in enumerate(
+            tqdm(hyperparameter_combinations, desc="Optimizing steering hyperparameters")
+        ):
+            self.logger.debug(
+                f"Testing {method_name} config {i + 1}/{len(hyperparameter_combinations)}: "
+                f"layer={layer}, strength={strength}, hyperparams={hyperparams}"
+            )
+
+            try:
+                # Create method instance
+                method_instance = trainer.create_method_instance(hyperparams, device)
+
+                # Train the method
+                training_success, training_stats = trainer.train_method(
+                    method_instance, train_samples, layer, model, tokenizer, device, task_name, max_new_tokens
+                )
+
+                if not training_success:
+                    self.logger.warning(f"Training failed for config {i + 1}")
+                    result = SteeringResult(
+                        method_name=method_name,
+                        layer=layer,
+                        hyperparameters={**hyperparams, "strength": strength},
+                        benchmark_metrics={"accuracy": 0.0},
+                        training_success=False,
+                        training_stats=training_stats,
+                    )
+                    all_results.append(result)
+                    continue
+
+                # Evaluate on validation data with steering
+                steered_predictions, steered_ground_truths = trainer.apply_steering_and_evaluate(
+                    method_instance,
+                    validation_samples,
+                    layer,
+                    strength,
+                    model,
+                    tokenizer,
+                    device,
+                    batch_size,
+                    max_length,
+                    task_name,
+                    max_new_tokens,
+                )
+
+                # Compare baseline vs steered predictions using enhanced metrics
+                enhanced_metrics = self.compare_predictions(
+                    baseline_predictions,
+                    steered_predictions,
+                    ground_truths,
+                    model,
+                    tokenizer,
+                    device,
+                    max_length,
+                    task_name,
+                )
+
+                # Extract steered metrics for compatibility
+                benchmark_metrics = enhanced_metrics["steered"]
+                baseline_metrics_for_result = enhanced_metrics["baseline"]
+                comparative_metrics = enhanced_metrics["improvement"]
+
+                result = SteeringResult(
+                    method_name=method_name,
+                    layer=layer,
+                    hyperparameters={**hyperparams, "strength": strength},
+                    benchmark_metrics=benchmark_metrics,
+                    baseline_metrics=baseline_metrics_for_result,
+                    comparative_metrics=comparative_metrics,
+                    training_success=True,
+                    training_stats=training_stats,
+                )
+                all_results.append(result)
+
+                # Standard Optuna practice: optimize steered accuracy directly
+                steered_accuracy = benchmark_metrics.get("accuracy", 0.0)
+                baseline_accuracy = baseline_metrics_for_result.get("accuracy", 0.0)
+                improvement_delta = steered_accuracy - baseline_accuracy
+
+                if steered_accuracy > best_score:
+                    best_score = steered_accuracy
+                    best_config = {
+                        "method": method_name,
+                        "layer": layer,
+                        "strength": strength,
+                        **hyperparams,
+                        "benchmark_metrics": benchmark_metrics,
+                        "baseline_metrics": baseline_metrics_for_result,
+                        "method_instance": method_instance,
+                    }
+
+                self.logger.debug(
+                    f"Config {i + 1} - Baseline: {baseline_accuracy:.3f}, "
+                    f"Steered: {steered_accuracy:.3f}, Delta: {improvement_delta:+.3f}"
+                )
+
+            except Exception as e:
+                self.logger.error(f"Failed to evaluate config {i + 1}: {e}")
+                result = SteeringResult(
+                    method_name=method_name,
+                    layer=layer,
+                    hyperparameters={**hyperparams, "strength": strength},
+                    benchmark_metrics={"accuracy": 0.0},
+                    baseline_metrics=baseline_benchmark_metrics,
+                    comparative_metrics={"accuracy_delta": 0.0, "improvement_rate": 0.0},
+                    training_success=False,
+                    training_stats={"error": str(e)},
+                )
+                all_results.append(result)
+                continue
+
+        if best_config is None:
+            self.logger.warning("No successful steering configuration found")
+            # Return a default configuration
+            best_config = {
+                "method": method_name,
+                "layer": config.layers[0] if config.layers else 0,
+                "strength": config.strengths[0] if config.strengths else 1.0,
+                "benchmark_metrics": {"accuracy": 0.0},
+                "method_instance": None,
+            }
+        else:
+            steered_acc = best_config["benchmark_metrics"]["accuracy"]
+            baseline_acc = best_config.get("baseline_metrics", {}).get("accuracy", 0.0)
+            improvement = steered_acc - baseline_acc
+
+            self.logger.info(
+                f"Best {method_name} config (optimized for steered accuracy): "
+                f"layer={best_config['layer']}, steered={steered_acc:.3f} "
+                f"(baseline={baseline_acc:.3f}, Δ={improvement:+.3f})"
+            )
+
+        return best_config, all_results
+
+    def _generate_hyperparameter_combinations(
+        self, config: SteeringMethodConfig
+    ) -> List[Tuple[int, float, Dict[str, Any]]]:
+        """Generate all combinations of hyperparameters for grid search."""
+        combinations = []
+
+        if isinstance(config, DACConfig):
+            # Generate DAC hyperparameter combinations
+            for layer in config.layers:
+                for strength in config.strengths:
+                    for entropy_threshold in config.entropy_thresholds:
+                        for ptop in config.ptop_values:
+                            for max_alpha in config.max_alpha_values:
+                                hyperparams = {
+                                    "entropy_threshold": entropy_threshold,
+                                    "ptop": ptop,
+                                    "max_alpha": max_alpha,
+                                }
+                                combinations.append((layer, strength, hyperparams))
+        else:
+            # Generic handling for other steering methods
+            for layer in config.layers:
+                for strength in config.strengths:
+                    combinations.append((layer, strength, {}))
+
+        return combinations
+
+    def collect_baseline_predictions(
+        self,
+        evaluation_samples: List[Dict],
+        model,
+        tokenizer,
+        classifier: Classifier,
+        device: str,
+        batch_size: int,
+        max_length: int,
+        task_name: str,
+        max_new_tokens: int = 200,
+    ) -> Tuple[List[str], List[str]]:
+        """
+        Collect unsteered model predictions for baseline comparison.
+        Uses the same evaluation logic as steered evaluation but without steering hooks.
+
+        Args:
+            evaluation_samples: Samples to evaluate
+            model: Language model
+            tokenizer: Model tokenizer
+            classifier: Trained classifier for evaluation
+            device: Device to run on
+            batch_size: Batch size for processing
+            max_length: Maximum sequence length
+            task_name: Task name for evaluation
+            max_new_tokens: Maximum tokens to generate
+
+        Returns:
+            Tuple of (predictions, ground_truths)
+        """
+        predictions = []
+        ground_truths = []
+
+        # Get the task and its extractor
+        task = get_task(task_name)
+        extractor = task.get_extractor()
+
+        # Pre-extract all questions and answers (optimization)
+        questions = []
+        answers = []
+
+        for sample in evaluation_samples:
+            qa_pair = extractor.extract_qa_pair(sample, task)
+            if not qa_pair:
+                self.logger.warning(f"Skipping sample - extractor couldn't extract QA pair: {sample.keys()}")
+                continue
+            questions.append(qa_pair["formatted_question"])
+            answers.append(qa_pair["correct_answer"])
+
+        # Process questions WITHOUT steering in batches
+        ground_truths.extend(answers)
+
+        # Process in batches without steering
+        for i in tqdm(range(0, len(questions), batch_size), desc="Generating baseline predictions"):
+            batch_questions = questions[i : i + batch_size]
+
+            # Tokenize batch with padding for generation
+            inputs = tokenizer(
+                batch_questions, return_tensors="pt", padding=True, truncation=True, max_length=max_length
+            ).to(device)
+
+            with torch.no_grad():
+                outputs = model.generate(
+                    **inputs,
+                    max_new_tokens=max_new_tokens,
+                    do_sample=True,
+                    temperature=0.7,
+                    pad_token_id=tokenizer.eos_token_id,
+                    use_cache=False,  # Disable cache to avoid cache_position errors
+                )
+
+            # Decode responses for each item in batch
+            for j, (output, question) in enumerate(zip(outputs, batch_questions)):
+                response = tokenizer.decode(output, skip_special_tokens=True)
+                prediction = response[len(question) :].strip()
+                predictions.append(prediction)
+
+        return predictions, ground_truths
+
+    def _extract_activation_for_text(
+        self,
+        text: str,
+        layer_index: int,
+        aggregation_strategy: str,
+        model,
+        tokenizer,
+        device: str,
+        max_length: int = 512,
+    ) -> torch.Tensor:
+        """
+        Extract activation from text at specified layer with aggregation.
+
+        Args:
+            text: Input text to extract activation from
+            layer_index: Layer index to extract from
+            aggregation_strategy: Aggregation strategy string (e.g., "mean_pooling")
+            model: Language model
+            tokenizer: Model tokenizer
+            device: Device to run on
+            max_length: Maximum sequence length
+
+        Returns:
+            Aggregated activation tensor
+        """
+        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length).to(device)
+        activations = []
+
+        def hook(module, input, output):
+            # Extract hidden states from the layer
+            hidden_states = output[0] if isinstance(output, tuple) else output
+            activations.append(hidden_states.detach().cpu())
+
+        # Handle different model architectures
+        if hasattr(model, "model") and hasattr(model.model, "layers"):
+            # LLaMA-style models
+            layer_module = model.model.layers[layer_index]
+        elif hasattr(model, "transformer") and hasattr(model.transformer, "h"):
+            # GPT2-style models
+            layer_module = model.transformer.h[layer_index]
+        else:
+            raise ValueError("Unsupported model architecture for activation extraction")
+
+        # Register hook and run forward pass
+        handle = layer_module.register_forward_hook(hook)
+        try:
+            with torch.no_grad():
+                _ = model(**inputs)
+        finally:
+            handle.remove()
+
+        if not activations:
+            raise ValueError("No activations extracted")
+
+        # Get the activation tensor [1, seq_len, hidden_dim]
+        activation_tensor = activations[0]
+
+        # Apply aggregation strategy
+        if (
+            aggregation_strategy == "mean_pooling"
+            or aggregation_strategy == ActivationAggregationStrategy.MEAN_POOLING.value
+        ):
+            aggregated = torch.mean(activation_tensor, dim=1)  # [1, hidden_dim]
+        elif (
+            aggregation_strategy == "last_token"
+            or aggregation_strategy == ActivationAggregationStrategy.LAST_TOKEN.value
+        ):
+            aggregated = activation_tensor[:, -1, :]  # [1, hidden_dim]
+        elif (
+            aggregation_strategy == "first_token"
+            or aggregation_strategy == ActivationAggregationStrategy.FIRST_TOKEN.value
+        ):
+            aggregated = activation_tensor[:, 0, :]  # [1, hidden_dim]
+        elif (
+            aggregation_strategy == "max_pooling"
+            or aggregation_strategy == ActivationAggregationStrategy.MAX_POOLING.value
+        ):
+            aggregated = torch.max(activation_tensor, dim=1)[0]  # [1, hidden_dim]
+        else:
+            # Default to mean pooling if unknown
+            self.logger.warning(f"Unknown aggregation strategy {aggregation_strategy}, using mean pooling")
+            aggregated = torch.mean(activation_tensor, dim=1)
+
+        return aggregated.squeeze(0)  # Return [hidden_dim] tensor
+
+    def score_predictions_with_classifier(
+        self,
+        predictions: List[str],
+        model,
+        tokenizer,
+        device: str,
+        max_length: int = 512,
+        description: str = "predictions",
+    ) -> List[float]:
+        """
+        Score predictions using the cached classifier.
+
+        This is the core feature that was requested - using the optimized classifier
+        to score unsteered vs steered generations.
+
+        Args:
+            predictions: Text predictions to score
+            model: Language model for activation extraction
+            tokenizer: Model tokenizer
+            device: Device to run on
+            max_length: Maximum sequence length
+            description: Description for logging
+
+        Returns:
+            List of classifier scores/probabilities for each prediction
+        """
+        if self._session_classifier is None:
+            self.logger.warning("No cached classifier available for scoring")
+            return [0.5] * len(predictions)  # Return neutral scores
+
+        if not predictions:
+            self.logger.debug("No predictions to score")
+            return []
+
+        # Get classifier metadata
+        layer = self._session_classifier_metadata.get("layer", 12)
+        aggregation = self._session_classifier_metadata.get("aggregation", "mean_pooling")
+
+        self.logger.info(
+            f"Scoring {len(predictions)} {description} with cached classifier (layer={layer}, aggregation={aggregation})"
+        )
+
+        confidence_scores = []
+
+        # Process predictions in batches for efficiency
+        batch_size = 8  # Smaller batch size to avoid OOM
+        for i in range(0, len(predictions), batch_size):
+            batch_predictions = predictions[i : i + batch_size]
+            batch_activations = []
+
+            # Extract activations for each prediction in the batch
+            for pred_text in batch_predictions:
+                try:
+                    # Extract activation for this prediction text
+                    activation = self._extract_activation_for_text(
+                        text=pred_text,
+                        layer_index=layer,
+                        aggregation_strategy=aggregation,
+                        model=model,
+                        tokenizer=tokenizer,
+                        device=device,
+                        max_length=max_length,
+                    )
+                    batch_activations.append(activation)
+
+                except Exception as e:
+                    self.logger.debug(f"Failed to extract activation for prediction: {e}")
+                    # Use neutral score for failed extractions
+                    confidence_scores.append(0.5)
+                    continue
+
+            if batch_activations:
+                try:
+                    # Stack activations into batch tensor
+                    batch_tensor = torch.stack(batch_activations)
+
+                    # Convert to numpy for sklearn classifier
+                    batch_numpy = batch_tensor.detach().cpu().numpy()
+
+                    # Get prediction probabilities from classifier
+                    probabilities = self._session_classifier.predict_proba(batch_numpy)
+
+                    # Extract confidence scores (probability for positive class)
+                    # Assuming binary classification with class 1 as positive
+                    if probabilities.shape[1] > 1:
+                        batch_scores = probabilities[:, 1].tolist()  # Probability of positive class
+                    else:
+                        batch_scores = probabilities[:, 0].tolist()  # Single class probability
+
+                    confidence_scores.extend(batch_scores)
+
+                except Exception as e:
+                    self.logger.warning(f"Failed to score batch of activations: {e}")
+                    # Add neutral scores for failed batch
+                    confidence_scores.extend([0.5] * len(batch_activations))
+
+        # Ensure we have scores for all predictions
+        while len(confidence_scores) < len(predictions):
+            confidence_scores.append(0.5)  # Pad with neutral scores if needed
+
+        # Truncate if we have too many scores (shouldn't happen)
+        confidence_scores = confidence_scores[: len(predictions)]
+
+        # Log statistics
+        avg_score = sum(confidence_scores) / len(confidence_scores) if confidence_scores else 0.5
+        self.logger.debug(
+            f"Generated {len(confidence_scores)} classifier confidence scores for {description} (avg={avg_score:.3f})"
+        )
+
+        return confidence_scores
+
+    def compare_predictions(
+        self,
+        baseline_predictions: List[str],
+        steered_predictions: List[str],
+        ground_truths: List[str],
+        model,
+        tokenizer,
+        device: str,
+        max_length: int = 512,
+        task_name: str = "gsm8k",
+    ) -> Dict[str, Any]:
+        """
+        Compare baseline vs steered predictions using benchmark metrics and classifier scores.
+
+        Args:
+            baseline_predictions: Unsteered model predictions
+            steered_predictions: Steered model predictions
+            ground_truths: Ground truth answers
+            model: Language model for classifier scoring
+            tokenizer: Model tokenizer
+            device: Device to run on
+            max_length: Maximum sequence length
+            task_name: Task name for evaluation metrics
+
+        Returns:
+            Enhanced metrics with baseline vs steered comparison including classifier scores
+        """
+        # Create classifier scorer function for metrics integration
+        classifier_scorer = lambda predictions, description: self.score_predictions_with_classifier(
+            predictions, model, tokenizer, device, max_length, description
+        )
+
+        # Calculate standard benchmark metrics with integrated classifier confidence scores
+        baseline_metrics = metrics.evaluate_benchmark_performance(
+            baseline_predictions, ground_truths, task_name, classifier_scorer=classifier_scorer
+        )
+        steered_metrics = metrics.evaluate_benchmark_performance(
+            steered_predictions, ground_truths, task_name, classifier_scorer=classifier_scorer
+        )
+
+        # Extract classifier scores from integrated metrics
+        baseline_scores = [
+            detail.get("classifier_confidence", 0.5) for detail in baseline_metrics.get("evaluation_details", [])
+        ]
+        steered_scores = [
+            detail.get("classifier_confidence", 0.5) for detail in steered_metrics.get("evaluation_details", [])
+        ]
+
+        # Calculate improvement metrics
+        accuracy_delta = steered_metrics.get("accuracy", 0) - baseline_metrics.get("accuracy", 0)
+        f1_delta = steered_metrics.get("f1", 0) - baseline_metrics.get("f1", 0)
+
+        # Calculate classifier score improvements
+        avg_baseline_score = sum(baseline_scores) / len(baseline_scores) if baseline_scores else 0.0
+        avg_steered_score = sum(steered_scores) / len(steered_scores) if steered_scores else 0.0
+        classifier_score_delta = avg_steered_score - avg_baseline_score
+
+        return {
+            "baseline": {
+                "accuracy": baseline_metrics.get("accuracy", 0.0),
+                "f1": baseline_metrics.get("f1", 0.0),
+                "classifier_scores": baseline_scores,
+                "avg_classifier_score": avg_baseline_score,
+                "predictions": baseline_predictions,
+            },
+            "steered": {
+                "accuracy": steered_metrics.get("accuracy", 0.0),
+                "f1": steered_metrics.get("f1", 0.0),
+                "classifier_scores": steered_scores,
+                "avg_classifier_score": avg_steered_score,
+                "predictions": steered_predictions,
+            },
+            "improvement": {
+                "accuracy_delta": accuracy_delta,
+                "f1_delta": f1_delta,
+                "classifier_score_delta": classifier_score_delta,
+            },
+        }
+
+    def load_or_find_best_classifier(
+        self,
+        model,
+        optimization_config: Optional[ClassifierOptimizationConfig] = None,
+        model_name: Optional[str] = None,
+        task_name: Optional[str] = None,
+        contrastive_pairs: Optional[List] = None,
+        force_reoptimize: bool = False,
+    ) -> Optional[Classifier]:
+        """
+        Load or train the best classifier for current steering session.
+
+        On first call: Run full classifier optimization and cache result for session
+        On subsequent calls: Return cached classifier from current session
+
+        Args:
+            model: Language model (wisent_guard Model wrapper)
+            optimization_config: Primary configuration source
+            model_name: Fallback model name if optimization_config not provided
+            task_name: Fallback task name if optimization_config not provided
+            contrastive_pairs: Training data for classifier optimization
+            force_reoptimize: Force reoptimization even if session classifier exists
+
+        Returns:
+            Best trained classifier or None if optimization failed
+        """
+        # Extract configuration
+        if optimization_config is not None:
+            model_name = optimization_config.model_name
+            task_name = getattr(optimization_config, "task_name", task_name)
+            limit = getattr(optimization_config, "data_limit", 100)
+        else:
+            limit = 100  # Default data limit
+
+        if not model_name or not task_name:
+            raise ValueError("model_name and task_name must be provided either via optimization_config or directly")
+
+        # Create session cache key
+        session_cache_key = f"{model_name}_{task_name}"
+
+        # Check if we already have a classifier for this session
+        if (
+            not force_reoptimize
+            and self._session_classifier is not None
+            and self._session_cache_key == session_cache_key
+        ):
+            self.logger.info("Using cached classifier from current session")
+            return self._session_classifier
+
+        # First call or forced reoptimization - run classifier optimization
+        self.logger.info("Running classifier optimization (first trial in session)")
+
+        if not contrastive_pairs:
+            self.logger.error("contrastive_pairs required for classifier optimization")
+            return None
+
+        try:
+            # Create configuration for classifier optimization if not provided
+            if optimization_config is None:
+                optimization_config = ClassifierOptimizationConfig(
+                    model_name=model_name,
+                    device="auto",
+                    n_trials=20,  # Reasonable number for steering optimization
+                    model_types=["logistic", "mlp"],
+                    primary_metric="f1",
+                )
+
+            # Create generation config for activation pre-generation
+            generation_config = GenerationConfig(
+                layer_search_range=(0, 23),  # Will be auto-detected from model
+                aggregation_methods=[
+                    ActivationAggregationStrategy.MEAN_POOLING,
+                    ActivationAggregationStrategy.LAST_TOKEN,
+                    ActivationAggregationStrategy.FIRST_TOKEN,
+                    ActivationAggregationStrategy.MAX_POOLING,
+                ],
+                cache_dir="./cache/steering_activations",
+                device=optimization_config.device,
+                batch_size=32,
+            )
+
+            # Create classifier optimizer
+            classifier_optimizer = OptunaClassifierOptimizer(
+                optimization_config=optimization_config,
+                generation_config=generation_config,
+                cache_config=self.classifier_cache.config,
+            )
+
+            # Run classifier optimization
+            self.logger.info(f"Optimizing classifier for {model_name}/{task_name} with {len(contrastive_pairs)} pairs")
+            result = classifier_optimizer.optimize(
+                model=model,
+                contrastive_pairs=contrastive_pairs,
+                task_name=task_name,
+                model_name=model_name,
+                limit=limit,
+            )
+
+            if result.best_value > 0:
+                # Get the best configuration and classifier
+                best_config = result.get_best_config()
+                best_classifier = result.best_classifier
+
+                # Cache for current session
+                self._session_classifier = best_classifier
+                self._session_classifier_metadata = {
+                    "layer": best_config["layer"],
+                    "aggregation": best_config["aggregation"],
+                    "model_type": best_config["model_type"],
+                    "threshold": best_config["threshold"],
+                    "f1_score": result.best_value,
+                    "hyperparameters": best_config.get("hyperparameters", {}),
+                }
+                self._session_cache_key = session_cache_key
+
+                self.logger.info(
+                    f"Cached best classifier for session: layer_{best_config['layer']} "
+                    f"{best_config['model_type']} (F1: {result.best_value:.3f})"
+                )
+
+                return best_classifier
+            self.logger.warning("Classifier optimization failed - no successful trials")
+            return None
+
+        except Exception as e:
+            self.logger.error(f"Failed to run classifier optimization: {e}")
+            traceback.print_exc()
+            return None
+
+    def get_cache_info(self) -> Dict[str, Any]:
+        """Get information about cached classifiers."""
+        return self.classifier_cache.get_cache_info()
+
+    def clear_classifier_cache(self, keep_recent_hours: float = 24.0) -> int:
+        """Clear old cached classifiers."""
+        return self.classifier_cache.clear_cache(keep_recent_hours=keep_recent_hours)
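For orientation, here is a minimal usage sketch of the DACConfig and SteeringOptimizer classes added in this file, based only on the signatures visible in the diff above. The model, tokenizer, and sample lists (names prefixed with my_) are placeholders assumed to be supplied by the caller, and the import path mirrors the wisent_guard module paths used inside the file, which may differ from the layout of the wisent wheel.

# Sketch only: "my_*" names are placeholders, not part of the package.
from wisent_guard.core.optuna.classifier import ClassifierOptimizationConfig
from wisent_guard.core.optuna.steering.steering_optimization import DACConfig, SteeringOptimizer

# DAC grid to search: candidate layers, steering strengths, and DAC hyperparameters.
steering_config = DACConfig(
    layers=[10, 12, 14],
    strengths=[0.5, 1.0],
    entropy_thresholds=[1.0],
    ptop_values=[0.4],
    max_alpha_values=[2.0],
)

# Classifier optimization settings, mirroring the defaults constructed inside
# load_or_find_best_classifier when no config is passed.
classifier_config = ClassifierOptimizationConfig(
    model_name="my-model-name",
    device="auto",
    n_trials=20,
    model_types=["logistic", "mlp"],
    primary_metric="f1",
)

optimizer = SteeringOptimizer()  # uses ./steering_classifier_cache by default
best_config, all_results = optimizer.optimize_steering_hyperparameters(
    config=steering_config,
    classifier_optimization_config=classifier_config,
    train_samples=my_train_samples,
    validation_samples=my_validation_samples,
    model=my_model,
    tokenizer=my_tokenizer,
    device="cuda",
    task_name="gsm8k",
)
print(best_config["layer"], best_config["benchmark_metrics"])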