PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl - Mend

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1020) hide show

wisent/core/steering_methods/methods/titan.py CHANGED Viewed

@@ -52,18 +52,42 @@ class TITANConfig:
     """Number of directions per layer in the steering manifold."""
     # Layer configuration
-    steering_layers: List[int] = field(default_factory=lambda: [10, 11, 12, 13, 14, 15, 16, 17, 18])
-    """Layer indices where steering can be applied."""
+    steering_layers: Optional[List[int]] = None
+    """Layer indices where steering can be applied. If None, auto-computed from num_layers."""
-    sensor_layer: int = 15
-    """Primary layer for gating decisions."""
+    sensor_layer: Optional[int] = None
+    """Primary layer for gating decisions. If None, auto-computed from num_layers."""
+    num_layers: Optional[int] = None
+    """Total layers in the model. Used to auto-compute steering_layers and sensor_layer."""
+    def resolve_layers(self, num_layers: int) -> None:
+        """Resolve steering_layers and sensor_layer based on model's num_layers.
+
+        Records ``num_layers`` on the config and fills in whichever of
+        ``sensor_layer`` / ``steering_layers`` is still ``None``. Values that
+        were set explicitly are left untouched.
+
+        Args:
+            num_layers: Total number of layers in the model.
+        """
+        self.num_layers = num_layers
+        if self.sensor_layer is None:
+            # 75% through the network
+            self.sensor_layer = int(num_layers * 0.75)
+        if self.steering_layers is None:
+            # Middle to late layers (50% to 90% of network)
+            start = int(num_layers * 0.5)
+            # For very shallow models int(n * 0.9) can equal start, which would
+            # produce an empty range; always keep at least one steering layer,
+            # but never index past the last layer of the model.
+            end = min(num_layers, max(start + 1, int(num_layers * 0.9)))
+            self.steering_layers = list(range(start, end))
     # Network architecture
-    gate_hidden_dim: int = 128
-    """Hidden dimension for gating network."""
+    gate_hidden_dim: Optional[int] = None
+    """Hidden dimension for gating network. If None, auto-computed as hidden_dim // 16."""
-    intensity_hidden_dim: int = 64
-    """Hidden dimension for intensity network."""
+    intensity_hidden_dim: Optional[int] = None
+    """Hidden dimension for intensity network. If None, auto-computed as hidden_dim // 32."""
+    def resolve_network_dims(self, hidden_dim: int) -> None:
+        """Fill in unset gating/intensity widths from the model's hidden size.
+
+        Only fields that are still ``None`` are computed; explicitly configured
+        values are kept as they are.
+
+        Args:
+            hidden_dim: Hidden dimension of the model's activations.
+        """
+        if self.gate_hidden_dim is None:
+            # One sixteenth of the model width, kept inside [32, 512].
+            gate_width = hidden_dim // 16
+            self.gate_hidden_dim = min(512, max(32, gate_width))
+        if self.intensity_hidden_dim is None:
+            # One thirty-second of the model width, kept inside [16, 256].
+            intensity_width = hidden_dim // 32
+            self.intensity_hidden_dim = min(256, max(16, intensity_width))
     # Training
     optimization_steps: int = 200
@@ -392,12 +416,15 @@ class TITANMethod(BaseSteeringMethod):
     def __init__(self, **kwargs: Any) -> None:
         super().__init__(**kwargs)
+        # steering_layers and sensor_layer default to None - resolved at training time
+        # based on actual num_layers in the model
         self.config = TITANConfig(
             num_directions=kwargs.get("num_directions", 5),
-            steering_layers=kwargs.get("steering_layers", [10, 11, 12, 13, 14, 15, 16, 17, 18]),
-            sensor_layer=kwargs.get("sensor_layer", 15),
-            gate_hidden_dim=kwargs.get("gate_hidden_dim", 128),
-            intensity_hidden_dim=kwargs.get("intensity_hidden_dim", 64),
+            steering_layers=kwargs.get("steering_layers", None),  # Auto-resolve from num_layers
+            sensor_layer=kwargs.get("sensor_layer", None),  # Auto-resolve from num_layers
+            num_layers=kwargs.get("num_layers", None),
+            gate_hidden_dim=kwargs.get("gate_hidden_dim", None),  # Auto-resolve from hidden_dim
+            intensity_hidden_dim=kwargs.get("intensity_hidden_dim", None),  # Auto-resolve from hidden_dim
             optimization_steps=kwargs.get("optimization_steps", 200),
             learning_rate=kwargs.get("learning_rate", 0.005),
             warmup_steps=kwargs.get("warmup_steps", 20),
@@ -429,8 +456,7 @@ class TITANMethod(BaseSteeringMethod):
             primary_map[layer] = result.get_effective_direction(layer)
         dtype = self.kwargs.get("dtype", None)
-        agg = self.kwargs.get("activation_aggregation_strategy", None)
-        return LayerActivations(primary_map, activation_aggregation_strategy=agg, dtype=dtype)
+        return LayerActivations(primary_map, dtype=dtype)
     def train_titan(self, pair_set: ContrastivePairSet) -> TITANResult:
         """
@@ -448,6 +474,21 @@ class TITANMethod(BaseSteeringMethod):
         if not buckets:
             raise InsufficientDataError(reason="No valid activation pairs found")
+        # Detect num_layers from available data if not set
+        # Find max layer index to determine model size
+        max_layer_idx = 0
+        for layer_name in buckets.keys():
+            try:
+                layer_idx = int(str(layer_name).split("_")[-1])
+                max_layer_idx = max(max_layer_idx, layer_idx)
+            except (ValueError, IndexError):
+                pass
+        # Resolve steering_layers and sensor_layer based on detected num_layers
+        detected_num_layers = max_layer_idx + 1  # layers are 0-indexed
+        if self.config.steering_layers is None or self.config.sensor_layer is None:
+            self.config.resolve_layers(detected_num_layers)
         # Filter to steering layers and determine hidden dim
         layer_names = []
         hidden_dim = None
@@ -472,6 +513,10 @@ class TITANMethod(BaseSteeringMethod):
         if not layer_names or hidden_dim is None:
             raise InsufficientDataError(reason="No valid steering layers found")
+        # Resolve network dimensions based on actual hidden_dim
+        if self.config.gate_hidden_dim is None or self.config.intensity_hidden_dim is None:
+            self.config.resolve_network_dims(hidden_dim)
         num_layers = len(layer_names)
         # Geometry analysis and adaptation

wisent/core/steering_methods/registry.py CHANGED Viewed

@@ -75,6 +75,7 @@ from wisent.core.steering_methods.core.atoms import BaseSteeringMethod
 class SteeringMethodType(Enum):
     """Enumeration of all supported steering methods."""
     CAA = "caa"
+    HYPERPLANE = "hyperplane"
     PRISM = "prism"
     PULSE = "pulse"
     TITAN = "titan"
@@ -190,6 +191,44 @@ CAA_DEFINITION = SteeringMethodDefinition(
 )
+# Registry definition for the "hyperplane" steering method: its identity,
+# the dotted path of the implementing class, the hyperparameters exposed as
+# CLI flags, and the default steering strength / search range.
+HYPERPLANE_DEFINITION = SteeringMethodDefinition(
+    name="hyperplane",
+    method_type=SteeringMethodType.HYPERPLANE,
+    description="Classifier-based steering using logistic regression decision boundary. Works better than CAA when geometry is orthogonal (each pair has unique direction rather than shared direction).",
+    method_class_path="wisent.core.steering_methods.methods.hyperplane.HyperplaneMethod",
+    parameters=[
+        # NOTE(review): default=True together with action="store_true" means
+        # passing --hyperplane-normalize cannot change the value if this maps
+        # onto argparse semantics — confirm how normalization is meant to be
+        # switched off from the CLI.
+        SteeringMethodParameter(
+            name="normalize",
+            type=bool,
+            default=True,
+            help="L2-normalize the steering vector",
+            action="store_true",
+            cli_flag="--hyperplane-normalize",
+        ),
+        SteeringMethodParameter(
+            name="max_iter",
+            type=int,
+            default=1000,
+            help="Maximum iterations for logistic regression",
+            cli_flag="--hyperplane-max-iter",
+        ),
+        SteeringMethodParameter(
+            name="C",
+            type=float,
+            default=1.0,
+            help="Regularization strength (inverse). Smaller values = stronger regularization.",
+            cli_flag="--hyperplane-C",
+        ),
+    ],
+    # These values mirror default_strength / strength_range below; keep the
+    # two places in sync when changing either.
+    optimization_config={
+        "strength_search_range": (0.1, 5.0),
+        "default_strength": 1.0,
+    },
+    default_strength=1.0,
+    strength_range=(0.1, 5.0),
+)
 PRISM_DEFINITION = SteeringMethodDefinition(
     name="prism",
     method_type=SteeringMethodType.PRISM,
@@ -289,15 +328,15 @@ PULSE_DEFINITION = SteeringMethodDefinition(
         SteeringMethodParameter(
             name="sensor_layer",
             type=int,
-            default=15,
-            help="Layer index where condition gating is computed",
+            default=None,
+            help="Layer index where condition gating is computed (auto-computed if not set)",
             cli_flag="--pulse-sensor-layer",
         ),
         SteeringMethodParameter(
             name="steering_layers",
             type=str,
-            default="12,13,14,15,16,17,18",
-            help="Comma-separated layer indices where steering is applied",
+            default=None,
+            help="Comma-separated layer indices where steering is applied (auto-computed if not set)",
             cli_flag="--pulse-steering-layers",
         ),
         SteeringMethodParameter(
@@ -408,29 +447,29 @@ TITAN_DEFINITION = SteeringMethodDefinition(
         SteeringMethodParameter(
             name="steering_layers",
             type=str,
-            default="10,11,12,13,14,15,16,17,18",
-            help="Comma-separated layer indices for steering",
+            default=None,
+            help="Comma-separated layer indices for steering (auto-computed if not set)",
             cli_flag="--titan-steering-layers",
         ),
         SteeringMethodParameter(
             name="sensor_layer",
             type=int,
-            default=15,
-            help="Primary layer for gating decisions",
+            default=None,
+            help="Primary layer for gating decisions (auto-computed if not set)",
             cli_flag="--titan-sensor-layer",
         ),
         SteeringMethodParameter(
             name="gate_hidden_dim",
             type=int,
-            default=128,
-            help="Hidden dimension for gating network",
+            default=None,
+            help="Hidden dimension for gating network (auto-computed as hidden_dim//16 if not set)",
             cli_flag="--titan-gate-hidden-dim",
         ),
         SteeringMethodParameter(
             name="intensity_hidden_dim",
             type=int,
-            default=64,
-            help="Hidden dimension for intensity network",
+            default=None,
+            help="Hidden dimension for intensity network (auto-computed as hidden_dim//32 if not set)",
             cli_flag="--titan-intensity-hidden-dim",
         ),
         SteeringMethodParameter(
@@ -518,6 +557,7 @@ class SteeringMethodRegistry:
     _REGISTRY: Dict[str, SteeringMethodDefinition] = {
         "caa": CAA_DEFINITION,
+        "hyperplane": HYPERPLANE_DEFINITION,
         "prism": PRISM_DEFINITION,
         "pulse": PULSE_DEFINITION,
         "titan": TITAN_DEFINITION,

wisent/core/steering_optimizer.py CHANGED Viewed

@@ -26,8 +26,8 @@ from enum import Enum, auto
 from pathlib import Path
 from .config_manager import ModelConfigManager
-from .activations.core.atoms import ActivationAggregationStrategy
-from .activations.prompt_construction_strategy import PromptConstructionStrategy
+from .activations.extraction_strategy import ExtractionStrategy
 from wisent.core.errors import (
     MissingParameterError,
     SteeringMethodUnknownError,
@@ -60,22 +60,22 @@ class SteeringApplicationConfig:
     gaussian_width: float = 0.2
-def get_default_token_aggregation_strategies() -> List[ActivationAggregationStrategy]:
+def get_default_token_aggregation_strategies() -> List[ExtractionStrategy]:
     """Get token aggregation strategies to test."""
     return [
-        ActivationAggregationStrategy.LAST_TOKEN,
-        ActivationAggregationStrategy.MEAN_POOLING,
-        ActivationAggregationStrategy.FIRST_TOKEN,
-        ActivationAggregationStrategy.MAX_POOLING,
+        ExtractionStrategy.CHAT_LAST,
+        ExtractionStrategy.CHAT_MEAN,
+        ExtractionStrategy.CHAT_FIRST,
+        ExtractionStrategy.CHAT_MAX_NORM,
     ]
-def get_default_prompt_construction_strategies() -> List[PromptConstructionStrategy]:
+def get_default_prompt_construction_strategies() -> List[ExtractionStrategy]:
     """Get prompt construction strategies to test."""
     return [
-        PromptConstructionStrategy.CHAT_TEMPLATE,
-        PromptConstructionStrategy.DIRECT_COMPLETION,
-        PromptConstructionStrategy.INSTRUCTION_FOLLOWING,
+        ExtractionStrategy.CHAT_LAST,
+        ExtractionStrategy.CHAT_LAST,
+        ExtractionStrategy.CHAT_LAST,
     ]
@@ -399,8 +399,8 @@ class SteeringOptimizer:
         methods_to_test: Optional[List[SteeringMethod]] = None,
         layer_range: Optional[str] = None,
         strength_range: Optional[List[float]] = None,
-        token_aggregation_strategies: Optional[List[ActivationAggregationStrategy]] = None,
-        prompt_construction_strategies: Optional[List[PromptConstructionStrategy]] = None,
+        token_aggregation_strategies: Optional[List[ExtractionStrategy]] = None,
+        prompt_construction_strategies: Optional[List[ExtractionStrategy]] = None,
         steering_application_configs: Optional[List[SteeringApplicationConfig]] = None,
         limit: int = 100,
         max_time_minutes: float = 60.0,
@@ -603,8 +603,8 @@ class SteeringOptimizer:
         method: SteeringMethod,
         layer: int,
         strength: float,
-        token_aggregation: ActivationAggregationStrategy,
-        prompt_construction: PromptConstructionStrategy,
+        token_aggregation: ExtractionStrategy,
+        prompt_construction: ExtractionStrategy,
         steering_application: SteeringApplicationConfig,
         limit: int,
         split_ratio: float

wisent/core/synthetic/generators/nonsense_generator.py CHANGED Viewed

@@ -16,16 +16,6 @@ __all__ = [
 class ProgrammaticNonsenseGenerator:
     """Generate nonsense contrastive pairs programmatically without using LLM."""
-    # Word list for word salad mode
-    WORD_LIST = [
-        "purple", "elephant", "calculator", "yesterday", "moon", "basket", "thinking",
-        "telephone", "mountain", "running", "quickly", "tomorrow", "happiness", "keyboard",
-        "window", "dancing", "coffee", "planet", "singing", "computer", "orange", "flying",
-        "bicycle", "dream", "ocean", "pencil", "laughing", "cloud", "table", "walking",
-        "music", "river", "chair", "jumping", "sun", "book", "swimming", "star", "door",
-        "cooking", "tree", "writing", "sky", "flower", "playing", "rain", "paper", "sleeping"
-    ]
     def __init__(
         self,
         nonsense_mode: str,
@@ -46,6 +36,18 @@ class ProgrammaticNonsenseGenerator:
         self.contrastive_set_name = contrastive_set_name
         self.trait_label = trait_label
         self.trait_description = trait_description
+        self._valid_words = None
+    def set_tokenizer(self, tokenizer) -> None:
+        """Extract valid words from tokenizer vocabulary.
+
+        Every vocabulary entry is decoded individually and stripped of
+        surrounding whitespace; entries that are purely alphabetic and between
+        2 and 14 characters long are kept. The resulting unique words are
+        stored on ``self._valid_words``.
+
+        Args:
+            tokenizer: Object exposing ``get_vocab()`` (token -> id mapping)
+                and ``decode(list_of_ids)``.
+        """
+        vocab = tokenizer.get_vocab()
+        unique_words = set()
+        for token_id in vocab.values():
+            clean = tokenizer.decode([token_id]).strip()
+            if clean.isalpha() and 1 < len(clean) < 15:
+                unique_words.add(clean)
+        # Sort instead of relying on set iteration order: string hashing is
+        # randomized per process, so an unsorted pool would make generated
+        # nonsense differ between runs even when `random` is seeded.
+        self._valid_words = sorted(unique_words)
     def generate(self, num_pairs: int = 10) -> ContrastivePairSet:
         """
@@ -108,11 +110,14 @@ class ProgrammaticNonsenseGenerator:
     def _generate_repetitive(self) -> str:
         """Generate pathologically repetitive text."""
+        if self._valid_words is None:
+            raise ValueError("Tokenizer must be set. Call set_tokenizer() first.")
         # Pick a random word or phrase
         choices = [
             random.choice(string.ascii_lowercase),  # Single letter
-            random.choice(self.WORD_LIST),  # Single word
-            ' '.join(random.sample(self.WORD_LIST, 2)),  # Two-word phrase
+            random.choice(self._valid_words),  # Single word
+            ' '.join(random.sample(self._valid_words, 2)),  # Two-word phrase
         ]
         unit = random.choice(choices)
@@ -121,13 +126,20 @@ class ProgrammaticNonsenseGenerator:
         return ' '.join([unit] * repetitions)
     def _generate_word_salad(self) -> str:
-        """Generate word salad (real words, no meaning)."""
-        num_words = random.randint(8, 15)
-        words = random.choices(self.WORD_LIST, k=num_words)
-        return ' '.join(words)
+        """Generate word salad (random tokens from tokenizer vocabulary)."""
+        num_words = random.randint(3, 10)
+        if self._valid_words is not None:
+            words = random.choices(self._valid_words, k=num_words)
+            return ' '.join(words)
+        raise ValueError("Tokenizer must be set to generate word salad. Call set_tokenizer() first.")
     def _generate_mixed(self) -> str:
         """Generate mixed nonsense (combination of all types)."""
+        if self._valid_words is None:
+            raise ValueError("Tokenizer must be set. Call set_tokenizer() first.")
         components = []
         # Add 2-4 different types of nonsense
@@ -140,11 +152,11 @@ class ProgrammaticNonsenseGenerator:
                 length = random.randint(5, 15)
                 components.append(''.join(random.choices(string.ascii_lowercase, k=length)))
             elif mode == 'repetitive':
-                word = random.choice(self.WORD_LIST)
+                word = random.choice(self._valid_words)
                 reps = random.randint(3, 6)
                 components.append(' '.join([word] * reps))
             else:  # word_salad
                 num_words = random.randint(3, 6)
-                components.append(' '.join(random.choices(self.WORD_LIST, k=num_words)))
+                components.append(' '.join(random.choices(self._valid_words, k=num_words)))
         return ' '.join(components)

wisent/core/trainers/steering_trainer.py CHANGED Viewed

@@ -10,9 +10,9 @@ import datetime as _dt
 from wisent.core.activations.core.atoms import (
     LayerActivations,
-    ActivationAggregationStrategy,
     RawActivationMap,
 )
+from wisent.core.activations.extraction_strategy import ExtractionStrategy
 from wisent.core.models.wisent_model import WisentModel
 from wisent.core.trainers.core.atoms import (
@@ -48,8 +48,8 @@ class WisentSteeringTrainer(BaseSteeringTrainer):
         model: WisentModel to use for activation collection.
         pair_set: ContrastivePairSet with pairs to use for collection and training.
         steering_method: BaseSteeringMethod instance to use for training.
-        store_device: Device to store collected activations on (default "cpu").
-        dtype: Optional torch.dtype to cast collected activations to (default None, meaning no cast).
+        store_device: Device to store collected activations on (default: "cpu" to avoid GPU OOM).
+        dtype: Optional torch.dtype to cast collected activations to.
     """
     model: WisentModel
@@ -66,8 +66,7 @@ class WisentSteeringTrainer(BaseSteeringTrainer):
         self,
         layers_spec: Sequence[str] | str | int | Sequence[int] | None,
         method_kwargs: dict[str, Any] | None = None,
-        aggregation: ActivationAggregationStrategy = ActivationAggregationStrategy.CONTINUATION_TOKEN,
-        return_full_sequence: bool = False,
+        strategy: ExtractionStrategy = ExtractionStrategy.CHAT_LAST,
         normalize_layers: bool = False,
         save_dir: str | Path | None = None,
         accept_low_quality_vector: bool = False,
@@ -87,16 +86,10 @@ class WisentSteeringTrainer(BaseSteeringTrainer):
                 - range string "10-30" / "10..30"
                 - single int "12"
                 - None → use all available layers on the model
-            method:
-                Name of steering method ("caa").
             method_kwargs:
                 Dict of hyperparameters for the method (e.g., {"normalize": True, "scale": 1.0}).
-            aggregation:
-                ActivationAggregationStrategy to use during collection when not returning
-                full sequences. Ignored if 'return_full_sequence=True'.
-            return_full_sequence:
-                If True, store full [T,H] sequences per layer (method then must know how
-                to collapse to vectors). Default False (collect [H] vectors directly).
+            strategy:
+                ExtractionStrategy to use during collection.
             normalize_layers:
                 If True, L2-normalize activations layer-wise during collection.
             save_dir:
@@ -112,12 +105,11 @@ class WisentSteeringTrainer(BaseSteeringTrainer):
         # 2) Collect activations for each pair
         for i, pair in enumerate(self.pair_set.pairs):
-            updated = self.collector.collect_for_pair(
+            updated = self.collector.collect(
                 pair,
+                strategy=strategy,
                 layers=layers,
-                aggregation=aggregation,
-                return_full_sequence=return_full_sequence,
-                normalize_layers=normalize_layers,
+                normalize=normalize_layers,
             )
             self.pair_set.pairs[i] = updated
@@ -221,8 +213,7 @@ class WisentSteeringTrainer(BaseSteeringTrainer):
             "layers_used": layers or "all",
             "method": self.steering_method.name,
             "method_kwargs": method_kwargs,
-            "activation_aggregation_strategy": (None if return_full_sequence else aggregation),
-            "return_full_sequence": bool(return_full_sequence),
+            "extraction_strategy": strategy.value,
             "normalize_layers": bool(normalize_layers),
             "num_pairs": len(self.pair_set.pairs),
             "hidden_size": getattr(self.model, "hidden_size", None),
@@ -290,7 +281,7 @@ class WisentSteeringTrainer(BaseSteeringTrainer):
         # Vectors
         raw_map: RawActivationMap = result.steered_vectors.to_dict()  # still tensors
-        cpu_map = {k: (v.detach().to("cpu") if isinstance(v, torch.Tensor) else v) for k, v in raw_map.items() if k != "_activation_aggregation_strategy"}
+        cpu_map = {k: (v.detach().to("cpu") if isinstance(v, torch.Tensor) else v) for k, v in raw_map.items()}
         torch.save(cpu_map, out / "steering_vectors.pt")
         # Summary (json-serializable)

wisent/core/utils/device.py CHANGED Viewed

@@ -95,23 +95,19 @@ def preferred_dtype(kind: DeviceKind | None = None) -> torch.dtype:
     """
     Return the preferred dtype for model loading.
-    Default is float32 for consistency across all devices. This ensures steering
-    vectors trained on one device work identically on another.
+    Default is device-optimized dtype (bfloat16 on CUDA, float16 on MPS, float32 on CPU).
     Priority:
     1. Global override set via set_default_dtype()
     2. WISENT_DTYPE environment variable ("float32", "float16", "bfloat16", "auto")
-    3. Default: float32 (consistent across all devices)
+    3. Default: device-optimized (bfloat16 on CUDA, float16 on MPS, float32 on CPU)
-    To use device-optimized dtypes for better performance (at cost of cross-device
-    consistency), set WISENT_DTYPE=auto or call set_default_dtype("auto").
     Example:
-        >>> preferred_dtype()  # Always float32 by default
-        torch.float32
-        >>> set_default_dtype("auto")  # Use device-optimized dtypes
-        >>> preferred_dtype()  # Now bfloat16 on CUDA, float16 on MPS
+        >>> preferred_dtype()  # bfloat16 on CUDA, float16 on MPS, float32 on CPU
         torch.bfloat16
+        >>> set_default_dtype("float32")  # Force float32 everywhere
+        >>> preferred_dtype()
+        torch.float32
     """
     # Check global override first
     if _global_dtype_override is not None:
@@ -126,8 +122,8 @@ def preferred_dtype(kind: DeviceKind | None = None) -> torch.dtype:
             return device_optimized_dtype(kind)
         return env_dtype
-    # Default: float32 for consistency across all devices
-    return torch.float32
+    # Default: use device-optimized dtype for best performance
+    return device_optimized_dtype(kind)
 def device_optimized_dtype(kind: DeviceKind | None = None) -> torch.dtype:
@@ -159,8 +155,14 @@ def device_optimized_dtype(kind: DeviceKind | None = None) -> torch.dtype:
 # Steering Vector dtype utilities
 # ============================================================================
-# Default dtype for storing steering vectors (float32 for cross-device compatibility)
-STEERING_VECTOR_DTYPE = torch.float32
+def steering_vector_dtype() -> torch.dtype:
+    """Return the dtype for steering vectors (uses preferred_dtype()).
+
+    Delegates to ``preferred_dtype()`` with no device kind, so the result
+    follows that function's resolution order rather than being a fixed
+    constant.
+    """
+    return preferred_dtype()
+# Legacy constant for backward compatibility - use steering_vector_dtype() instead
+STEERING_VECTOR_DTYPE = torch.float32  # Deprecated: kept for backward compat only
 def save_steering_vector(
@@ -172,10 +174,7 @@ def save_steering_vector(
     metadata: dict | None = None,
 ) -> None:
     """
-    Save a steering vector with dtype metadata for cross-device compatibility.
-    Vectors are always stored in float32 for consistency across devices,
-    but the original dtype is preserved in metadata for reference.
+    Save a steering vector with dtype metadata.
     Args:
         path: File path to save to (.pt)
@@ -187,21 +186,22 @@ def save_steering_vector(
     """
     # Store original dtype before conversion
     original_dtype = vector.dtype
+    storage_dtype = steering_vector_dtype()
-    # Convert to float32 for cross-device compatibility
-    vector_f32 = vector.to(dtype=STEERING_VECTOR_DTYPE, device="cpu")
+    # Store in preferred dtype
+    vector_stored = vector.to(dtype=storage_dtype, device="cpu")
     save_data = {
         # Primary data
-        "steering_vector": vector_f32,
+        "steering_vector": vector_stored,
         "layer": layer,
         "model": model_name,
         "method": method,
         # Dtype metadata
         "original_dtype": str(original_dtype),
-        "storage_dtype": str(STEERING_VECTOR_DTYPE),
+        "storage_dtype": str(storage_dtype),
         # Legacy keys for backward compatibility
-        "vector": vector_f32,
+        "vector": vector_stored,
         "layer_index": layer,
     }
@@ -233,7 +233,8 @@ def load_steering_vector(
         - "original_dtype": Original dtype when saved
         - "metadata": Any additional metadata
     """
-    data = torch.load(path, map_location="cpu", weights_only=False)
+    target_device = device or resolve_default_device()
+    data = torch.load(path, map_location=target_device, weights_only=False)
     # Get the vector (support both old and new key names)
     vector = data.get("steering_vector") or data.get("vector")
@@ -241,10 +242,9 @@ def load_steering_vector(
         raise FileLoadError(file_path=str(path), reason="No steering vector found")
     # Determine target dtype
-    target_dtype = dtype or preferred_dtype(device)
-    target_device = device or resolve_default_device()
+    target_dtype = dtype or preferred_dtype(target_device)
-    # Convert to target dtype/device
+    # Convert to target dtype/device if needed
     vector = vector.to(dtype=target_dtype, device=target_device)
     return {

wisent/core/utils/layer_combinations.py ADDED Viewed

@@ -0,0 +1,70 @@
+"""Generate all layer combinations for hyperparameter search."""
+from itertools import combinations
+from math import comb
+from typing import List
+def get_layer_combinations(num_layers: int, max_combo_size: int, single_and_all_only: bool = True) -> List[List[int]]:
+    """
+    Generate layer combinations up to a maximum combination size.
+
+    Every combination appears exactly once, so a search loop never evaluates
+    the same layer set twice.
+
+    Args:
+        num_layers: Total number of layers in the model
+        max_combo_size: Maximum number of layers in a combination (e.g., 3)
+        single_and_all_only: If True, only return single layers and all layers together
+                             (skip 2-layer, 3-layer combinations). Default: True
+
+    Returns:
+        List of layer combinations, in this order:
+        - All layers together: [0, 1, 2, ..., num_layers-1]
+        - All individual layers: [0], [1], ..., [num_layers-1]
+        - (if not single_and_all_only) All combinations of 2, 3, ..., max_combo_size layers
+        An empty list when num_layers is not positive.
+    """
+    all_layers = list(range(num_layers))
+    if not all_layers:
+        # No layers means there is nothing to search over; an empty
+        # combination would not be a usable search candidate.
+        return []
+
+    # All layers together (always included)
+    result = [all_layers]
+
+    # All individual layers. For a one-layer model the single layer *is* the
+    # all-layers entry, so it is not repeated.
+    if num_layers > 1:
+        result.extend([layer] for layer in all_layers)
+
+    # All combinations of 2, 3, ..., max_combo_size layers (unless single_and_all_only).
+    # Sizes are capped at num_layers - 1 because the full-size combination is
+    # already present as the all-layers entry.
+    if not single_and_all_only:
+        largest_size = min(max_combo_size, num_layers - 1)
+        for size in range(2, largest_size + 1):
+            result.extend(list(combo) for combo in combinations(all_layers, size))
+
+    return result
+def get_layer_combinations_count(num_layers: int, max_combo_size: int, single_and_all_only: bool = False) -> int:
+    """
+    Calculate total number of layer combinations without generating them.
+
+    Matches ``len(get_layer_combinations(num_layers, max_combo_size, single_and_all_only))``.
+
+    Total = 1 (all layers) + C(n,1) + C(n,2) + ... + C(n, max_combo_size),
+    without counting the all-layers set a second time.
+
+    Args:
+        num_layers: Total number of layers in the model
+        max_combo_size: Maximum number of layers in a combination
+        single_and_all_only: If True, count only the all-layers entry and the
+                             single layers. Defaults to False so existing
+                             two-argument callers keep getting the full count.
+    """
+    if num_layers <= 0:
+        return 0
+
+    total = 1  # all layers
+    if num_layers > 1:
+        total += num_layers  # individual layers
+    if not single_and_all_only:
+        largest_size = min(max_combo_size, num_layers - 1)
+        total += sum(comb(num_layers, size) for size in range(2, largest_size + 1))
+    return total
+if __name__ == "__main__":
+    # Test with 16 layers (like Llama-3.2-1B) and max_combo_size=3
+    num_layers = 16
+    max_combo_size = 3
+    single_and_all_only = True
+
+    combos = get_layer_combinations(num_layers, max_combo_size, single_and_all_only)
+    # Pass the same flag to the counter so "Expected" describes the list above.
+    expected = get_layer_combinations_count(num_layers, max_combo_size, single_and_all_only)
+
+    print(f"Model with {num_layers} layers, max_combo_size={max_combo_size}:")
+    print(f"Total combinations: {len(combos)}")
+    print(f"Expected: {expected}")
+    print()
+    print("First 20 combinations:")
+    for position, combo in enumerate(combos[:20], start=1):
+        print(f"  {position}: {combo}")
+    if len(combos) > 20:
+        print(f"  ... and {len(combos) - 20} more")

wisent/examples/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ # Wisent examples

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl