PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl - Mend

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1020) hide show

wisent/core/activations/classifier_inference_strategy.py ADDED Viewed

@@ -0,0 +1,195 @@
+"""
+Classifier inference strategies for runtime classification.
+These strategies determine how to extract activations from generated text
+at inference time when classifying responses.
+Based on empirical testing across 3 models (Llama-3.2-1B, Llama-2-7b, Qwen3-8B)
+and 4 tasks (truthfulqa, happy, left_wing, livecodebench):
+Results:
+- last_token: Best performer (77% with chat_last training on truthfulqa)
+- all_mean: Poor (~50%) - dominated by shared prompt tokens
+- all_max/all_min: Poor (~50%)
+- first_token: BROKEN (50%) - BOS token is identical for all inputs
+Recommendation: Use LAST_TOKEN (default) - it works best with chat_last training strategy.
+IMPORTANT: These strategies operate on the FULL sequence (prompt + response).
+At inference time, we typically don't know where the answer starts, so we
+can only use strategies that work on the whole sequence.
+"""
+from enum import Enum
+from typing import Optional
+import argparse
+import torch
+import numpy as np
+class ClassifierInferenceStrategy(str, Enum):
+    """
+    How token activations are selected or aggregated when a classifier is
+    applied to generated text at inference time.
+    Members subclass ``str``, so each one compares equal to its string value.
+    """
+    # Use the last token only (this is what ``default()`` returns).
+    LAST_TOKEN = "last_token"
+    # Use the first token only; labelled "not recommended" in ``description``.
+    FIRST_TOKEN = "first_token"
+    # Aggregate over every token with the mean.
+    ALL_MEAN = "all_mean"
+    # Aggregate over every token with the maximum.
+    ALL_MAX = "all_max"
+    # Aggregate over every token with the minimum.
+    ALL_MIN = "all_min"
+    @property
+    def description(self) -> str:
+        """Return a short human-readable label for this strategy."""
+        # Built inside the property on purpose: a class-level dict would be
+        # turned into an extra enum member.
+        labels_by_value = {
+            "last_token": "Last token activation (recommended)",
+            "first_token": "First token activation (not recommended)",
+            "all_mean": "Mean of all token scores",
+            "all_max": "Max of all token scores",
+            "all_min": "Min of all token scores",
+        }
+        return labels_by_value.get(self.value, "Unknown strategy")
+    @classmethod
+    def default(cls) -> "ClassifierInferenceStrategy":
+        """Return the strategy used when none is specified (LAST_TOKEN)."""
+        return cls.LAST_TOKEN
+    @classmethod
+    def list_all(cls) -> list[str]:
+        """Return the string value of every strategy, in definition order."""
+        return [member.value for member in cls]
+def extract_inference_activation(
+    strategy: ClassifierInferenceStrategy,
+    hidden_states: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Reduce a sequence of hidden states to one activation vector.
+    No classifier is involved here: ALL_MAX / ALL_MIN pick the token whose
+    hidden state has the largest / smallest norm (``torch.norm`` over dim 1),
+    and ALL_MEAN averages over dim 0.
+    Args:
+        strategy: The inference strategy to use. Compared with ``==``, so the
+            plain string value of a member is accepted as well.
+        hidden_states: Hidden states tensor of shape [seq_len, hidden_dim]
+    Returns:
+        Activation vector of shape [hidden_dim]
+    Raises:
+        ValueError: If ``strategy`` is not a known strategy.
+        IndexError: If ``hidden_states`` has length 0 along its first dimension.
+    """
+    # tuple() gives ==-based membership; ``in`` on the Enum class itself
+    # rejects non-member operands on older Python versions.
+    if strategy not in tuple(ClassifierInferenceStrategy):
+        raise ValueError(f"Unknown classifier inference strategy: {strategy}")
+    # Without this guard ALL_MEAN returns a NaN vector for an empty sequence
+    # while the indexing strategies raise IndexError; fail uniformly instead.
+    if hidden_states.shape[0] == 0:
+        raise IndexError("Cannot extract an activation from an empty sequence")
+    if strategy == ClassifierInferenceStrategy.LAST_TOKEN:
+        return hidden_states[-1]
+    if strategy == ClassifierInferenceStrategy.FIRST_TOKEN:
+        return hidden_states[0]
+    if strategy == ClassifierInferenceStrategy.ALL_MEAN:
+        return hidden_states.mean(dim=0)
+    # Remaining strategies are ALL_MAX / ALL_MIN: select a token by norm.
+    token_norms = torch.norm(hidden_states, dim=1)
+    if strategy == ClassifierInferenceStrategy.ALL_MAX:
+        return hidden_states[torch.argmax(token_norms)]
+    return hidden_states[torch.argmin(token_norms)]
+def get_inference_score(
+    classifier,
+    hidden_states: torch.Tensor,
+    strategy: ClassifierInferenceStrategy,
+) -> float:
+    """
+    Get classifier score using the specified inference strategy.
+    For single-token strategies (last_token, first_token), returns the
+    classifier probability for that one token. For all_* strategies, every
+    token is scored and the per-token scores are reduced with mean/max/min.
+    Args:
+        classifier: Object with a ``predict_proba`` method whose result can be
+            indexed as ``[row, 1]``; column 1 is used as the score.
+        hidden_states: Hidden states tensor of shape [seq_len, hidden_dim]
+        strategy: The inference strategy to use. Compared with ``==``, so the
+            plain string value of a member is accepted as well.
+    Returns:
+        Classification score (probability of positive class)
+    Raises:
+        ValueError: If ``strategy`` is unknown, or if an all_* strategy is
+            requested for a sequence with no tokens.
+    """
+    # detach() so tensors that still require grad can be converted to numpy.
+    hidden_np = hidden_states.detach().cpu().float().numpy()
+    if strategy == ClassifierInferenceStrategy.LAST_TOKEN:
+        return float(classifier.predict_proba([hidden_np[-1]])[0, 1])
+    if strategy == ClassifierInferenceStrategy.FIRST_TOKEN:
+        return float(classifier.predict_proba([hidden_np[0]])[0, 1])
+    # An if-chain (not a dict keyed by members) keeps ==-based matching, so
+    # raw string values keep working.
+    if strategy == ClassifierInferenceStrategy.ALL_MEAN:
+        reduce_scores = np.mean
+    elif strategy == ClassifierInferenceStrategy.ALL_MAX:
+        reduce_scores = np.max
+    elif strategy == ClassifierInferenceStrategy.ALL_MIN:
+        reduce_scores = np.min
+    else:
+        raise ValueError(f"Unknown classifier inference strategy: {strategy}")
+    # Mean over zero scores would be NaN (max/min already raise ValueError);
+    # reject the empty sequence explicitly for all three.
+    if hidden_np.shape[0] == 0:
+        raise ValueError("Cannot score an empty sequence with an all_* strategy")
+    # One batched call instead of one predict_proba call per token.
+    # NOTE(review): assumes the classifier scores rows independently, so the
+    # batched result matches per-token calls - confirm for custom classifiers.
+    token_scores = np.asarray(classifier.predict_proba(hidden_np))[:, 1]
+    return float(reduce_scores(token_scores))
+def get_recommended_inference_strategy(train_strategy) -> ClassifierInferenceStrategy:
+    """
+    Map a training-time extraction strategy to an inference strategy.
+    CHAT_LAST, ROLE_PLAY and MC_BALANCED map to LAST_TOKEN; every other
+    value (e.g. chat_mean, chat_weighted, chat_max_norm, chat_first) maps
+    to ALL_MEAN.
+    Args:
+        train_strategy: ExtractionStrategy used for training
+    Returns:
+        Recommended ClassifierInferenceStrategy
+    """
+    # Deferred import: a module-level import would be circular.
+    from wisent.core.activations.extraction_strategy import ExtractionStrategy
+    last_token_trained = (
+        ExtractionStrategy.CHAT_LAST,
+        ExtractionStrategy.ROLE_PLAY,
+        ExtractionStrategy.MC_BALANCED,
+    )
+    if train_strategy in last_token_trained:
+        return ClassifierInferenceStrategy.LAST_TOKEN
+    return ClassifierInferenceStrategy.ALL_MEAN
+def add_classifier_inference_strategy_args(parser: argparse.ArgumentParser) -> None:
+    """
+    Register the --classifier-inference-strategy option on ``parser``.
+    The option takes one of the strategy string values and defaults to the
+    value of ``ClassifierInferenceStrategy.default()``.
+    """
+    strategy_names = ClassifierInferenceStrategy.list_all()
+    default_name = ClassifierInferenceStrategy.default().value
+    help_text = f"Inference strategy for classifier. Options: {', '.join(strategy_names)}. Default: {default_name}"
+    parser.add_argument(
+        "--classifier-inference-strategy",
+        type=str,
+        default=default_name,
+        choices=strategy_names,
+        help=help_text,
+    )

wisent/core/activations/core/atoms.py CHANGED Viewed

@@ -1,60 +1,16 @@
 from __future__ import annotations
-from enum import Enum, auto, unique
 from typing import Mapping, Iterator, TypeAlias
 import numpy as np
 import torch
-import sys
-from wisent.core.errors import UnknownTypeError
-# Python 3.10 compatibility
-if sys.version_info >= (3, 11):
-    from enum import StrEnum
-else:
-    class StrEnum(str, Enum):
-        """StrEnum backport for Python < 3.11"""
-        def _generate_next_value_(name, start, count, last_values):
-            return name.lower()
-        def __str__(self) -> str:
-            return str(self.value)
-__all__ = ["LayerActivations", "ActivationAggregationStrategy", "ActivationCollector", "LayerName", "LayerActivation", "ActivationMap", "RawActivationMap"]
+__all__ = ["LayerActivations", "LayerName", "LayerActivation", "ActivationMap", "RawActivationMap"]
 LayerName: TypeAlias = str
 LayerActivation: TypeAlias = torch.Tensor | None
 ActivationMap: TypeAlias = Mapping[LayerName, LayerActivation]
 RawActivationMap: TypeAlias = Mapping[LayerName, torch.Tensor | np.ndarray | None]
-class _LowerSnakeStrEnum(StrEnum):
-    """StrEnum whose auto() values are lower_snake_case of the member name."""
-    def _generate_next_value_(name, start, count, last_values): # type: ignore
-        return name.lower()
-@unique
-class ActivationAggregationStrategy(_LowerSnakeStrEnum):
-    """Strategies for selecting/aggregating tokens in activation extraction.
-    """
-    CHOICE_TOKEN = auto()         # target A/B choice tokens (multiple choice)
-    CONTINUATION_TOKEN = auto()   # first token of the continuation
-    LAST_TOKEN = auto()           # always use the last token
-    FIRST_TOKEN = auto()          # always use the first token
-    MEAN_POOLING = auto()         # mean over all tokens
-    MAX_POOLING = auto()          # max over all tokens
-    @property
-    def description(self) -> str:
-        return {
-            ActivationAggregationStrategy.CHOICE_TOKEN: "Target A/B choice tokens (multiple choice).",
-            ActivationAggregationStrategy.CONTINUATION_TOKEN: "Use the first token of the continuation.",
-            ActivationAggregationStrategy.LAST_TOKEN: "Always select the last token.",
-            ActivationAggregationStrategy.FIRST_TOKEN: "Always select the first token.",
-            ActivationAggregationStrategy.MEAN_POOLING: "Aggregate by mean over all tokens.",
-            ActivationAggregationStrategy.MAX_POOLING: "Aggregate by max over all tokens.",
-        }[self]
 class LayerActivations(Mapping[LayerName, LayerActivation]):
     """Immutable mapping of layer names to activations.
@@ -72,8 +28,6 @@ class LayerActivations(Mapping[LayerName, LayerActivation]):
     atributes:
         _data:
             internal storage dict. It contains information about layer activations.
-        _strategy:
-            'ActivationAggregationStrategy' (see below). Indicates how activations were aggregated if applicable.
     methods:
         'summary()':
@@ -88,13 +42,11 @@ class LayerActivations(Mapping[LayerName, LayerActivation]):
             plain dict (useful for (de)serialization).
     examples:
-        >>> acts = LayerActivations({"layer1": torch.randn(2, 10, 768), "layer2": None}, activation_aggregation_strategy="mean_pooling")
+        >>> acts = LayerActivations({"layer1": torch.randn(2, 10, 768), "layer2": None})
         >>> acts["layer1"].shape
         torch.Size([2, 10, 768])
         >>> acts["layer2"] is None
         True
-        >>> acts.activation_aggregation_strategy
-        <ActivationAggregationStrategy.MEAN_POOLING: 'mean_pooling'>
         >>> acts.summary()
         {'layer1': {'shape': (2, 10, 768), 'dtype': 'torch.float32', 'device': 'cpu', 'requires_grad': False}, 'layer2': {'shape': None, 'dtype': None, 'device': None, 'requires_grad': None}}
         >>> acts.numpy()
@@ -104,19 +56,14 @@ class LayerActivations(Mapping[LayerName, LayerActivation]):
           layer1: Tensor(shape=(2, 10, 768), dtype=torch.float32, device=cuda:0)
           layer2: None
         )
-        >>> acts.detach()  # if any tensor required grad
-        LayerActivations(
-          layer1: Tensor(shape=(2, 10, 768), dtype=torch.float32, device=cpu)
-          layer2: None
-        )
     notes:
         - Use 'summary()' or 'numpy()' if you need JSON-serializable content.
         - Keys are strings by convention; enforced by type hints.
     """
-    __slots__ = ("_data", "_strategy")
+    __slots__ = ("_data",)
-    def __init__(self, data: RawActivationMap | None = None, activation_aggregation_strategy: ActivationAggregationStrategy | None = None, dtype: torch.dtype | None = None):
+    def __init__(self, data: RawActivationMap | None = None, dtype: torch.dtype | None = None):
         store: dict[LayerName, LayerActivation] = {}
         if data:
             for layer, val in data.items():
@@ -132,33 +79,6 @@ class LayerActivations(Mapping[LayerName, LayerActivation]):
                         f"Activations for layer '{layer}' must be torch.Tensor, np.ndarray, or None."
                     )
         self._data = store
-        self._strategy = self._normalize_strategy(activation_aggregation_strategy)
-    @staticmethod
-    def _normalize_strategy(
-        s: ActivationAggregationStrategy | str | None
-    ) -> ActivationAggregationStrategy | None:
-        if s is None:
-            return None
-        if isinstance(s, ActivationAggregationStrategy):
-            return s
-        if isinstance(s, str):
-            try:
-                return ActivationAggregationStrategy(s)
-            except ValueError:
-                valid = [e.value for e in ActivationAggregationStrategy]
-                raise UnknownTypeError(
-                    entity_type="activation_agregation_strategy",
-                    value=s,
-                    valid_values=valid
-                )
-        raise TypeError(
-            "activation_agregation_strategy must be ActivationAggregationStrategy | str | None"
-        )
-    @property
-    def activation_aggregation_strategy(self) -> ActivationAggregationStrategy | None:
-        return self._strategy
     def __getitem__(self, key: LayerName) -> LayerActivation:
         return self._data[key]
@@ -168,10 +88,10 @@ class LayerActivations(Mapping[LayerName, LayerActivation]):
         return len(self._data)
     def summary(self) -> dict[LayerName, dict[str, tuple | str | bool | None]]:
-        ''' Return a summary of the activations. For each layer, provides
-        shape, dtype, device, requires_grad status, and aggregation strategy.
-        '''
-        out: dict[LayerName, dict[str, dict[str, tuple | str | bool | None]]] = {}
+        """Return a summary of the activations. For each layer, provides
+        shape, dtype, device, requires_grad status.
+        """
+        out: dict[LayerName, dict[str, tuple | str | bool | None]] = {}
         for k, v in self._data.items():
             if isinstance(v, torch.Tensor):
                 out[k] = {
@@ -182,8 +102,6 @@ class LayerActivations(Mapping[LayerName, LayerActivation]):
                 }
             else:
                 out[k] = {"shape": None, "dtype": None, "device": None, "requires_grad": None}
-        out["_activation_aggregation_strategy"] = {"strategy": self._strategy.value if self._strategy else None}
         return out
     def numpy(self) -> dict[LayerName, np.ndarray | None]:
@@ -214,6 +132,4 @@ class LayerActivations(Mapping[LayerName, LayerActivation]):
             else:
                 lines.append(f"  {k}: None")
         lines.append(")")
-        lines.append(f"  _activation_aggregation_strategy: {self._strategy.value if self._strategy else None}")
         return "\n".join(lines)

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl