PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl - Mend

wisent 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (725) hide show

wisent/core/activations/classifier_inference_strategy.py ADDED Viewed

@@ -0,0 +1,194 @@
+"""
+Classifier inference strategies for runtime classification.
+These strategies determine how to extract activations from generated text
+at inference time when classifying responses.
+Based on empirical testing across 3 models (Llama-3.2-1B, Llama-2-7b, Qwen3-8B)
+and 4 tasks (truthfulqa, happy, left_wing, livecodebench):
+Results:
+- last_token: 66.3% avg accuracy (94.4% when paired with chat_last training)
+- all_mean: 65.9% avg accuracy
+- all_min: 53.5% avg accuracy
+- all_max: 53.3% avg accuracy
+- first_token: 50.0% avg accuracy (completely useless - BOS token is identical for all inputs)
+Recommendation: Use LAST_TOKEN (default) - it works best with chat_last training strategy.
+"""
+from enum import Enum
+from typing import Optional
+import argparse
+import torch
+import numpy as np
+class ClassifierInferenceStrategy(str, Enum):
+    """
+    Ways of selecting or scoring token activations of a generated response
+    when a classifier is applied at inference time.
+    Members are also plain strings, so each one compares equal to its
+    value (for example ``"last_token"``).
+    """
+    # Activation of the last token only. Best overall performance.
+    LAST_TOKEN = "last_token"
+    # Activation of the first token only. NOT RECOMMENDED - BOS token has no variance.
+    FIRST_TOKEN = "first_token"
+    # Classify each token, return the mean of all scores.
+    ALL_MEAN = "all_mean"
+    # Classify each token, return the max score (most confident positive).
+    ALL_MAX = "all_max"
+    # Classify each token, return the min score (most confident negative).
+    ALL_MIN = "all_min"
+    @property
+    def description(self) -> str:
+        """Short human-readable label for this strategy."""
+        kind = ClassifierInferenceStrategy
+        labels = {
+            kind.LAST_TOKEN: "Last token activation (recommended)",
+            kind.FIRST_TOKEN: "First token activation (not recommended)",
+            kind.ALL_MEAN: "Mean of all token scores",
+            kind.ALL_MAX: "Max of all token scores",
+            kind.ALL_MIN: "Min of all token scores",
+        }
+        if self in labels:
+            return labels[self]
+        return "Unknown strategy"
+    @classmethod
+    def default(cls) -> "ClassifierInferenceStrategy":
+        """Strategy used when none is given: LAST_TOKEN."""
+        return cls.LAST_TOKEN
+    @classmethod
+    def list_all(cls) -> list[str]:
+        """Values of every strategy, in definition order."""
+        values = []
+        for member in cls:
+            values.append(member.value)
+        return values
+def extract_inference_activation(
+    strategy: ClassifierInferenceStrategy,
+    hidden_states: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Reduce per-token hidden states to one activation vector for classification.
+    ALL_MAX / ALL_MIN here pick the token whose activation has the largest /
+    smallest L2 norm; they do not aggregate classifier scores (that is what
+    get_inference_score does).
+    Args:
+        strategy: The inference strategy to use
+        hidden_states: Hidden states tensor of shape [seq_len, hidden_dim]
+    Returns:
+        Activation vector of shape [hidden_dim]
+    """
+    if strategy == ClassifierInferenceStrategy.FIRST_TOKEN:
+        return hidden_states[0]
+    if strategy == ClassifierInferenceStrategy.ALL_MEAN:
+        return hidden_states.mean(dim=0)
+    if strategy in (ClassifierInferenceStrategy.ALL_MAX,
+                    ClassifierInferenceStrategy.ALL_MIN):
+        # Per-token L2 norms, shape [seq_len]
+        norms = torch.norm(hidden_states, dim=1)
+        pick = torch.argmax if strategy == ClassifierInferenceStrategy.ALL_MAX else torch.argmin
+        return hidden_states[pick(norms)]
+    # LAST_TOKEN, and the fallback for any unrecognised strategy
+    return hidden_states[-1]
+def get_inference_score(
+    classifier,
+    hidden_states: torch.Tensor,
+    strategy: ClassifierInferenceStrategy,
+) -> float:
+    """
+    Get classifier score using the specified inference strategy.
+    For single-token strategies (last_token, first_token), returns the classifier
+    probability for that token.
+    For all_* strategies, every token is scored in one batched predict_proba
+    call and the per-token scores are aggregated (mean / max / min).
+    Any unrecognised strategy falls back to the last token.
+    Args:
+        classifier: A trained classifier with predict_proba method; column 1 of
+            its output is read as the positive-class probability
+        hidden_states: Hidden states tensor of shape [seq_len, hidden_dim]
+        strategy: The inference strategy to use
+    Returns:
+        Classification score (probability of positive class)
+    Raises:
+        ValueError: If hidden_states contains no tokens
+    """
+    hidden_np = hidden_states.cpu().float().numpy()
+    if hidden_np.shape[0] == 0:
+        # Fail the same way for every strategy instead of NaN / IndexError
+        raise ValueError("hidden_states must contain at least one token")
+    if strategy == ClassifierInferenceStrategy.FIRST_TOKEN:
+        return float(classifier.predict_proba([hidden_np[0]])[0, 1])
+    if strategy in (ClassifierInferenceStrategy.ALL_MEAN,
+                    ClassifierInferenceStrategy.ALL_MAX,
+                    ClassifierInferenceStrategy.ALL_MIN):
+        # One call over the [seq_len, hidden_dim] matrix instead of seq_len calls
+        token_scores = np.asarray(classifier.predict_proba(hidden_np))[:, 1]
+        if strategy == ClassifierInferenceStrategy.ALL_MEAN:
+            return float(token_scores.mean())
+        if strategy == ClassifierInferenceStrategy.ALL_MAX:
+            return float(token_scores.max())
+        return float(token_scores.min())
+    # LAST_TOKEN, and the fallback for any unrecognised strategy
+    return float(classifier.predict_proba([hidden_np[-1]])[0, 1])
+def get_recommended_inference_strategy(train_strategy) -> ClassifierInferenceStrategy:
+    """
+    Map a training-time extraction strategy to the inference strategy to pair it with.
+    Based on empirical testing:
+    - chat_last, role_play, mc_balanced -> last_token (94.4%, 72.4%, 60.2%)
+    - chat_mean, chat_weighted, chat_max_norm, chat_first, chat_gen_point -> all_mean
+    Args:
+        train_strategy: ExtractionStrategy used for training
+    Returns:
+        LAST_TOKEN for the three last-token training strategies, ALL_MEAN for
+        anything else
+    """
+    # Deferred import to avoid a circular dependency
+    from wisent.core.activations.extraction_strategy import ExtractionStrategy
+    last_token_trained = (
+        ExtractionStrategy.CHAT_LAST,
+        ExtractionStrategy.ROLE_PLAY,
+        ExtractionStrategy.MC_BALANCED,
+    )
+    if train_strategy not in last_token_trained:
+        return ClassifierInferenceStrategy.ALL_MEAN
+    return ClassifierInferenceStrategy.LAST_TOKEN
+def add_classifier_inference_strategy_args(parser: argparse.ArgumentParser) -> None:
+    """
+    Register the --classifier-inference-strategy option on the given parser.
+    The option is a string restricted to the ClassifierInferenceStrategy values
+    and defaults to the value of ClassifierInferenceStrategy.default().
+    """
+    available = ClassifierInferenceStrategy.list_all()
+    fallback = ClassifierInferenceStrategy.default().value
+    help_text = f"Inference strategy for classifier. Options: {', '.join(available)}. Default: {fallback}"
+    parser.add_argument(
+        "--classifier-inference-strategy",
+        type=str,
+        default=fallback,
+        choices=available,
+        help=help_text,
+    )

wisent/core/activations/core/atoms.py CHANGED Viewed

@@ -1,60 +1,16 @@
 from __future__ import annotations
-from enum import Enum, auto, unique
 from typing import Mapping, Iterator, TypeAlias
 import numpy as np
 import torch
-import sys
-from wisent.core.errors import UnknownTypeError
-# Python 3.10 compatibility
-if sys.version_info >= (3, 11):
-    from enum import StrEnum
-else:
-    class StrEnum(str, Enum):
-        """StrEnum backport for Python < 3.11"""
-        def _generate_next_value_(name, start, count, last_values):
-            return name.lower()
-        def __str__(self) -> str:
-            return str(self.value)
-__all__ = ["LayerActivations", "ActivationAggregationStrategy", "ActivationCollector", "LayerName", "LayerActivation", "ActivationMap", "RawActivationMap"]
+__all__ = ["LayerActivations", "LayerName", "LayerActivation", "ActivationMap", "RawActivationMap"]
 LayerName: TypeAlias = str
 LayerActivation: TypeAlias = torch.Tensor | None
 ActivationMap: TypeAlias = Mapping[LayerName, LayerActivation]
 RawActivationMap: TypeAlias = Mapping[LayerName, torch.Tensor | np.ndarray | None]
-class _LowerSnakeStrEnum(StrEnum):
-    """StrEnum whose auto() values are lower_snake_case of the member name."""
-    def _generate_next_value_(name, start, count, last_values): # type: ignore
-        return name.lower()
-@unique
-class ActivationAggregationStrategy(_LowerSnakeStrEnum):
-    """Strategies for selecting/aggregating tokens in activation extraction.
-    """
-    CHOICE_TOKEN = auto()         # target A/B choice tokens (multiple choice)
-    CONTINUATION_TOKEN = auto()   # first token of the continuation
-    LAST_TOKEN = auto()           # always use the last token
-    FIRST_TOKEN = auto()          # always use the first token
-    MEAN_POOLING = auto()         # mean over all tokens
-    MAX_POOLING = auto()          # max over all tokens
-    @property
-    def description(self) -> str:
-        return {
-            ActivationAggregationStrategy.CHOICE_TOKEN: "Target A/B choice tokens (multiple choice).",
-            ActivationAggregationStrategy.CONTINUATION_TOKEN: "Use the first token of the continuation.",
-            ActivationAggregationStrategy.LAST_TOKEN: "Always select the last token.",
-            ActivationAggregationStrategy.FIRST_TOKEN: "Always select the first token.",
-            ActivationAggregationStrategy.MEAN_POOLING: "Aggregate by mean over all tokens.",
-            ActivationAggregationStrategy.MAX_POOLING: "Aggregate by max over all tokens.",
-        }[self]
 class LayerActivations(Mapping[LayerName, LayerActivation]):
     """Immutable mapping of layer names to activations.
@@ -72,8 +28,6 @@ class LayerActivations(Mapping[LayerName, LayerActivation]):
     atributes:
         _data:
             internal storage dict. It contains information about layer activations.
-        _strategy:
-            'ActivationAggregationStrategy' (see below). Indicates how activations were aggregated if applicable.
     methods:
         'summary()':
@@ -88,13 +42,11 @@ class LayerActivations(Mapping[LayerName, LayerActivation]):
             plain dict (useful for (de)serialization).
     examples:
-        >>> acts = LayerActivations({"layer1": torch.randn(2, 10, 768), "layer2": None}, activation_aggregation_strategy="mean_pooling")
+        >>> acts = LayerActivations({"layer1": torch.randn(2, 10, 768), "layer2": None})
         >>> acts["layer1"].shape
         torch.Size([2, 10, 768])
         >>> acts["layer2"] is None
         True
-        >>> acts.activation_aggregation_strategy
-        <ActivationAggregationStrategy.MEAN_POOLING: 'mean_pooling'>
         >>> acts.summary()
         {'layer1': {'shape': (2, 10, 768), 'dtype': 'torch.float32', 'device': 'cpu', 'requires_grad': False}, 'layer2': {'shape': None, 'dtype': None, 'device': None, 'requires_grad': None}}
         >>> acts.numpy()
@@ -104,19 +56,14 @@ class LayerActivations(Mapping[LayerName, LayerActivation]):
           layer1: Tensor(shape=(2, 10, 768), dtype=torch.float32, device=cuda:0)
           layer2: None
         )
-        >>> acts.detach()  # if any tensor required grad
-        LayerActivations(
-          layer1: Tensor(shape=(2, 10, 768), dtype=torch.float32, device=cpu)
-          layer2: None
-        )
     notes:
         - Use 'summary()' or 'numpy()' if you need JSON-serializable content.
         - Keys are strings by convention; enforced by type hints.
     """
-    __slots__ = ("_data", "_strategy")
+    __slots__ = ("_data",)
-    def __init__(self, data: RawActivationMap | None = None, activation_aggregation_strategy: ActivationAggregationStrategy | None = None, dtype: torch.dtype | None = None):
+    def __init__(self, data: RawActivationMap | None = None, dtype: torch.dtype | None = None):
         store: dict[LayerName, LayerActivation] = {}
         if data:
             for layer, val in data.items():
@@ -132,33 +79,6 @@ class LayerActivations(Mapping[LayerName, LayerActivation]):
                         f"Activations for layer '{layer}' must be torch.Tensor, np.ndarray, or None."
                     )
         self._data = store
-        self._strategy = self._normalize_strategy(activation_aggregation_strategy)
-    @staticmethod
-    def _normalize_strategy(
-        s: ActivationAggregationStrategy | str | None
-    ) -> ActivationAggregationStrategy | None:
-        if s is None:
-            return None
-        if isinstance(s, ActivationAggregationStrategy):
-            return s
-        if isinstance(s, str):
-            try:
-                return ActivationAggregationStrategy(s)
-            except ValueError:
-                valid = [e.value for e in ActivationAggregationStrategy]
-                raise UnknownTypeError(
-                    entity_type="activation_agregation_strategy",
-                    value=s,
-                    valid_values=valid
-                )
-        raise TypeError(
-            "activation_agregation_strategy must be ActivationAggregationStrategy | str | None"
-        )
-    @property
-    def activation_aggregation_strategy(self) -> ActivationAggregationStrategy | None:
-        return self._strategy
     def __getitem__(self, key: LayerName) -> LayerActivation:
         return self._data[key]
@@ -168,10 +88,10 @@ class LayerActivations(Mapping[LayerName, LayerActivation]):
         return len(self._data)
     def summary(self) -> dict[LayerName, dict[str, tuple | str | bool | None]]:
-        ''' Return a summary of the activations. For each layer, provides
-        shape, dtype, device, requires_grad status, and aggregation strategy.
-        '''
-        out: dict[LayerName, dict[str, dict[str, tuple | str | bool | None]]] = {}
+        """Return a summary of the activations. For each layer, provides
+        shape, dtype, device, requires_grad status.
+        """
+        out: dict[LayerName, dict[str, tuple | str | bool | None]] = {}
         for k, v in self._data.items():
             if isinstance(v, torch.Tensor):
                 out[k] = {
@@ -182,8 +102,6 @@ class LayerActivations(Mapping[LayerName, LayerActivation]):
                 }
             else:
                 out[k] = {"shape": None, "dtype": None, "device": None, "requires_grad": None}
-        out["_activation_aggregation_strategy"] = {"strategy": self._strategy.value if self._strategy else None}
         return out
     def numpy(self) -> dict[LayerName, np.ndarray | None]:
@@ -214,6 +132,4 @@ class LayerActivations(Mapping[LayerName, LayerActivation]):
             else:
                 lines.append(f"  {k}: None")
         lines.append(")")
-        lines.append(f"  _activation_aggregation_strategy: {self._strategy.value if self._strategy else None}")
         return "\n".join(lines)

wisent/core/activations/extraction_strategy.py ADDED Viewed

@@ -0,0 +1,308 @@
+"""
+Unified extraction strategies for activation collection.
+These strategies combine prompt construction and token extraction into a single
+unified approach, based on empirical testing of what actually works.
+The strategies are:
+- chat_mean: Chat template prompt, mean of answer tokens
+- chat_first: Chat template prompt, first answer token
+- chat_last: Chat template prompt, last token
+- chat_gen_point: Chat template prompt, token before answer (generation decision point)
+- chat_max_norm: Chat template prompt, token with max norm in answer
+- chat_weighted: Chat template prompt, position-weighted mean (earlier tokens weighted more)
+- role_play: "Behave like person who answers Q with A" format, last token
+- mc_balanced: Multiple choice with balanced A/B assignment, last token
+"""
+from enum import Enum
+from typing import Tuple, Optional
+import argparse
+import torch
+class ExtractionStrategy(str, Enum):
+    """
+    Unified extraction strategies: each member fixes both the prompt format
+    and which token(s) the activation is taken from.
+    These replace the old separate PromptConstructionStrategy and ActivationAggregationStrategy.
+    Members are also plain strings, so each one compares equal to its value.
+    """
+    # Chat template prompt with Q+A, extract mean of answer tokens.
+    CHAT_MEAN = "chat_mean"
+    # Chat template prompt with Q+A, extract first answer token.
+    CHAT_FIRST = "chat_first"
+    # Chat template prompt with Q+A, extract last token.
+    CHAT_LAST = "chat_last"
+    # Chat template prompt with Q+A, extract token before answer starts (decision point).
+    CHAT_GEN_POINT = "chat_gen_point"
+    # Chat template prompt with Q+A, extract token with max norm in answer region.
+    CHAT_MAX_NORM = "chat_max_norm"
+    # Chat template prompt with Q+A, position-weighted mean (earlier tokens weighted more).
+    CHAT_WEIGHTED = "chat_weighted"
+    # 'Behave like person who answers Q with A' format, extract last token.
+    ROLE_PLAY = "role_play"
+    # Multiple choice format with balanced A/B assignment, extract last token.
+    MC_BALANCED = "mc_balanced"
+    @property
+    def description(self) -> str:
+        """Short human-readable label for this strategy."""
+        kind = ExtractionStrategy
+        labels = {
+            kind.CHAT_MEAN: "Chat template with mean of answer tokens",
+            kind.CHAT_FIRST: "Chat template with first answer token",
+            kind.CHAT_LAST: "Chat template with last token",
+            kind.CHAT_GEN_POINT: "Chat template with generation decision point",
+            kind.CHAT_MAX_NORM: "Chat template with max-norm answer token",
+            kind.CHAT_WEIGHTED: "Chat template with position-weighted mean",
+            kind.ROLE_PLAY: "Role-playing format with last token",
+            kind.MC_BALANCED: "Balanced multiple choice with last token",
+        }
+        if self in labels:
+            return labels[self]
+        return "Unknown strategy"
+    @classmethod
+    def default(cls) -> "ExtractionStrategy":
+        """Strategy used when none is given: CHAT_LAST."""
+        return cls.CHAT_LAST
+    @classmethod
+    def list_all(cls) -> list[str]:
+        """Values of every strategy, in definition order."""
+        values = []
+        for member in cls:
+            values.append(member.value)
+        return values
+# Candidate assistant replies for the role_play strategy. One is chosen per
+# prompt from a stable digest of the prompt text, so the choice is the same
+# in every process.
+ROLE_PLAY_TOKENS = ["I", "Well", "The", "Sure", "Let", "That", "It", "This", "My", "To"]
+def _stable_prompt_hash(text: str) -> int:
+    """Return an integer digest of text that is identical across processes.
+    Built-in hash() on str is salted per interpreter process, so it cannot be
+    used for choices that must be reproducible between runs.
+    """
+    import hashlib  # local import: only this helper needs it
+    digest = hashlib.sha256(text.encode("utf-8", "surrogatepass")).digest()
+    return int.from_bytes(digest[:8], "big")
+def _render_chat_texts(tokenizer, user_content: str, assistant_content: str) -> Tuple[str, str]:
+    """Return (full_text, prompt_only) for one user/assistant exchange.
+    Uses tokenizer.apply_chat_template when available; otherwise, or when it
+    raises ValueError/KeyError, falls back to plain "user assistant" text.
+    """
+    plain_full = f"{user_content} {assistant_content}"
+    if not hasattr(tokenizer, "apply_chat_template"):
+        return plain_full, user_content
+    try:
+        prompt_only = tokenizer.apply_chat_template(
+            [{"role": "user", "content": user_content}],
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+        full_text = tokenizer.apply_chat_template(
+            [{"role": "user", "content": user_content},
+             {"role": "assistant", "content": assistant_content}],
+            tokenize=False,
+            add_generation_prompt=False,
+        )
+    except (ValueError, KeyError):
+        # Fallback for models without chat template
+        return plain_full, user_content
+    return full_text, prompt_only
+def build_extraction_texts(
+    strategy: ExtractionStrategy,
+    prompt: str,
+    response: str,
+    tokenizer,
+    other_response: Optional[str] = None,
+    is_positive: bool = True,
+) -> Tuple[str, str, Optional[str]]:
+    """
+    Build the full text for activation extraction based on strategy.
+    Args:
+        strategy: The extraction strategy to use
+        prompt: The user prompt/question
+        response: The response to extract activations for
+        tokenizer: The tokenizer (needs apply_chat_template for chat strategies)
+        other_response: For mc_balanced, the other response option
+        is_positive: For mc_balanced, whether 'response' is the positive option
+    Returns:
+        Tuple of (full_text, answer_text, prompt_only_text)
+        - full_text: Complete text to feed to model
+        - answer_text: The answer portion: the response for chat_* strategies,
+          the chosen filler token for role_play, "A" or "B" for mc_balanced
+        - prompt_only_text: Prompt without answer (for boundary detection)
+    Raises:
+        ValueError: If strategy is unknown, or mc_balanced is used without
+            other_response
+    """
+    if strategy in (ExtractionStrategy.CHAT_MEAN, ExtractionStrategy.CHAT_FIRST,
+                    ExtractionStrategy.CHAT_LAST, ExtractionStrategy.CHAT_GEN_POINT,
+                    ExtractionStrategy.CHAT_MAX_NORM, ExtractionStrategy.CHAT_WEIGHTED):
+        # All chat_* strategies use the same prompt construction
+        full_text, prompt_only = _render_chat_texts(tokenizer, prompt, response)
+        return full_text, response, prompt_only
+    if strategy == ExtractionStrategy.ROLE_PLAY:
+        # "Behave like a person that would answer Q with A" format; the
+        # assistant turn is a short filler token picked from the prompt digest
+        random_token = ROLE_PLAY_TOKENS[_stable_prompt_hash(prompt) % len(ROLE_PLAY_TOKENS)]
+        instruction = f"Behave like a person that would answer {prompt} with {response}"
+        full_text, prompt_only = _render_chat_texts(tokenizer, instruction, random_token)
+        return full_text, random_token, prompt_only
+    if strategy == ExtractionStrategy.MC_BALANCED:
+        # Multiple choice with balanced A/B assignment
+        if other_response is None:
+            raise ValueError("MC_BALANCED strategy requires other_response")
+        # Slot of the positive option depends only on the prompt, so the
+        # positive and negative call of one pair see the same A/B layout
+        pos_goes_in_b = _stable_prompt_hash(prompt) % 2 == 0
+        # 'response' sits in B exactly when it is the positive option and the
+        # positive goes in B, or it is the negative and the positive goes in A
+        response_in_b = bool(is_positive) == pos_goes_in_b
+        # Options are truncated to 200 characters
+        if response_in_b:
+            option_a, option_b, answer = other_response[:200], response[:200], "B"
+        else:
+            option_a, option_b, answer = response[:200], other_response[:200], "A"
+        mc_prompt = f"Which is correct?\nA. {option_a}\nB. {option_b}\nAnswer:"
+        full_text, prompt_only = _render_chat_texts(tokenizer, mc_prompt, answer)
+        return full_text, answer, prompt_only
+    raise ValueError(f"Unknown extraction strategy: {strategy}")
+def extract_activation(
+    strategy: ExtractionStrategy,
+    hidden_states: torch.Tensor,
+    answer_text: str,
+    tokenizer,
+    prompt_len: int,
+) -> torch.Tensor:
+    """
+    Extract the activation vector based on strategy.
+    The answer region is located by counting back from the end of the
+    sequence: the slice used is hidden_states[-n-1:-1] for n answer tokens.
+    NOTE(review): this presumes exactly one trailing token (e.g. an
+    end-of-turn marker) follows the answer - confirm against the chat template.
+    Args:
+        strategy: The extraction strategy
+        hidden_states: Hidden states tensor of shape [seq_len, hidden_dim]
+        answer_text: The answer text (for computing answer token count)
+        tokenizer: The tokenizer; only called for strategies that need the
+            answer token count
+        prompt_len: Length of prompt in tokens (boundary). Currently unused;
+            kept so existing callers keep working
+    Returns:
+        Activation vector of shape [hidden_dim]
+    """
+    answer_based = (ExtractionStrategy.CHAT_FIRST, ExtractionStrategy.CHAT_MEAN,
+                    ExtractionStrategy.CHAT_GEN_POINT, ExtractionStrategy.CHAT_MAX_NORM,
+                    ExtractionStrategy.CHAT_WEIGHTED)
+    if strategy not in answer_based:
+        # CHAT_LAST, ROLE_PLAY, MC_BALANCED and any unrecognised strategy use
+        # the last token, so the answer does not need to be tokenized
+        return hidden_states[-1]
+    seq_len = hidden_states.shape[0]
+    num_answer_tokens = len(tokenizer(answer_text, add_special_tokens=False)["input_ids"])
+    if strategy == ExtractionStrategy.CHAT_FIRST:
+        # First token of the answer
+        return hidden_states[max(0, seq_len - num_answer_tokens - 1)]
+    if strategy == ExtractionStrategy.CHAT_GEN_POINT:
+        # Last token before answer starts (decision point)
+        return hidden_states[max(0, seq_len - num_answer_tokens - 2)]
+    # Remaining strategies pool over the answer span; without a usable span
+    # they fall back to the last token
+    if num_answer_tokens <= 0 or seq_len <= num_answer_tokens:
+        return hidden_states[-1]
+    answer_hidden = hidden_states[-num_answer_tokens - 1:-1]
+    if strategy == ExtractionStrategy.CHAT_MEAN:
+        return answer_hidden.mean(dim=0)
+    if strategy == ExtractionStrategy.CHAT_MAX_NORM:
+        # Answer token with the largest L2 norm
+        return answer_hidden[torch.argmax(torch.norm(answer_hidden, dim=1))]
+    # CHAT_WEIGHTED: weights exp(-0.5 * position), normalised to sum to 1,
+    # so earlier answer tokens count more
+    positions = torch.arange(answer_hidden.shape[0], dtype=torch.float32, device=answer_hidden.device)
+    weights = torch.exp(-positions * 0.5)
+    weights = weights / weights.sum()
+    return (answer_hidden * weights.unsqueeze(1)).sum(dim=0)
+def add_extraction_strategy_args(parser: argparse.ArgumentParser) -> None:
+    """
+    Register the --extraction-strategy option on the given parser.
+    The option is a string restricted to the ExtractionStrategy values and
+    defaults to the value of ExtractionStrategy.default().
+    Usage:
+        parser = argparse.ArgumentParser()
+        add_extraction_strategy_args(parser)
+        args = parser.parse_args()
+        strategy = ExtractionStrategy(args.extraction_strategy)
+    """
+    available = ExtractionStrategy.list_all()
+    fallback = ExtractionStrategy.default().value
+    help_text = f"Extraction strategy for activations. Options: {', '.join(available)}. Default: {fallback}"
+    parser.add_argument(
+        "--extraction-strategy",
+        type=str,
+        default=fallback,
+        choices=available,
+        help=help_text,
+    )

wisent 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl

wisent 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl