mi-crow 1.0.0__py3-none-any.whl → 1.0.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mi_crow/datasets/base_dataset.py +71 -1
- mi_crow/datasets/classification_dataset.py +136 -30
- mi_crow/datasets/text_dataset.py +165 -24
- mi_crow/hooks/controller.py +12 -7
- mi_crow/hooks/implementations/layer_activation_detector.py +30 -34
- mi_crow/hooks/implementations/model_input_detector.py +87 -87
- mi_crow/hooks/implementations/model_output_detector.py +43 -42
- mi_crow/hooks/utils.py +74 -0
- mi_crow/language_model/activations.py +174 -77
- mi_crow/language_model/device_manager.py +119 -0
- mi_crow/language_model/inference.py +18 -5
- mi_crow/language_model/initialization.py +10 -6
- mi_crow/language_model/language_model.py +67 -97
- mi_crow/language_model/layers.py +16 -13
- mi_crow/language_model/persistence.py +4 -2
- mi_crow/language_model/utils.py +5 -5
- mi_crow/mechanistic/sae/concepts/autoencoder_concepts.py +157 -95
- mi_crow/mechanistic/sae/concepts/concept_dictionary.py +12 -2
- mi_crow/mechanistic/sae/concepts/text_heap.py +161 -0
- mi_crow/mechanistic/sae/modules/topk_sae.py +29 -22
- mi_crow/mechanistic/sae/sae.py +3 -1
- mi_crow/mechanistic/sae/sae_trainer.py +362 -29
- mi_crow/store/local_store.py +11 -5
- mi_crow/store/store.py +34 -1
- {mi_crow-1.0.0.dist-info → mi_crow-1.0.0.post1.dist-info}/METADATA +2 -1
- {mi_crow-1.0.0.dist-info → mi_crow-1.0.0.post1.dist-info}/RECORD +28 -26
- {mi_crow-1.0.0.dist-info → mi_crow-1.0.0.post1.dist-info}/WHEEL +0 -0
- {mi_crow-1.0.0.dist-info → mi_crow-1.0.0.post1.dist-info}/top_level.txt +0 -0
mi_crow/hooks/implementations/layer_activation_detector.py (+30 -34)

@@ -1,68 +1,59 @@
 from __future__ import annotations
 
 from typing import TYPE_CHECKING
+
 import torch
 
 from mi_crow.hooks.detector import Detector
-from mi_crow.hooks.hook import
+from mi_crow.hooks.hook import HOOK_FUNCTION_INPUT, HOOK_FUNCTION_OUTPUT, HookType
 from mi_crow.hooks.utils import extract_tensor_from_output
 
 if TYPE_CHECKING:
-
+    pass
 
 
 class LayerActivationDetector(Detector):
     """
     Detector hook that captures and saves activations during inference.
-
+
     This detector extracts activations from layer outputs and stores them
     for later use (e.g., saving to disk, further analysis).
     """
 
-    def __init__(
-        self,
-        layer_signature: str | int,
-        hook_id: str | None = None
-    ):
+    def __init__(self, layer_signature: str | int, hook_id: str | None = None, target_dtype: torch.dtype | None = None):
         """
         Initialize the activation saver detector.
-
+
         Args:
             layer_signature: Layer to capture activations from
             hook_id: Unique identifier for this hook
-
+            target_dtype: Optional dtype to convert activations to before storing
+
         Raises:
             ValueError: If layer_signature is None
         """
         if layer_signature is None:
             raise ValueError("layer_signature cannot be None for LayerActivationDetector")
 
-        super().__init__(
-
-            hook_id=hook_id,
-            store=None,
-            layer_signature=layer_signature
-        )
+        super().__init__(hook_type=HookType.FORWARD, hook_id=hook_id, store=None, layer_signature=layer_signature)
+        self.target_dtype = target_dtype
 
     def process_activations(
-
-        module: torch.nn.Module,
-        input: HOOK_FUNCTION_INPUT,
-        output: HOOK_FUNCTION_OUTPUT
+        self, module: torch.nn.Module, input: HOOK_FUNCTION_INPUT, output: HOOK_FUNCTION_OUTPUT
     ) -> None:
         """
         Extract and store activations from output.
-
+
         Handles various output types:
         - Plain tensors
         - Tuples/lists of tensors (takes first tensor)
         - Objects with last_hidden_state attribute (e.g., HuggingFace outputs)
-
+
         Args:
             module: The PyTorch module being hooked
             input: Tuple of input tensors to the module
             output: Output tensor(s) from the module
-
+
         Raises:
             RuntimeError: If tensor extraction or storage fails
         """

@@ -70,27 +61,32 @@ class LayerActivationDetector(Detector):
             tensor = extract_tensor_from_output(output)
 
             if tensor is not None:
-
-
-
-
-
+                if tensor.is_cuda:
+                    tensor_cpu = tensor.detach().to("cpu", non_blocking=True)
+                else:
+                    tensor_cpu = tensor.detach()
+
+                if self.target_dtype is not None:
+                    tensor_cpu = tensor_cpu.to(self.target_dtype)
+
+                self.tensor_metadata["activations"] = tensor_cpu
+                self.metadata["activations_shape"] = tuple(tensor_cpu.shape)
         except Exception as e:
+            layer_sig = str(self.layer_signature) if self.layer_signature is not None else "unknown"
             raise RuntimeError(
-                f"Error extracting activations in LayerActivationDetector {self.id}: {e}"
+                f"Error extracting activations in LayerActivationDetector {self.id} (layer={layer_sig}): {e}"
             ) from e
 
     def get_captured(self) -> torch.Tensor | None:
         """
         Get the captured activations from the current batch.
-
+
         Returns:
             The captured activation tensor from the current batch or None if no activations captured yet
         """
-        return self.tensor_metadata.get(
+        return self.tensor_metadata.get("activations")
 
     def clear_captured(self) -> None:
         """Clear captured activations for current batch."""
-        self.tensor_metadata.pop(
-        self.metadata.pop(
-
+        self.tensor_metadata.pop("activations", None)
+        self.metadata.pop("activations_shape", None)
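For reference, the capture path this release adds to `LayerActivationDetector.process_activations` reduces to the short standalone sketch below (hook wiring elided; the tensor and dtype are illustrative stand-ins, not part of the package):

```python
import torch

# Stand-ins for what a forward hook would receive / what the caller configured.
activations = torch.randn(2, 8, 16)   # (batch, seq, hidden)
target_dtype = torch.float16          # optional downcast before storing

if activations.is_cuda:
    # Asynchronous device-to-host copy; can overlap with remaining GPU compute.
    activations_cpu = activations.detach().to("cpu", non_blocking=True)
else:
    activations_cpu = activations.detach()

if target_dtype is not None:
    activations_cpu = activations_cpu.to(target_dtype)

print(activations_cpu.dtype, tuple(activations_cpu.shape))  # torch.float16 (2, 8, 16)
```

With the new constructor parameter, callers can request the downcast directly, e.g. `LayerActivationDetector(layer_signature="transformer.h.3", target_dtype=torch.float16)` (layer name illustrative).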
mi_crow/hooks/implementations/model_input_detector.py (+87 -87)

@@ -1,38 +1,39 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Dict,
+from typing import TYPE_CHECKING, Dict, List, Optional, Set
+
 import torch
 
 from mi_crow.hooks.detector import Detector
-from mi_crow.hooks.hook import
+from mi_crow.hooks.hook import HOOK_FUNCTION_INPUT, HOOK_FUNCTION_OUTPUT, HookType
 
 if TYPE_CHECKING:
-
+    pass
 
 
 class ModelInputDetector(Detector):
     """
     Detector hook that captures and saves tokenized inputs from model forward pass.
-
+
     This detector is designed to be attached to the root model module and captures:
     - Tokenized inputs (input_ids) from the model's forward pass
     - Attention masks (optional) that exclude both padding and special tokens
-
+
     Uses PRE_FORWARD hook to capture inputs before they are processed.
     Useful for saving tokenized inputs for analysis or training.
     """
 
     def __init__(
-
-
-
-
-
-
+        self,
+        layer_signature: str | int | None = None,
+        hook_id: str | None = None,
+        save_input_ids: bool = True,
+        save_attention_mask: bool = False,
+        special_token_ids: Optional[List[int] | Set[int]] = None,
     ):
         """
         Initialize the model input detector.
-
+
         Args:
             layer_signature: Layer to capture from (typically the root model, can be None)
             hook_id: Unique identifier for this hook

@@ -40,12 +41,7 @@ class ModelInputDetector(Detector):
             save_attention_mask: Whether to save attention_mask tensor (excludes padding and special tokens)
             special_token_ids: Optional list/set of special token IDs. If None, will extract from LanguageModel context.
         """
-        super().__init__(
-            hook_type=HookType.PRE_FORWARD,
-            hook_id=hook_id,
-            store=None,
-            layer_signature=layer_signature
-        )
+        super().__init__(hook_type=HookType.PRE_FORWARD, hook_id=hook_id, store=None, layer_signature=layer_signature)
         self.save_input_ids = save_input_ids
         self.save_attention_mask = save_attention_mask
         self.special_token_ids = set(special_token_ids) if special_token_ids is not None else None

@@ -53,90 +49,87 @@ class ModelInputDetector(Detector):
     def _extract_input_ids(self, input: HOOK_FUNCTION_INPUT) -> torch.Tensor | None:
         """
         Extract input_ids from model input.
-
+
         Handles various input formats:
         - Dict with 'input_ids' key (most common for HuggingFace models)
         - Tuple with dict as first element
         - Tuple with tensor as first element
-
+
         Args:
             input: Input to the model forward pass
-
+
         Returns:
             input_ids tensor or None if not found
         """
         if not input or len(input) == 0:
             return None
-
+
         first_item = input[0]
-
+
         if isinstance(first_item, dict):
-            if
-                return first_item[
+            if "input_ids" in first_item:
+                return first_item["input_ids"]
             return None
-
+
         if isinstance(first_item, torch.Tensor):
             return first_item
-
+
         return None
 
     def _extract_attention_mask(self, input: HOOK_FUNCTION_INPUT) -> torch.Tensor | None:
         """
         Extract attention_mask from model input.
-
+
         Args:
             input: Input to the model forward pass
-
+
         Returns:
             attention_mask tensor or None if not found
         """
         if not input or len(input) == 0:
             return None
-
+
         first_item = input[0]
-
+
         if isinstance(first_item, dict):
-            if
-                return first_item[
-
+            if "attention_mask" in first_item:
+                return first_item["attention_mask"]
+
         return None
 
     def _get_special_token_ids(self, module: torch.nn.Module) -> Set[int]:
         """
         Get special token IDs from user-provided list or from LanguageModel context.
-
+
         Priority order:
         1. self.special_token_ids (user-provided during initialization)
         2. self.context.special_token_ids (from LanguageModel initialization)
-
+
         Args:
             module: The PyTorch module being hooked (unused, kept for API compatibility)
-
+
         Returns:
             Set of special token IDs, or empty set if none available
         """
         if self.special_token_ids is not None:
             return self.special_token_ids
-
+
         if self.context is not None and self.context.special_token_ids is not None:
             return self.context.special_token_ids
-
+
         return set()
 
     def _create_combined_attention_mask(
-        self,
-        input_ids: torch.Tensor,
-        attention_mask: torch.Tensor | None,
-        module: torch.nn.Module
+        self, input_ids: torch.Tensor, attention_mask: torch.Tensor | None, module: torch.nn.Module
     ) -> torch.Tensor:
         """
         Create a combined attention mask that excludes both padding and special tokens.
-
+
         Args:
             input_ids: Input token IDs tensor (batch_size × sequence_length)
             attention_mask: Original attention mask from tokenizer (None if not provided)
             module: The PyTorch module being hooked
-
+
         Returns:
             Binary mask tensor with same shape as input_ids (1 for regular tokens, 0 for padding/special tokens)
         """

@@ -144,72 +137,71 @@ class ModelInputDetector(Detector):
             attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
         else:
             attention_mask = attention_mask.bool()
-
+
         special_token_ids = self._get_special_token_ids(module)
-
+
         if special_token_ids:
             special_ids_tensor = torch.tensor(list(special_token_ids), device=input_ids.device, dtype=input_ids.dtype)
             expanded_input = input_ids.unsqueeze(-1)
             expanded_special = special_ids_tensor.unsqueeze(0).unsqueeze(0)
             is_special = (expanded_input == expanded_special).any(dim=-1)
             attention_mask = attention_mask & ~is_special
-
+
         return attention_mask.to(torch.bool)
 
-    def set_inputs_from_encodings(
+    def set_inputs_from_encodings(
+        self, encodings: Dict[str, torch.Tensor], module: Optional[torch.nn.Module] = None
+    ) -> None:
         """
         Manually set inputs from encodings dictionary.
-
+
         This is useful when the model is called with keyword arguments,
         as PyTorch's pre_forward hook doesn't receive kwargs.
-
+
         Args:
-            encodings: Dictionary of encoded inputs (e.g., from lm.
+            encodings: Dictionary of encoded inputs (e.g., from lm.inference.execute_inference() or lm.tokenize())
             module: Optional module for extracting special token IDs. If None, will use DummyModule.
-
+
         Raises:
             RuntimeError: If tensor extraction or storage fails
         """
         try:
-            if self.save_input_ids and
-                input_ids = encodings[
-                self.tensor_metadata[
-                self.metadata[
-
-            if self.save_attention_mask and
-                input_ids = encodings[
+            if self.save_input_ids and "input_ids" in encodings:
+                input_ids = encodings["input_ids"]
+                self.tensor_metadata["input_ids"] = input_ids.detach().to("cpu")
+                self.metadata["input_ids_shape"] = tuple(input_ids.shape)
+
+            if self.save_attention_mask and "input_ids" in encodings:
+                input_ids = encodings["input_ids"]
                 if module is None:
+
                     class DummyModule:
                         pass
+
                     module = DummyModule()
-
-                original_attention_mask = encodings.get(
+
+                original_attention_mask = encodings.get("attention_mask")
                 combined_mask = self._create_combined_attention_mask(input_ids, original_attention_mask, module)
-                self.tensor_metadata[
-                self.metadata[
+                self.tensor_metadata["attention_mask"] = combined_mask.detach().to("cpu")
+                self.metadata["attention_mask_shape"] = tuple(combined_mask.shape)
         except Exception as e:
-            raise RuntimeError(
-                f"Error setting inputs from encodings in ModelInputDetector {self.id}: {e}"
-            ) from e
+            raise RuntimeError(f"Error setting inputs from encodings in ModelInputDetector {self.id}: {e}") from e
 
     def process_activations(
-
-        module: torch.nn.Module,
-        input: HOOK_FUNCTION_INPUT,
-        output: HOOK_FUNCTION_OUTPUT
+        self, module: torch.nn.Module, input: HOOK_FUNCTION_INPUT, output: HOOK_FUNCTION_OUTPUT
     ) -> None:
         """
         Extract and store tokenized inputs.
-
+
         Note: For HuggingFace models called with **kwargs, the input tuple may be empty.
         In such cases, use set_inputs_from_encodings() to manually set inputs from
-        the encodings dictionary returned by lm.
-
+        the encodings dictionary returned by lm.inference.execute_inference().
+
         Args:
             module: The PyTorch module being hooked (typically the root model)
             input: Tuple of input tensors/dicts to the module
             output: Output from the module (None for PRE_FORWARD hooks)
-
+
         Raises:
             RuntimeError: If tensor extraction or storage fails
         """

@@ -217,34 +209,42 @@ class ModelInputDetector(Detector):
             if self.save_input_ids:
                 input_ids = self._extract_input_ids(input)
                 if input_ids is not None:
-
-
-
+                    if input_ids.is_cuda:
+                        input_ids_cpu = input_ids.detach().to("cpu", non_blocking=True)
+                    else:
+                        input_ids_cpu = input_ids.detach()
+                    self.tensor_metadata["input_ids"] = input_ids_cpu
+                    self.metadata["input_ids_shape"] = tuple(input_ids_cpu.shape)
+
             if self.save_attention_mask:
                 input_ids = self._extract_input_ids(input)
                 if input_ids is not None:
                     original_attention_mask = self._extract_attention_mask(input)
                     combined_mask = self._create_combined_attention_mask(input_ids, original_attention_mask, module)
-
-
-
+                    if combined_mask.is_cuda:
+                        combined_mask_cpu = combined_mask.detach().to("cpu", non_blocking=True)
+                    else:
+                        combined_mask_cpu = combined_mask.detach()
+                    self.tensor_metadata["attention_mask"] = combined_mask_cpu
+                    self.metadata["attention_mask_shape"] = tuple(combined_mask_cpu.shape)
+
         except Exception as e:
+            layer_sig = str(self.layer_signature) if self.layer_signature is not None else "unknown"
             raise RuntimeError(
-                f"Error extracting inputs in ModelInputDetector {self.id}: {e}"
+                f"Error extracting inputs in ModelInputDetector {self.id} (layer={layer_sig}): {e}"
             ) from e
 
     def get_captured_input_ids(self) -> torch.Tensor | None:
         """Get the captured input_ids from the current batch."""
-        return self.tensor_metadata.get(
+        return self.tensor_metadata.get("input_ids")
 
     def get_captured_attention_mask(self) -> torch.Tensor | None:
         """Get the captured attention_mask from the current batch (excludes padding and special tokens)."""
-        return self.tensor_metadata.get(
+        return self.tensor_metadata.get("attention_mask")
 
     def clear_captured(self) -> None:
         """Clear all captured inputs for current batch."""
-        keys_to_remove = [
+        keys_to_remove = ["input_ids", "attention_mask"]
         for key in keys_to_remove:
             self.tensor_metadata.pop(key, None)
-            self.metadata.pop(f
-
+            self.metadata.pop(f"{key}_shape", None)
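The special-token masking in `_create_combined_attention_mask` is the least obvious logic here; in isolation it is a broadcast compare over a trailing dimension. A minimal sketch (token IDs invented for illustration: 0 = pad, 1 and 2 = special):

```python
import torch

input_ids = torch.tensor([[1, 17, 23, 2, 0, 0]])            # (batch=1, seq=6)
attention_mask = torch.tensor([[1, 1, 1, 1, 0, 0]]).bool()  # padding already masked
special_token_ids = {1, 2}

special_ids_tensor = torch.tensor(list(special_token_ids), dtype=input_ids.dtype)
expanded_input = input_ids.unsqueeze(-1)                         # (1, 6, 1)
expanded_special = special_ids_tensor.unsqueeze(0).unsqueeze(0)  # (1, 1, 2)
is_special = (expanded_input == expanded_special).any(dim=-1)    # (1, 6)

# AND the inverted "is special" mask into the padding mask.
combined = attention_mask & ~is_special
print(combined.int().tolist())  # [[0, 1, 1, 0, 0, 0]] -- only regular tokens remain
```

The real method additionally pins `special_ids_tensor` to `input_ids.device`, so the comparison never crosses devices.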
mi_crow/hooks/implementations/model_output_detector.py (+43 -42)

@@ -1,132 +1,133 @@
 from __future__ import annotations
 
 from typing import TYPE_CHECKING
+
 import torch
 
 from mi_crow.hooks.detector import Detector
-from mi_crow.hooks.hook import
+from mi_crow.hooks.hook import HOOK_FUNCTION_INPUT, HOOK_FUNCTION_OUTPUT, HookType
 
 if TYPE_CHECKING:
-
+    pass
 
 
 class ModelOutputDetector(Detector):
     """
     Detector hook that captures and saves model outputs.
-
+
     This detector is designed to be attached to the root model module and captures:
     - Model outputs (logits) from the model's forward pass
     - Hidden states (optional) from the model's forward pass
-
+
     Uses FORWARD hook to capture outputs after they are computed.
     Useful for saving model outputs for analysis or training.
     """
 
     def __init__(
-
-
-
-
-
+        self,
+        layer_signature: str | int | None = None,
+        hook_id: str | None = None,
+        save_output_logits: bool = True,
+        save_output_hidden_state: bool = False,
     ):
         """
         Initialize the model output detector.
-
+
         Args:
             layer_signature: Layer to capture from (typically the root model, can be None)
             hook_id: Unique identifier for this hook
             save_output_logits: Whether to save output logits (if available)
             save_output_hidden_state: Whether to save last_hidden_state (if available)
         """
-        super().__init__(
-            hook_type=HookType.FORWARD,
-            hook_id=hook_id,
-            store=None,
-            layer_signature=layer_signature
-        )
+        super().__init__(hook_type=HookType.FORWARD, hook_id=hook_id, store=None, layer_signature=layer_signature)
         self.save_output_logits = save_output_logits
         self.save_output_hidden_state = save_output_hidden_state
 
     def _extract_output_tensor(self, output: HOOK_FUNCTION_OUTPUT) -> tuple[torch.Tensor | None, torch.Tensor | None]:
         """
         Extract logits and last_hidden_state from model output.
-
+
         Args:
             output: Output from the model forward pass
-
+
         Returns:
             Tuple of (logits, last_hidden_state), either can be None
         """
         logits = None
         hidden_state = None
-
+
         if output is None:
             return None, None
-
+
         # Handle HuggingFace output objects
         if hasattr(output, "logits"):
             logits = output.logits
         if hasattr(output, "last_hidden_state"):
             hidden_state = output.last_hidden_state
-
+
         # Handle tuple output (logits might be first element)
         if isinstance(output, (tuple, list)) and len(output) > 0:
             first_item = output[0]
             if isinstance(first_item, torch.Tensor) and logits is None:
                 logits = first_item
-
+
         # Handle direct tensor output
         if isinstance(output, torch.Tensor) and logits is None:
             logits = output
-
+
         return logits, hidden_state
 
     def process_activations(
-
-        module: torch.nn.Module,
-        input: HOOK_FUNCTION_INPUT,
-        output: HOOK_FUNCTION_OUTPUT
+        self, module: torch.nn.Module, input: HOOK_FUNCTION_INPUT, output: HOOK_FUNCTION_OUTPUT
     ) -> None:
         """
         Extract and store model outputs.
-
+
         Args:
             module: The PyTorch module being hooked (typically the root model)
             input: Tuple of input tensors/dicts to the module
             output: Output from the module
-
+
         Raises:
             RuntimeError: If tensor extraction or storage fails
         """
         try:
             # Extract and save outputs
             logits, hidden_state = self._extract_output_tensor(output)
-
+
             if self.save_output_logits and logits is not None:
-
-
-
+                if logits.is_cuda:
+                    logits_cpu = logits.detach().to("cpu", non_blocking=True)
+                else:
+                    logits_cpu = logits.detach()
+                self.tensor_metadata["output_logits"] = logits_cpu
+                self.metadata["output_logits_shape"] = tuple(logits_cpu.shape)
+
             if self.save_output_hidden_state and hidden_state is not None:
-
-
-
+                if hidden_state.is_cuda:
+                    hidden_state_cpu = hidden_state.detach().to("cpu", non_blocking=True)
+                else:
+                    hidden_state_cpu = hidden_state.detach()
+                self.tensor_metadata["output_hidden_state"] = hidden_state_cpu
+                self.metadata["output_hidden_state_shape"] = tuple(hidden_state_cpu.shape)
+
         except Exception as e:
+            layer_sig = str(self.layer_signature) if self.layer_signature is not None else "unknown"
             raise RuntimeError(
-                f"Error extracting outputs in ModelOutputDetector {self.id}: {e}"
+                f"Error extracting outputs in ModelOutputDetector {self.id} (layer={layer_sig}): {e}"
             ) from e
 
     def get_captured_output_logits(self) -> torch.Tensor | None:
         """Get the captured output logits from the current batch."""
-        return self.tensor_metadata.get(
+        return self.tensor_metadata.get("output_logits")
 
     def get_captured_output_hidden_state(self) -> torch.Tensor | None:
         """Get the captured output hidden state from the current batch."""
-        return self.tensor_metadata.get(
+        return self.tensor_metadata.get("output_hidden_state")
 
     def clear_captured(self) -> None:
         """Clear all captured outputs for current batch."""
-        keys_to_remove = [
+        keys_to_remove = ["output_logits", "output_hidden_state"]
         for key in keys_to_remove:
             self.tensor_metadata.pop(key, None)
-            self.metadata.pop(f
-
+            self.metadata.pop(f"{key}_shape", None)