mi-crow 0.1.2__py3-none-any.whl → 1.0.0.post1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
- mi_crow/datasets/base_dataset.py +71 -1
- mi_crow/datasets/classification_dataset.py +136 -30
- mi_crow/datasets/text_dataset.py +165 -24
- mi_crow/hooks/controller.py +12 -7
- mi_crow/hooks/implementations/layer_activation_detector.py +30 -34
- mi_crow/hooks/implementations/model_input_detector.py +87 -87
- mi_crow/hooks/implementations/model_output_detector.py +43 -42
- mi_crow/hooks/utils.py +74 -0
- mi_crow/language_model/activations.py +174 -77
- mi_crow/language_model/device_manager.py +119 -0
- mi_crow/language_model/inference.py +18 -5
- mi_crow/language_model/initialization.py +10 -6
- mi_crow/language_model/language_model.py +67 -97
- mi_crow/language_model/layers.py +16 -13
- mi_crow/language_model/persistence.py +4 -2
- mi_crow/language_model/utils.py +5 -5
- mi_crow/mechanistic/sae/concepts/autoencoder_concepts.py +157 -95
- mi_crow/mechanistic/sae/concepts/concept_dictionary.py +12 -2
- mi_crow/mechanistic/sae/concepts/text_heap.py +161 -0
- mi_crow/mechanistic/sae/modules/topk_sae.py +29 -22
- mi_crow/mechanistic/sae/sae.py +3 -1
- mi_crow/mechanistic/sae/sae_trainer.py +362 -29
- mi_crow/store/local_store.py +11 -5
- mi_crow/store/store.py +34 -1
- {mi_crow-0.1.2.dist-info → mi_crow-1.0.0.post1.dist-info}/METADATA +2 -1
- {mi_crow-0.1.2.dist-info → mi_crow-1.0.0.post1.dist-info}/RECORD +28 -26
- {mi_crow-0.1.2.dist-info → mi_crow-1.0.0.post1.dist-info}/WHEEL +1 -1
- {mi_crow-0.1.2.dist-info → mi_crow-1.0.0.post1.dist-info}/top_level.txt +0 -0
mi_crow/mechanistic/sae/concepts/autoencoder_concepts.py
CHANGED

@@ -10,6 +10,7 @@ import torch
 from torch import nn
 
 from mi_crow.mechanistic.sae.concepts.concept_models import NeuronText
+from mi_crow.mechanistic.sae.concepts.text_heap import TextHeap
 from mi_crow.mechanistic.sae.autoencoder_context import AutoencoderContext
 from mi_crow.utils import get_logger
 
@@ -28,12 +29,11 @@ class AutoencoderConcepts:
         self._n_size = context.n_latents
         self.dictionary: ConceptDictionary | None = None
 
-        # Concept manipulation parameters
         self.multiplication = nn.Parameter(torch.ones(self._n_size))
         self.bias = nn.Parameter(torch.ones(self._n_size))
 
-
-        self.
+        self._text_heaps_positive: list[TextHeap] | None = None
+        self._text_heaps_negative: list[TextHeap] | None = None
         self._text_tracking_k: int = 5
         self._text_tracking_negative: bool = False
 
@@ -81,7 +81,7 @@ class AutoencoderConcepts:
 
     def generate_concepts_with_llm(self, llm_provider: str | None = None):
         """Generate concepts using LLM based on current top texts"""
-        if self.
+        if self._text_heaps_positive is None:
             raise ValueError("No top texts available. Enable text tracking and run inference first.")
 
         from mi_crow.mechanistic.sae.concepts.concept_dictionary import ConceptDictionary
@@ -96,8 +96,10 @@ class AutoencoderConcepts:
 
     def _ensure_heaps(self, n_neurons: int) -> None:
         """Ensure heaps are initialized for the given number of neurons."""
-        if self.
-            self.
+        if self._text_heaps_positive is None:
+            self._text_heaps_positive = [TextHeap(self._text_tracking_k) for _ in range(n_neurons)]
+        if self._text_tracking_negative and self._text_heaps_negative is None:
+            self._text_heaps_negative = [TextHeap(self._text_tracking_k) for _ in range(n_neurons)]
 
     def _decode_token(self, text: str, token_idx: int) -> str:
         """
@@ -148,6 +150,11 @@ class AutoencoderConcepts:
         """
         Update top texts heaps from latents and texts.
 
+        Optimized version that:
+        - Only processes active neurons (non-zero activations)
+        - Vectorizes argmax/argmin operations
+        - Eliminates per-neuron tensor slicing
+
         Args:
             latents: Latent activations tensor, shape [B*T, n_latents] or [B, n_latents] (already flattened)
             texts: List of texts corresponding to the batch
@@ -173,100 +180,120 @@ class AutoencoderConcepts:
             # Use the actual number of texts as batch size
             B = original_B
             T = BT // B if B > 0 else 1
-            # Create token indices: [0, 1, 2, ..., T-1, 0, 1, 2, ..., T-1, ...]
-            token_indices = torch.arange(T, device='cpu').unsqueeze(0).expand(B, T).contiguous().view(B * T)
         else:
             # Original was [B, D], latents are [B, n_latents]
-            …
+            B = original_B
             T = 1
-            …
+
+        # OPTIMIZATION 1: Find active neurons (have any non-zero activation across batch)
+        # Shape: [n_neurons] - boolean mask
+        active_neurons_mask = (latents.abs().sum(dim=0) > 0)
+        active_neuron_indices = torch.nonzero(active_neurons_mask, as_tuple=False).flatten().tolist()
+
+        if not active_neuron_indices:
+            return  # No active neurons, skip
+
+        # OPTIMIZATION 2: Vectorize argmax/argmin for all neurons at once
+        if original_shape is not None and len(original_shape) == 3:
+            # Reshape to [B, T, n_neurons]
+            latents_3d = latents.view(B, T, n_neurons)
+            # For each text, find max/min across tokens for each neuron
+            # Shape: [B, n_neurons] - max activation per text per neuron
+            max_activations, max_token_indices_3d = latents_3d.max(dim=1)  # [B, n_neurons]
+            min_activations, min_token_indices_3d = latents_3d.min(dim=1)  # [B, n_neurons]
+            # max_token_indices_3d is already the token index (0 to T-1)
+            max_token_indices = max_token_indices_3d
+            min_token_indices = min_token_indices_3d
+        else:
+            # Shape: [B, n_neurons]
+            latents_2d = latents.view(B, n_neurons)
+            max_activations = latents_2d  # [B, n_neurons]
+            max_token_indices = torch.zeros(B, n_neurons, dtype=torch.long, device=latents.device)
+            min_activations = latents_2d
+            min_token_indices = torch.zeros(B, n_neurons, dtype=torch.long, device=latents.device)
+
+        # Convert to numpy for faster CPU access (already on CPU from l1_sae.py)
+        max_activations_np = max_activations.cpu().numpy()
+        min_activations_np = min_activations.cpu().numpy()
+        max_token_indices_np = max_token_indices.cpu().numpy()
+        min_token_indices_np = min_token_indices.cpu().numpy()
+
+        # OPTIMIZATION 3: Only process active neurons
+        for j in active_neuron_indices:
+            heap_positive = self._text_heaps_positive[j]
+            heap_negative = self._text_heaps_negative[j] if self._text_tracking_negative else None
+
+            # OPTIMIZATION 4: Batch process all texts for this neuron
             for batch_idx in range(original_B):
                 if batch_idx >= len(texts):
                     continue
 
                 text = texts[batch_idx]
-                …
-                        existing_entry = (heap_idx, heap_adj, heap_score, heap_token_idx)
-                        break
-
-                if existing_entry is not None:
-                    # Update existing entry if this activation is better
-                    heap_idx, heap_adj, heap_score, heap_token_idx = existing_entry
-                    if adj > heap_adj:
-                        # Replace with better activation
-                        heap[heap_idx] = (adj, (max_score, text, token_idx))
-                        heapq.heapify(heap)  # Re-heapify after modification
-                        texts_updated += 1
-                    else:
-                        texts_skipped_duplicate += 1
-                else:
-                    # New text, add to heap
-                    if len(heap) < self._text_tracking_k:
-                        heapq.heappush(heap, (adj, (max_score, text, token_idx)))
-                        texts_added += 1
-                    else:
-                        # Compare with smallest adjusted score; replace if better
-                        if adj > heap[0][0]:
-                            heapq.heapreplace(heap, (adj, (max_score, text, token_idx)))
-                            texts_added += 1
+
+                # Use pre-computed max/min (no tensor slicing needed!)
+                max_score_positive = float(max_activations_np[batch_idx, j])
+                token_idx_positive = int(max_token_indices_np[batch_idx, j])
+
+                if max_score_positive > 0.0:
+                    heap_positive.update(text, max_score_positive, token_idx_positive)
+
+                if self._text_tracking_negative and heap_negative is not None:
+                    min_score_negative = float(min_activations_np[batch_idx, j])
+                    if min_score_negative != 0.0:
+                        token_idx_negative = int(min_token_indices_np[batch_idx, j])
+                        heap_negative.update(text, min_score_negative, token_idx_negative, adjusted_score=-min_score_negative)
+
+    def _extract_activations(
+        self,
+        latents: torch.Tensor,
+        token_indices: torch.Tensor,
+        batch_idx: int,
+        neuron_idx: int,
+        original_shape: tuple[int, ...] | None,
+        T: int
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Extract activations for a specific batch item and neuron.
+
+        Returns:
+            Tuple of (text_activations, text_token_indices)
+        """
+        if original_shape is not None and len(original_shape) == 3:
+            start_idx = batch_idx * T
+            end_idx = start_idx + T
+            text_activations = latents[start_idx:end_idx, neuron_idx]
+            text_token_indices = token_indices[start_idx:end_idx]
+        else:
+            text_activations = latents[batch_idx:batch_idx + 1, neuron_idx]
+            text_token_indices = token_indices[batch_idx:batch_idx + 1]
+
+        return text_activations, text_token_indices
 
     def get_top_texts_for_neuron(self, neuron_idx: int, top_m: int | None = None) -> list[NeuronText]:
-        """Get top texts for a specific neuron."""
-        if self.
+        """Get top texts for a specific neuron (positive activations)."""
+        if self._text_heaps_positive is None or neuron_idx < 0 or neuron_idx >= len(self._text_heaps_positive):
+            return []
+        heap = self._text_heaps_positive[neuron_idx]
+        items = heap.get_items()
+        items_sorted = sorted(items, key=lambda s_t: s_t[0], reverse=True)
+        if top_m is not None:
+            items_sorted = items_sorted[: top_m]
+
+        neuron_texts = []
+        for score, text, token_idx in items_sorted:
+            token_str = self._decode_token(text, token_idx)
+            neuron_texts.append(NeuronText(score=score, text=text, token_idx=token_idx, token_str=token_str))
+        return neuron_texts
+
+    def get_bottom_texts_for_neuron(self, neuron_idx: int, top_m: int | None = None) -> list[NeuronText]:
+        """Get bottom texts for a specific neuron (negative activations)."""
+        if not self._text_tracking_negative:
+            return []
+        if self._text_heaps_negative is None or neuron_idx < 0 or neuron_idx >= len(self._text_heaps_negative):
             return []
-        heap = self.
-        items =
-        …
-        items_sorted = sorted(items, key=lambda s_t: s_t[0], reverse=reverse)
+        heap = self._text_heaps_negative[neuron_idx]
+        items = heap.get_items()
+        items_sorted = sorted(items, key=lambda s_t: s_t[0], reverse=False)
         if top_m is not None:
             items_sorted = items_sorted[: top_m]
 
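The heart of the rewritten `_update_top_texts` is that one batched `max`/`min` over the token dimension replaces per-neuron slicing: `Tensor.max(dim=1)` returns both values and argmax positions, which directly supply each text's peak activation and the token where it occurred. A minimal standalone sketch of that step, with hypothetical sizes (2 texts, 4 tokens, 3 latents):

    import torch

    B, T, N = 2, 4, 3                    # hypothetical: texts, tokens, latents
    latents = torch.randn(B * T, N)      # flattened [B*T, N], as in the diff

    latents_3d = latents.view(B, T, N)
    # One call per direction yields, for every (text, neuron) pair, the best
    # activation and the token position where it occurred.
    max_acts, max_tok = latents_3d.max(dim=1)   # both [B, N]
    min_acts, min_tok = latents_3d.min(dim=1)   # both [B, N]

    # Neurons with no nonzero activation anywhere in the batch are skipped.
    active = torch.nonzero(latents.abs().sum(dim=0) > 0).flatten().tolist()
    print(active, max_acts.shape, int(max_tok[0, 0]))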
@@ -277,17 +304,25 @@ class AutoencoderConcepts:
         return neuron_texts
 
     def get_all_top_texts(self) -> list[list[NeuronText]]:
-        """Get top texts for all neurons."""
-        if self.
+        """Get top texts for all neurons (positive activations)."""
+        if self._text_heaps_positive is None:
             return []
-        return [self.get_top_texts_for_neuron(i) for i in range(len(self.
+        return [self.get_top_texts_for_neuron(i) for i in range(len(self._text_heaps_positive))]
+
+    def get_all_bottom_texts(self) -> list[list[NeuronText]]:
+        """Get bottom texts for all neurons (negative activations)."""
+        if not self._text_tracking_negative or self._text_heaps_negative is None:
+            return []
+        return [self.get_bottom_texts_for_neuron(i) for i in range(len(self._text_heaps_negative))]
 
     def reset_top_texts(self) -> None:
         """Reset all tracked top texts."""
-        self.
+        self._text_heaps_positive = None
+        self._text_heaps_negative = None
 
     def export_top_texts_to_json(self, filepath: Path | str) -> Path:
-        …
+        """Export top texts (positive activations) to JSON file."""
+        if self._text_heaps_positive is None:
             raise ValueError("No top texts available. Enable text tracking and run inference first.")
 
         filepath = Path(filepath)
@@ -312,8 +347,35 @@ class AutoencoderConcepts:
 
         return filepath
 
+    def export_bottom_texts_to_json(self, filepath: Path | str) -> Path:
+        """Export bottom texts (negative activations) to JSON file."""
+        if not self._text_tracking_negative or self._text_heaps_negative is None:
+            raise ValueError("No bottom texts available. Enable negative text tracking and run inference first.")
+
+        filepath = Path(filepath)
+        filepath.parent.mkdir(parents=True, exist_ok=True)
+
+        all_texts = self.get_all_bottom_texts()
+        export_data = {}
+
+        for neuron_idx, neuron_texts in enumerate(all_texts):
+            export_data[neuron_idx] = [
+                {
+                    "text": nt.text,
+                    "score": nt.score,
+                    "token_str": nt.token_str,
+                    "token_idx": nt.token_idx
+                }
+                for nt in neuron_texts
+            ]
+
+        with filepath.open("w", encoding="utf-8") as f:
+            json.dump(export_data, f, ensure_ascii=False, indent=2)
+
+        return filepath
+
     def export_top_texts_to_csv(self, filepath: Path | str) -> Path:
-        if self.
+        if self._text_heaps_positive is None:
             raise ValueError("No top texts available. Enable text tracking and run inference first.")
 
         filepath = Path(filepath)
mi_crow/mechanistic/sae/concepts/concept_dictionary.py
CHANGED

@@ -135,8 +135,18 @@ class ConceptDictionary:
         with json_path.open("r", encoding="utf-8") as f:
             data = json.load(f)
 
-        …
+        if isinstance(data, dict) and "concepts" in data:
+            concepts_data = data["concepts"]
+            if "n_size" in data:
+                concept_dict.n_size = int(data["n_size"])
+        else:
+            concepts_data = data
+
+        for neuron_idx_str, concepts in concepts_data.items():
+            try:
+                neuron_idx = int(neuron_idx_str)
+            except ValueError:
+                continue
 
             # Handle both old format (list) and new format (single dict)
             if isinstance(concepts, list):
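For reference, the two payload shapes the updated loader now accepts. The concept entries below are purely illustrative (the real schema lives in ConceptDictionary; per the comment above, values may be the old list form or the new single-dict form):

    # New wrapped format: the loader reads data["concepts"] and, when present,
    # restores n_size from the top-level metadata.
    wrapped = {
        "n_size": 1024,
        "concepts": {
            "0": [{"concept": "example"}],   # illustrative entry (old list form)
            "17": {"concept": "another"},    # illustrative entry (new dict form)
        },
    }

    # Old bare format: the top-level object is already the
    # neuron-index -> concepts mapping.
    bare = {"0": [{"concept": "example"}]}

    # Keys that fail int(neuron_idx_str) are silently skipped by the loader.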
mi_crow/mechanistic/sae/concepts/text_heap.py
ADDED

@@ -0,0 +1,161 @@
+from __future__ import annotations
+
+import heapq
+
+
+class TextHeap:
+    """
+    Efficient heap for tracking top texts with O(1) duplicate lookup.
+
+    Optimized with incremental index updates and correct heap operations.
+    Maintains a min-heap of size k and a dictionary for fast text lookup.
+    """
+
+    def __init__(self, max_size: int):
+        """
+        Initialize TextHeap.
+
+        Args:
+            max_size: Maximum number of items to keep in the heap
+        """
+        self._max_size = max_size
+        self._heap: list[tuple[float, tuple[float, str, int]]] = []
+        self._text_to_index: dict[str, int] = {}
+
+    def update(self, text: str, score: float, token_idx: int, adjusted_score: float | None = None) -> None:
+        """
+        Update heap with a new text entry.
+
+        Args:
+            text: Text string
+            score: Activation score (actual value to store)
+            token_idx: Token index within the text
+            adjusted_score: Optional adjusted score for heap ordering (defaults to score)
+        """
+        if adjusted_score is None:
+            adjusted_score = score
+        heap_idx = self._text_to_index.get(text)
+
+        if heap_idx is not None:
+            self._update_existing(heap_idx, text, adjusted_score, score, token_idx)
+        else:
+            self._add_new(text, adjusted_score, score, token_idx)
+
+    def _update_existing(
+        self,
+        heap_idx: int,
+        text: str,
+        adjusted_score: float,
+        score: float,
+        token_idx: int
+    ) -> None:
+        """Update an existing entry in the heap."""
+        current_adj = self._heap[heap_idx][0]
+        if adjusted_score > current_adj:
+            self._heap[heap_idx] = (adjusted_score, (score, text, token_idx))
+            self._text_to_index[text] = heap_idx
+            self._siftdown_with_tracking(heap_idx)
+
+    def _add_new(
+        self,
+        text: str,
+        adjusted_score: float,
+        score: float,
+        token_idx: int
+    ) -> None:
+        """Add a new entry to the heap."""
+        if len(self._heap) < self._max_size:
+            self._heap.append((adjusted_score, (score, text, token_idx)))
+            new_idx = len(self._heap) - 1
+            self._text_to_index[text] = new_idx
+            self._siftup_with_tracking(new_idx)
+        else:
+            if adjusted_score > self._heap[0][0]:
+                self._replace_minimum(text, adjusted_score, score, token_idx)
+
+    def _replace_minimum(
+        self,
+        text: str,
+        adjusted_score: float,
+        score: float,
+        token_idx: int
+    ) -> None:
+        """Replace the minimum element in the heap."""
+        old_text = self._heap[0][1][1]
+        if old_text in self._text_to_index:
+            del self._text_to_index[old_text]
+
+        self._heap[0] = (adjusted_score, (score, text, token_idx))
+        self._text_to_index[text] = 0
+        self._siftdown_with_tracking(0)
+
+    def _siftup_with_tracking(self, pos: int) -> None:
+        """
+        Sift element up in heap (toward root) and update text-to-index map incrementally.
+
+        Used when value decreases - compares with parent and moves up.
+        Only updates indices that actually change during the sift operation.
+        """
+        startpos = pos
+        newitem = self._heap[pos]
+        newitem_text = newitem[1][1]
+
+        while pos > 0:
+            parentpos = (pos - 1) >> 1
+            parent = self._heap[parentpos]
+            if newitem[0] >= parent[0]:
+                break
+            parent_text = parent[1][1]
+            self._heap[pos] = parent
+            self._text_to_index[parent_text] = pos
+            pos = parentpos
+
+        self._heap[pos] = newitem
+        if pos != startpos:
+            self._text_to_index[newitem_text] = pos
+
+    def _siftdown_with_tracking(self, pos: int) -> None:
+        """
+        Sift element down in heap and update text-to-index map incrementally.
+
+        Only updates indices that actually change during the sift operation.
+        """
+        endpos = len(self._heap)
+        startpos = pos
+        newitem = self._heap[pos]
+        newitem_text = newitem[1][1]
+
+        childpos = 2 * pos + 1
+        while childpos < endpos:
+            rightpos = childpos + 1
+            if rightpos < endpos and self._heap[rightpos][0] < self._heap[childpos][0]:
+                childpos = rightpos
+            if newitem[0] < self._heap[childpos][0]:
+                break
+            child_text = self._heap[childpos][1][1]
+            self._heap[pos] = self._heap[childpos]
+            self._text_to_index[child_text] = pos
+            pos = childpos
+            childpos = 2 * pos + 1
+
+        self._heap[pos] = newitem
+        if pos != startpos:
+            self._text_to_index[newitem_text] = pos
+
+    def get_items(self) -> list[tuple[float, str, int]]:
+        """
+        Get all items from the heap, sorted by score (descending).
+
+        Returns:
+            List of (score, text, token_idx) tuples
+        """
+        return [val for (_, val) in self._heap]
+
+    def clear(self) -> None:
+        """Clear the heap and text mapping."""
+        self._heap.clear()
+        self._text_to_index.clear()
+
+    def __len__(self) -> int:
+        """Return the number of items in the heap."""
+        return len(self._heap)
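A short usage sketch of `TextHeap`, using only the methods defined above. Entries are kept in a min-heap keyed on the adjusted score, duplicates are updated in place via the text-to-index map, and `get_items()` returns `(score, text, token_idx)` tuples in internal heap order (despite its docstring), so callers such as `get_top_texts_for_neuron` sort the result themselves:

    from mi_crow.mechanistic.sae.concepts.text_heap import TextHeap

    heap = TextHeap(max_size=2)
    heap.update("the cat sat", score=0.9, token_idx=1)
    heap.update("dogs bark", score=0.4, token_idx=0)
    heap.update("the cat sat", score=1.2, token_idx=2)  # duplicate: updated in place
    heap.update("fish swim", score=0.7, token_idx=0)    # evicts the 0.4 minimum

    items = sorted(heap.get_items(), key=lambda s: s[0], reverse=True)
    print(len(heap), items)
    # 2 [(1.2, 'the cat sat', 2), (0.7, 'fish swim', 0)]

For negative tracking, `_update_top_texts` passes `adjusted_score=-min_score`, so the same min-heap discipline keeps the most negative activations.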
mi_crow/mechanistic/sae/modules/topk_sae.py
CHANGED

@@ -72,13 +72,14 @@ class TopKSae(Sae):
         A temporary default k=1 is used for engine initialization and will be
         overridden with the actual k value from config during training.
         """
-        # Set temporary default k for engine initialization (base class calls _initialize_sae_engine)
-        # This will be overridden with the actual k from config during training
-        self.k: int = 1
         super().__init__(n_latents, n_inputs, hook_id, device, store, *args, **kwargs)
 
-    def _initialize_sae_engine(self) -> OvercompleteSAE:
-        """
+    def _initialize_sae_engine(self, k: int = 1) -> OvercompleteSAE:
+        """
+        Initialize the SAE engine with the specified k value.
+
+        Args:
+            k: Number of top activations to keep (default: 1 for initialization)
 
         Note:
             k should be set from TopKSaeTrainingConfig during training.
@@ -87,7 +88,7 @@ class TopKSae(Sae):
         return OvercompleteTopkSAE(
             input_shape=self.context.n_inputs,
             nb_concepts=self.context.n_latents,
-            top_k=
+            top_k=k,
             device=self.context.device
         )
 
@@ -143,8 +144,7 @@ class TopKSae(Sae):
         Train TopKSAE using activations from a Store.
 
         This method delegates to the SaeTrainer composite class.
-        …
-        will be reinitialized with the config's k value.
+        The SAE engine will be reinitialized with the k value from config.
 
         Args:
             store: Store instance containing activations
@@ -170,15 +170,13 @@ class TopKSae(Sae):
                 "Example: TopKSaeTrainingConfig(k=10, epochs=100, ...)"
             )
 
-        # …
-        …
-        self.
-        # Initialize or reinitialize the SAE engine with k from config
-        self.sae_engine = self._initialize_sae_engine()
+        # Reinitialize engine with k from config
+        logger.info(f"Initializing SAE engine with k={config.k}")
+        self.sae_engine = self._initialize_sae_engine(k=config.k)
+        if hasattr(config, 'device') and config.device:
+            device = torch.device(config.device)
+            self.sae_engine.to(device)
+            self.context.device = str(device)
 
         return self.trainer.train(store, run_id, layer_signature, config, training_run_id)
 
@@ -323,13 +321,14 @@ class TopKSae(Sae):
             result[0] = reconstructed
         return tuple(result)
 
-    def save(self, name: str, path: str | Path | None = None) -> None:
+    def save(self, name: str, path: str | Path | None = None, k: int | None = None) -> None:
         """
         Save model using overcomplete's state dict + our metadata.
 
         Args:
             name: Model name
             path: Directory path to save to (defaults to current directory)
+            k: Top-K value to save (if None, attempts to get from engine or raises error)
         """
         if path is None:
             path = Path.cwd()
@@ -340,6 +339,16 @@ class TopKSae(Sae):
         # Save overcomplete model state dict
         sae_state_dict = self.sae_engine.state_dict()
 
+        # Get k value - prefer parameter, then try to get from engine
+        if k is None:
+            if hasattr(self.sae_engine, 'top_k'):
+                k = self.sae_engine.top_k
+            else:
+                raise ValueError(
+                    "k parameter must be provided to save() method. "
+                    "The engine does not expose top_k attribute."
+                )
+
         mi_crow_metadata = {
             "concepts_state": {
                 'multiplication': self.concepts.multiplication.data,
@@ -347,7 +356,7 @@ class TopKSae(Sae):
             },
             "n_latents": self.context.n_latents,
             "n_inputs": self.context.n_inputs,
-            "k":
+            "k": k,
             "device": self.context.device,
             "layer_signature": self.context.lm_layer_signature,
             "model_id": self.context.model_id,
@@ -403,9 +412,7 @@ class TopKSae(Sae):
             device=device
         )
 
-        …
-        topk_sae.k = k
-        topk_sae.sae_engine = topk_sae._initialize_sae_engine()
+        topk_sae.sae_engine = topk_sae._initialize_sae_engine(k=k)
 
         # Load overcomplete model state dict
         if "sae_state_dict" in payload:
mi_crow/mechanistic/sae/sae.py
CHANGED

@@ -69,7 +69,7 @@ class Sae(Controller, Detector, abc.ABC):
         """Set the LanguageModelContext for this hook and sync to AutoencoderContext.
 
         When the hook is registered, this method is called with the LanguageModelContext.
-        It automatically syncs relevant values to the AutoencoderContext.
+        It automatically syncs relevant values to the AutoencoderContext, including device.
 
         Args:
             context: The LanguageModelContext instance from the LanguageModel
@@ -84,6 +84,8 @@ class Sae(Controller, Detector, abc.ABC):
         self._autoencoder_context.store = context.store
         if self.layer_signature is not None:
             self._autoencoder_context.lm_layer_signature = self.layer_signature
+        if context.device is not None:
+            self._autoencoder_context.device = context.device
 
     @abc.abstractmethod
     def _initialize_sae_engine(self) -> OvercompleteSAE: