rl-interrogate 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,25 @@
1
+ """rl_interrogate — Mechanistic interpretability toolkit for RL policies."""
2
+
3
+ from rl_interrogate.probing import LinearProbe, MLPProbe, LassoProbe, UntrainedBaseline
4
+ from rl_interrogate.ablation import AblationHook
5
+ from rl_interrogate.patching import ActivationPatcher
6
+ from rl_interrogate.polarity import PolarityTest
7
+ from rl_interrogate.pca_utils import fit_pca, project_subspace
8
+ from rl_interrogate.visualization import plot_probe_heatmap, plot_ablation_curve, plot_pca_scatter
9
+ from rl_interrogate.env_wrappers import NoisyObservationWrapper
10
+
11
# Public API of the package, grouped by the module each name is imported from.
__all__ = [
    # probing
    "LinearProbe",
    "MLPProbe",
    "LassoProbe",
    "UntrainedBaseline",
    # ablation / patching / polarity
    "AblationHook",
    "ActivationPatcher",
    "PolarityTest",
    # pca_utils
    "fit_pca",
    "project_subspace",
    # visualization
    "plot_probe_heatmap",
    "plot_ablation_curve",
    "plot_pca_scatter",
    # env_wrappers
    "NoisyObservationWrapper",
]
@@ -0,0 +1,106 @@
1
+ """AblationHook: scales projection of activations onto a subspace by alpha."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from contextlib import contextmanager
6
+ from typing import Generator
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn as nn
11
+
12
+
13
class AblationHook:
    """Scale the projection of a layer's activations onto a subspace by alpha.

    A forward hook is registered on ``model[layer_idx]`` that replaces the
    layer output ``h`` with::

        proj  = h @ W @ W.T
        h_new = h - proj + alpha * proj

    ``alpha=0.0`` removes the subspace component entirely; ``alpha=1.0``
    leaves the activations unchanged.

    The hook is registered by the constructor, so ablation with the
    constructor's ``alpha`` is active as soon as the object exists. Call
    :meth:`remove` to deregister it, or use :meth:`apply` to run with a
    different ``alpha`` for the duration of a ``with`` block.

    Args:
        model: Module whose layers are accessible by integer index
            (e.g. a ``torch.nn.Sequential``).
        layer_idx: Index of the layer on which to register the hook.
        W: Subspace basis of shape (d_hidden, k), or a 1-D direction of shape
            (d_hidden,). Columns of a 2-D basis should be orthonormal; a 1-D
            direction is normalised automatically.
        alpha: Scaling factor for the projected component. 0.0 = full
            ablation, 1.0 = no-op. Can be overridden via :meth:`apply`.
        direction: Keyword-only alias for ``W``; used when ``W`` is None.

    Raises:
        ValueError: If neither ``W`` nor ``direction`` is given, or if the
            basis is not 1-D or 2-D.

    Example::

        hook = AblationHook(policy_net, layer_idx=5, direction=probe.coef_)
        with hook.apply(alpha=0.0):
            rewards = run_episodes(model, env, n=100)
    """

    def __init__(
        self,
        model: nn.Module,
        layer_idx: int,
        W: np.ndarray | None = None,
        alpha: float = 0.0,
        *,
        direction: np.ndarray | None = None,
    ) -> None:
        # Accept either the positional basis or the ``direction`` keyword.
        basis = W if W is not None else direction
        if basis is None:
            raise ValueError("Provide either W (subspace basis) or direction (1-D vector).")

        # Match the model's parameters; parameter-free models (e.g. a bare
        # activation layer) fall back to CPU / float32 instead of raising
        # StopIteration.
        first_param = next(model.parameters(), None)
        if first_param is None:
            device, dtype = torch.device("cpu"), torch.float32
        else:
            device, dtype = first_param.device, first_param.dtype

        basis_arr = np.asarray(basis, dtype=np.float32)
        if basis_arr.ndim == 1:
            # Single direction: normalise (unless numerically zero) and
            # reshape to a (d, 1) column.
            norm = float(np.linalg.norm(basis_arr))
            if norm > 1e-9:
                basis_arr = basis_arr / norm
            basis_arr = basis_arr[:, None]
        elif basis_arr.ndim != 2:
            raise ValueError(
                f"W must be 1-D or 2-D, got an array with {basis_arr.ndim} dimensions."
            )

        self._W = torch.tensor(basis_arr, dtype=dtype, device=device)  # (d_hidden, k)
        self._alpha = alpha
        self._model = model
        self._layer_idx = layer_idx
        self._hook = model[layer_idx].register_forward_hook(self._ablate)
        # Tracks whether the hook is meant to be registered outside apply(),
        # so that apply() does not resurrect a hook the caller has removed.
        self._active = True

    def _ablate(
        self,
        module: nn.Module,
        input: tuple,
        output: torch.Tensor,
    ) -> torch.Tensor:
        """Forward hook: scale the subspace projection of ``output`` by alpha."""
        proj = output @ self._W @ self._W.T
        return output - proj + self._alpha * proj

    def remove(self) -> None:
        """Deregister the forward hook, restoring normal forward-pass behaviour."""
        self._hook.remove()
        self._active = False

    @contextmanager
    def apply(self, alpha: float) -> Generator[None, None, None]:
        """Context manager that temporarily applies ablation at the given alpha.

        On entry any existing hook is removed and a new one is registered
        with *alpha*. On exit the previous state is restored: the previous
        alpha is reinstated, and the hook is re-registered only if it was
        registered before entry (i.e. :meth:`remove` had not been called).

        Args:
            alpha: Ablation strength. 0.0 = full ablation, 1.0 = no-op.

        Yields:
            None
        """
        was_active = self._active
        old_alpha = self._alpha
        layer = self._model[self._layer_idx]

        self._hook.remove()
        self._alpha = alpha
        self._hook = layer.register_forward_hook(self._ablate)
        self._active = True
        try:
            yield
        finally:
            self._hook.remove()
            self._alpha = old_alpha
            self._active = was_active
            if was_active:
                self._hook = layer.register_forward_hook(self._ablate)
@@ -0,0 +1,147 @@
1
+ """
2
+ rl_interrogate/env_wrappers.py
3
+ -------------------------------
4
+ Gymnasium environment wrappers for formation-flight policy evaluation.
5
+
6
+ No Isaac Lab dependency — only imports gymnasium, numpy, and standard library.
7
+ Importable from rl_interrogate without activating the Isaac Lab environment.
8
+
9
+ Usage:
10
+ from rl_interrogate.env_wrappers import NoisyObservationWrapper
11
+
12
+ env = NoisyObservationWrapper(
13
+ base_env,
14
+ lateral_noise_std=0.05,
15
+ sensor_zero_prob=0.10,
16
+ rel_lead_delay=2,
17
+ seed=0,
18
+ )
19
+ obs, info = env.reset()
20
+ obs, reward, terminated, truncated, info = env.step(action)
21
+ """
22
+
23
+ import numpy as np
24
+ import gymnasium
25
+
26
+
27
class NoisyObservationWrapper(gymnasium.Wrapper):
    """
    Adds sensor noise and delays to a formation-flight gymnasium environment.
    NO Isaac Lab dependency — only imports gymnasium, numpy.

    Three perturbations are applied to each observation:
      1. Gaussian noise on the lateral position (index 15 per follower).
      2. Random zeroing of the upwash sensor (index 12) with probability
         ``sensor_zero_prob`` per step.
      3. Temporal delay of rel_to_lead observations (indices 9–11) by
         ``rel_lead_delay`` timesteps, using a circular buffer initialised to
         zeros at reset. ``rel_lead_delay=0`` disables the delay.

    All indices address the last axis of the observation, so both
    ``(obs_dim,)`` and ``(batch, obs_dim)`` observations are supported. One
    noise draw and one zeroing decision are made per call and shared across
    any leading batch dimensions.

    Args:
        env: The wrapped gymnasium environment.
        lateral_noise_std: Std of Gaussian noise on lateral position obs (default 0.05).
        sensor_zero_prob: Probability per step of zeroing upwash sensor index 12 (default 0.10).
        rel_lead_delay: Number of timesteps to delay rel_to_lead obs indices 9-11 (default 2).
        seed: RNG seed for reproducibility (default 0).

    Raises:
        ValueError: If ``rel_lead_delay`` is negative.
    """

    UPWASH_IDX: int = 12
    REL_LEAD_SLICE: slice = slice(9, 12)
    LATERAL_IDX: int = 15  # lateral y position index in per-follower obs

    def __init__(
        self,
        env: gymnasium.Env,
        lateral_noise_std: float = 0.05,
        sensor_zero_prob: float = 0.10,
        rel_lead_delay: int = 2,
        seed: int = 0,
    ) -> None:
        super().__init__(env)
        if rel_lead_delay < 0:
            raise ValueError("rel_lead_delay must be >= 0")
        self.lateral_noise_std = lateral_noise_std
        self.sensor_zero_prob = sensor_zero_prob
        self.rel_lead_delay = rel_lead_delay
        self._rng = np.random.default_rng(seed)

        # Circular buffer; zeroed again in reset() and re-shaped on first use
        # if observations carry leading batch dimensions.
        self._delay_buffer = np.zeros((self.rel_lead_delay, 3), dtype=np.float32)
        self._buf_ptr: int = 0

    # ------------------------------------------------------------------
    # Public gymnasium.Wrapper interface
    # ------------------------------------------------------------------

    def reset(self, **kwargs):
        """Reset env, zero delay buffer, apply noise to initial obs.

        Returns whatever the wrapped env's ``reset`` returns, with the
        observation (the first tuple element, or the bare return value)
        replaced by its perturbed float32 numpy copy.
        """
        # Zero the delay buffer and reset pointer
        self._delay_buffer = np.zeros((self.rel_lead_delay, 3), dtype=np.float32)
        self._buf_ptr = 0

        result = self.env.reset(**kwargs)

        # reset() may return (obs, info) or just obs depending on the env
        if isinstance(result, tuple):
            obs, *rest = result
            return (self._apply_noise(self._to_numpy(obs)), *rest)
        return self._apply_noise(self._to_numpy(result))

    def step(self, action):
        """Step env, apply noise/delay to obs, return modified obs.

        The remaining elements of the wrapped env's step tuple are passed
        through untouched.
        """
        obs, *rest = self.env.step(action)
        return (self._apply_noise(self._to_numpy(obs)), *rest)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _to_numpy(self, obs) -> np.ndarray:
        """Convert obs to a writable numpy float32 array if needed."""
        try:
            import torch
            if isinstance(obs, torch.Tensor):
                return obs.detach().cpu().numpy().astype(np.float32)
        except ImportError:
            # torch is optional; fall through to the generic conversion.
            pass
        arr = np.asarray(obs, dtype=np.float32)
        return arr.copy()  # ensure writeable

    def _apply_noise(self, obs: np.ndarray) -> np.ndarray:
        """
        Apply all three perturbations to a single observation array, in place.

        Modifies ONLY indices 9–12 and index 15 (per follower).
        All other indices are left unchanged.

        Args:
            obs: writable numpy float32 array of shape (obs_dim,) or
                (batch, obs_dim).

        Returns:
            The same array object, modified.
        """
        if self.rel_lead_delay > 0:
            current = obs[..., self.REL_LEAD_SLICE]

            # The buffer is created with (3,) slots. If observations carry
            # leading batch dimensions, re-allocate zeroed slots of the
            # matching shape so each batch entry gets its own delay line.
            if self._delay_buffer.shape[1:] != current.shape:
                self._delay_buffer = np.zeros(
                    (self.rel_lead_delay, *current.shape), dtype=np.float32
                )
                self._buf_ptr = 0

            # --- 1. Delay: read oldest entry from circular buffer ---
            delayed = self._delay_buffer[self._buf_ptr].copy()

            # --- 2. Write current rel_to_lead into the buffer slot ---
            self._delay_buffer[self._buf_ptr] = current

            # --- 3. Advance pointer ---
            self._buf_ptr = (self._buf_ptr + 1) % self.rel_lead_delay

            # --- 4. Replace obs[9:12] with the delayed value ---
            obs[..., self.REL_LEAD_SLICE] = delayed

        # --- 5. Add Gaussian noise to lateral y (index 15) ---
        obs[..., self.LATERAL_IDX] += self._rng.normal(0, self.lateral_noise_std)

        # --- 6. Zero upwash sensor (index 12) with prob sensor_zero_prob ---
        if self._rng.random() < self.sensor_zero_prob:
            obs[..., self.UPWASH_IDX] = 0.0

        return obs
@@ -0,0 +1,35 @@
1
+ """LLM Polarity Inversion Validation Module
2
+
3
+ This module implements cross-domain validation of the Polarity Inversion Discriminant (PID)
4
+ applied to Large Language Models. The PID tests whether a feature is causally wired into
5
+ a network's decision-making by inverting its polarity via hyperplane reflection and measuring
6
+ the change in probe R².
7
+ """
8
+
9
+ from rl_interrogate.llm_polarity.model_loader import ModelLoader
10
+ from rl_interrogate.llm_polarity.dataset_builder import DatasetBuilder
11
+ from rl_interrogate.llm_polarity.direction_extractor import DirectionExtractor
12
+ from rl_interrogate.llm_polarity.probe_trainer import ProbeTrainer
13
+ from rl_interrogate.llm_polarity.hyperplane_reflector import HyperplaneReflector
14
+ from rl_interrogate.llm_polarity.output_analyzer import OutputAnalyzer
15
+ from rl_interrogate.llm_polarity.control_runner import ControlRunner
16
+ from rl_interrogate.llm_polarity.results_writer import ResultsWriter
17
+ from rl_interrogate.llm_polarity.data_models import (
18
+ ExperimentResult,
19
+ ControlResult,
20
+ TransferResult,
21
+ )
22
+
23
# Public API of the llm_polarity subpackage: pipeline components first,
# then the result data models.
__all__ = [
    # pipeline components
    "ModelLoader",
    "DatasetBuilder",
    "DirectionExtractor",
    "ProbeTrainer",
    "HyperplaneReflector",
    "OutputAnalyzer",
    "ControlRunner",
    "ResultsWriter",
    # data models
    "ExperimentResult",
    "ControlResult",
    "TransferResult",
]
@@ -0,0 +1,203 @@
1
+ """ControlRunner component for executing control experiments to validate PID signal."""
2
+
3
+ import numpy as np
4
+ from typing import Dict
5
+ from transformers import PreTrainedModel
6
+
7
+ from .probe_trainer import ProbeTrainer
8
+ from .hyperplane_reflector import HyperplaneReflector
9
+ from .model_loader import ModelLoader
10
+
11
+
12
class ControlRunner:
    """Execute control experiments to validate PID signal.

    This class implements control experiments that test whether the PID signal
    is spurious by using random directions, shuffled labels, and random
    (untrained-like) activations.

    Each control trains a ``ProbeTrainer`` probe, reflects the held-out test
    activations with ``HyperplaneReflector.reflect`` and reports the change in
    test R² (``ΔR² = R²_post - R²_pre``).

    Randomness is drawn from a local ``np.random.RandomState`` seeded with
    ``random_state``, so the process-wide NumPy RNG state is left untouched.
    """

    @staticmethod
    def _delta_r2_after_reflection(trainer, direction: np.ndarray, r2_pre: float) -> float:
        """Reflect the trainer's test set along ``direction`` and return ΔR²."""
        X_test_inv = HyperplaneReflector.reflect(trainer.X_test, direction)
        r2_post = trainer.evaluate(X_test_inv, trainer.y_test)
        return r2_post - r2_pre

    @staticmethod
    def _summarise(delta_r2_values, probe_r2_values) -> Dict[str, float]:
        """Aggregate per-trial ΔR² and pre-inversion R² into the result dict."""
        return {
            'mean_delta_r2': float(np.mean(delta_r2_values)),
            'std_delta_r2': float(np.std(delta_r2_values)),
            'mean_probe_r2': float(np.mean(probe_r2_values)),
        }

    @staticmethod
    def run_random_direction_control(
        model: PreTrainedModel,
        layer_idx: int,
        X: np.ndarray,
        y: np.ndarray,
        n_trials: int = 10,
        alpha: float = 1.0,
        test_size: float = 0.2,
        random_state: int = 42,
    ) -> Dict[str, float]:
        """Run PID with random unit directions.

        Args:
            model: Pre-trained language model (not used, kept for API consistency)
            layer_idx: Layer index (not used, kept for API consistency)
            X: Activations of shape (n_samples, d_model)
            y: Labels of shape (n_samples,)
            n_trials: Number of random direction trials
            alpha: Ridge regression regularization parameter
            test_size: Fraction of data for testing
            random_state: Random seed for reproducibility; trial ``t`` trains
                its probe with ``random_state + t``

        Returns:
            Dictionary with mean_delta_r2, std_delta_r2, mean_probe_r2

        Raises:
            ValueError: If ``n_trials`` is less than 1.
        """
        if n_trials < 1:
            raise ValueError("n_trials must be at least 1.")

        # Local generator: same stream as np.random.seed(random_state) but
        # without overwriting the global RNG state.
        rng = np.random.RandomState(random_state)
        d_model = X.shape[1]

        delta_r2_values = []
        probe_r2_values = []

        for trial in range(n_trials):
            # Train probe on the real labels
            trainer = ProbeTrainer(alpha=alpha, test_size=test_size, random_state=random_state + trial)
            trainer.fit(X, y)

            # Evaluate pre-inversion (only the test R² is used)
            _, r2_pre = trainer.evaluate_test_set()
            probe_r2_values.append(r2_pre)

            # Generate random unit direction
            random_direction = rng.randn(d_model)
            random_direction = random_direction / np.linalg.norm(random_direction)

            delta_r2_values.append(
                ControlRunner._delta_r2_after_reflection(trainer, random_direction, r2_pre)
            )

        return ControlRunner._summarise(delta_r2_values, probe_r2_values)

    @staticmethod
    def run_shuffled_labels_control(
        model: PreTrainedModel,
        layer_idx: int,
        X: np.ndarray,
        y: np.ndarray,
        n_trials: int = 10,
        alpha: float = 1.0,
        test_size: float = 0.2,
        random_state: int = 42,
    ) -> Dict[str, float]:
        """Run PID with shuffled labels.

        Args:
            model: Pre-trained language model (not used, kept for API consistency)
            layer_idx: Layer index (not used, kept for API consistency)
            X: Activations of shape (n_samples, d_model)
            y: Labels of shape (n_samples,); not modified
            n_trials: Number of shuffled label trials
            alpha: Ridge regression regularization parameter
            test_size: Fraction of data for testing
            random_state: Random seed for reproducibility; trial ``t`` trains
                its probe with ``random_state + t``

        Returns:
            Dictionary with mean_delta_r2, std_delta_r2, mean_probe_r2

        Raises:
            ValueError: If ``n_trials`` is less than 1.
        """
        if n_trials < 1:
            raise ValueError("n_trials must be at least 1.")

        # Local generator: same stream as np.random.seed(random_state) but
        # without overwriting the global RNG state.
        rng = np.random.RandomState(random_state)

        delta_r2_values = []
        probe_r2_values = []

        for trial in range(n_trials):
            # Shuffle a copy of the labels so the caller's array is untouched
            y_shuffled = np.array(y)
            rng.shuffle(y_shuffled)

            # Train probe with shuffled labels
            trainer = ProbeTrainer(alpha=alpha, test_size=test_size, random_state=random_state + trial)
            trainer.fit(X, y_shuffled)

            # Evaluate pre-inversion (only the test R² is used)
            _, r2_pre = trainer.evaluate_test_set()
            probe_r2_values.append(r2_pre)

            # Reflect along the probe's own direction
            probe_direction = trainer.get_probe_direction()
            delta_r2_values.append(
                ControlRunner._delta_r2_after_reflection(trainer, probe_direction, r2_pre)
            )

        return ControlRunner._summarise(delta_r2_values, probe_r2_values)

    @staticmethod
    def run_untrained_model_control(
        model_name: str,
        layer_idx: int,
        X: np.ndarray,
        y: np.ndarray,
        alpha: float = 1.0,
        test_size: float = 0.2,
        random_state: int = 42,
    ) -> Dict[str, float]:
        """Run PID on random activations standing in for an untrained model.

        Note: No untrained model is instantiated and the values of ``X`` are
        not used. Standard-normal activations with the same shape as ``X``
        are generated to simulate what an untrained model would produce.
        The key insight is that with random/untrained representations, the
        probe should have low R² and ΔR² should be near zero.

        Args:
            model_name: Model name (not used, kept for API consistency)
            layer_idx: Layer index (not used, kept for API consistency)
            X: Activations of shape (n_samples, d_model); only the shape is used
            y: Labels of shape (n_samples,)
            alpha: Ridge regression regularization parameter
            test_size: Fraction of data for testing
            random_state: Random seed for reproducibility

        Returns:
            Dictionary with mean_delta_r2, std_delta_r2, mean_probe_r2.
            This is a single trial, so std_delta_r2 is always 0.0.
        """
        # Local generator: same stream as np.random.seed(random_state) but
        # without overwriting the global RNG state.
        rng = np.random.RandomState(random_state)
        X_random = rng.randn(*X.shape)

        # Train probe on random activations
        trainer = ProbeTrainer(alpha=alpha, test_size=test_size, random_state=random_state)
        trainer.fit(X_random, y)

        # Evaluate pre-inversion (only the test R² is used)
        _, r2_pre = trainer.evaluate_test_set()

        # Reflect along the probe's own direction
        probe_direction = trainer.get_probe_direction()
        delta_r2 = ControlRunner._delta_r2_after_reflection(trainer, probe_direction, r2_pre)

        return {
            'mean_delta_r2': float(delta_r2),
            'std_delta_r2': 0.0,  # Single trial
            'mean_probe_r2': float(r2_pre),
        }
@@ -0,0 +1,86 @@
1
+ """Data models for LLM Polarity Inversion Validation experiments."""
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Optional
5
+
6
+
7
@dataclass
class ExperimentResult:
    """Record of a single PID experiment (one model, direction and layer).

    Fields are positional in the order declared below; each one is described
    by the comment next to it.
    """

    # --- what was run ---
    model_name: str  # name of the model, e.g. "gpt2", "gpt2-medium", "llama-3.2-1b"
    direction: str  # semantic direction, e.g. "sentiment", "factual", "odd_even", "toxicity"
    layer_idx: int  # absolute layer index
    layer_depth_pct: float  # layer depth as a fraction (0.0, 0.25, 0.50, 0.75, 1.0)

    # --- probe quality before / after inversion ---
    probe_r2: float  # R² before inversion
    probe_accuracy: float  # classification accuracy before inversion
    post_inversion_r2: float  # R² after inversion
    post_inversion_accuracy: float  # classification accuracy after inversion
    delta_r2: float  # R²_post - R²_pre

    # --- effect on model output ---
    top_token_change_rate: float  # fraction of inputs whose top token changed
    kl_divergence: float  # mean KL divergence
    perplexity_delta: float  # perplexity change

    # --- bookkeeping ---
    n_samples: int  # dataset size
    runtime_seconds: float  # experiment duration
41
+
42
+
43
@dataclass
class ControlResult:
    """Aggregated outcome of one control experiment.

    Fields are positional in the order declared below; each one is described
    by the comment next to it.
    """

    # --- what was run ---
    control_type: str  # "random_direction", "shuffled_labels" or "untrained"
    model_name: str  # name of the model
    direction: str  # semantic direction
    layer_idx: int  # absolute layer index

    # --- statistics across trials ---
    mean_delta_r2: float  # mean ΔR² across trials
    std_delta_r2: float  # standard deviation of ΔR² across trials
    mean_probe_r2: float  # mean probe R² across trials
    n_trials: int  # number of trials
65
+
66
+
67
@dataclass
class TransferResult:
    """Outcome of a cross-model probe transfer experiment.

    Fields are positional in the order declared below; each one is described
    by the comment next to it.
    """

    # --- what was run ---
    source_model: str  # model the probe was trained on
    target_model: str  # model the probe was tested on
    direction: str  # semantic direction
    layer_idx: int  # absolute layer index (matched by depth percentage)

    # --- transfer quality ---
    source_r2: float  # R² on the source model
    transfer_r2: float  # R² on the target model
    transfer_ratio: float  # transfer_r2 / source_r2