rl-interrogate 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rl_interrogate/__init__.py +25 -0
- rl_interrogate/ablation.py +106 -0
- rl_interrogate/env_wrappers.py +147 -0
- rl_interrogate/llm_polarity/__init__.py +35 -0
- rl_interrogate/llm_polarity/control_runner.py +203 -0
- rl_interrogate/llm_polarity/data_models.py +86 -0
- rl_interrogate/llm_polarity/dataset_builder.py +208 -0
- rl_interrogate/llm_polarity/direction_extractor.py +336 -0
- rl_interrogate/llm_polarity/hyperplane_reflector.py +106 -0
- rl_interrogate/llm_polarity/model_loader.py +120 -0
- rl_interrogate/llm_polarity/output_analyzer.py +245 -0
- rl_interrogate/llm_polarity/probe_trainer.py +147 -0
- rl_interrogate/llm_polarity/results_writer.py +295 -0
- rl_interrogate/multi_objective_ablation.py +174 -0
- rl_interrogate/multi_objective_pid.py +254 -0
- rl_interrogate/multi_objective_probing.py +226 -0
- rl_interrogate/multi_objective_training.py +343 -0
- rl_interrogate/multi_reward_halfcheetah.py +152 -0
- rl_interrogate/natural_harmful_utils.py +280 -0
- rl_interrogate/patching.py +75 -0
- rl_interrogate/pca_utils.py +38 -0
- rl_interrogate/polarity.py +180 -0
- rl_interrogate/policies/__init__.py +13 -0
- rl_interrogate/policies/recurrent_policy.py +211 -0
- rl_interrogate/probing.py +289 -0
- rl_interrogate/visualization.py +125 -0
- rl_interrogate-0.1.0.dist-info/METADATA +153 -0
- rl_interrogate-0.1.0.dist-info/RECORD +30 -0
- rl_interrogate-0.1.0.dist-info/WHEEL +4 -0
- rl_interrogate-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""rl_interrogate — Mechanistic interpretability toolkit for RL policies."""
|
|
2
|
+
|
|
3
|
+
from rl_interrogate.probing import LinearProbe, MLPProbe, LassoProbe, UntrainedBaseline
|
|
4
|
+
from rl_interrogate.ablation import AblationHook
|
|
5
|
+
from rl_interrogate.patching import ActivationPatcher
|
|
6
|
+
from rl_interrogate.polarity import PolarityTest
|
|
7
|
+
from rl_interrogate.pca_utils import fit_pca, project_subspace
|
|
8
|
+
from rl_interrogate.visualization import plot_probe_heatmap, plot_ablation_curve, plot_pca_scatter
|
|
9
|
+
from rl_interrogate.env_wrappers import NoisyObservationWrapper
|
|
10
|
+
|
|
11
|
+
# Public API of the top-level package, grouped by the submodule each name
# is imported from above.
__all__ = [
    # probing
    "LinearProbe",
    "MLPProbe",
    "LassoProbe",
    "UntrainedBaseline",
    # ablation / patching / polarity
    "AblationHook",
    "ActivationPatcher",
    "PolarityTest",
    # pca_utils
    "fit_pca",
    "project_subspace",
    # visualization
    "plot_probe_heatmap",
    "plot_ablation_curve",
    "plot_pca_scatter",
    # env_wrappers
    "NoisyObservationWrapper",
]
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""AblationHook: scales projection of activations onto a subspace by alpha."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from contextlib import contextmanager
|
|
6
|
+
from typing import Generator
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import torch
|
|
10
|
+
import torch.nn as nn
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class AblationHook:
    """Scale the projection of a layer's output onto a subspace by ``alpha``.

    A forward hook is registered on ``model[layer_idx]`` as soon as the object
    is constructed. The hook replaces the layer output ``h`` with::

        proj  = h @ W @ W.T
        h_new = h - proj + alpha * proj

    ``alpha=0.0`` removes the subspace component entirely; ``alpha=1.0`` leaves
    the activations (numerically) unchanged.

    Args:
        model: Module whose layers are accessible by integer index
            (``model[layer_idx]``), e.g. a ``torch.nn.Sequential``.
        layer_idx: Index of the layer on which to register the hook.
        W: Subspace basis of shape ``(d_hidden, k)`` or a 1-D direction of shape
            ``(d_hidden,)``. A 1-D input is normalised to unit length (unless its
            norm is below 1e-9) and reshaped to ``(d_hidden, 1)``. A 2-D input is
            used as given, so its columns should already be orthonormal.
        alpha: Scaling factor used by the hook registered at construction time.
            Can be overridden temporarily via :meth:`apply`.
        direction: Keyword-only alias for ``W``; ignored when ``W`` is given.

    Raises:
        ValueError: If neither ``W`` nor ``direction`` is given, or if the basis
            is not 1-D or 2-D.

    Example::

        hook = AblationHook(policy_net, layer_idx=5, direction=probe.coef_)
        with hook.apply(alpha=0.0):
            rewards = run_episodes(model, env, n=100)
    """

    def __init__(
        self,
        model: nn.Module,
        layer_idx: int,
        W: np.ndarray | None = None,
        alpha: float = 0.0,
        *,
        direction: np.ndarray | None = None,
    ) -> None:
        # W takes precedence when both W and direction are supplied.
        basis = W if W is not None else direction
        if basis is None:
            raise ValueError("Provide either W (subspace basis) or direction (1-D vector).")

        basis_arr = np.asarray(basis, dtype=np.float32)
        if basis_arr.ndim == 1:
            # Single direction: normalise and reshape to (d, 1). A (near-)zero
            # vector is left as is, which makes the projection a no-op.
            norm = float(np.linalg.norm(basis_arr))
            if norm > 1e-9:
                basis_arr = basis_arr / norm
            basis_arr = basis_arr[:, None]
        elif basis_arr.ndim != 2:
            # Fail here rather than with an opaque matmul error inside the hook.
            raise ValueError(
                f"Basis must be 1-D or 2-D, got an array with {basis_arr.ndim} dimensions."
            )

        # Match the basis to the model's parameters; fall back to CPU and the
        # default dtype when the model has no parameters at all.
        first_param = next(model.parameters(), None)
        if first_param is None:
            device = torch.device("cpu")
            dtype = torch.get_default_dtype()
        else:
            device = first_param.device
            dtype = first_param.dtype

        self._W = torch.tensor(basis_arr, dtype=dtype, device=device)  # (d_hidden, k)
        self._alpha = alpha
        self._model = model
        self._layer_idx = layer_idx
        self._hook = model[layer_idx].register_forward_hook(self._ablate)
        # Tracks whether the persistent hook is currently registered, so that
        # apply() does not resurrect a hook the caller explicitly removed.
        self._active = True

    def _ablate(
        self,
        module: nn.Module,
        input: tuple,
        output: torch.Tensor,
    ) -> torch.Tensor:
        """Forward hook: scale the subspace projection of ``output`` by alpha."""
        proj = output @ self._W @ self._W.T
        return output - proj + self._alpha * proj

    def remove(self) -> None:
        """Deregister the forward hook, restoring normal forward-pass behaviour."""
        self._hook.remove()
        self._active = False

    @contextmanager
    def apply(self, alpha: float) -> Generator[None, None, None]:
        """Context manager that temporarily applies ablation at the given alpha.

        On entry any existing hook is removed and a new one using *alpha* is
        registered. On exit that hook is removed and the previous alpha is
        restored. The persistent hook is re-registered only if it was
        registered when the context was entered, so a hook previously
        detached with :meth:`remove` stays detached.

        Args:
            alpha: Ablation strength. 0.0 = full ablation, 1.0 = no-op.

        Yields:
            None
        """
        was_active = self._active
        old_alpha = self._alpha

        self._hook.remove()
        self._alpha = alpha
        self._hook = self._model[self._layer_idx].register_forward_hook(self._ablate)
        self._active = True
        try:
            yield
        finally:
            self._hook.remove()
            self._alpha = old_alpha
            if was_active:
                self._hook = self._model[self._layer_idx].register_forward_hook(self._ablate)
            self._active = was_active
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""
|
|
2
|
+
rl_interrogate/env_wrappers.py
|
|
3
|
+
-------------------------------
|
|
4
|
+
Gymnasium environment wrappers for formation-flight policy evaluation.
|
|
5
|
+
|
|
6
|
+
No Isaac Lab dependency — only imports gymnasium, numpy, and standard library.
|
|
7
|
+
Importable from rl_interrogate without activating the Isaac Lab environment.
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
from rl_interrogate.env_wrappers import NoisyObservationWrapper
|
|
11
|
+
|
|
12
|
+
env = NoisyObservationWrapper(
|
|
13
|
+
base_env,
|
|
14
|
+
lateral_noise_std=0.05,
|
|
15
|
+
sensor_zero_prob=0.10,
|
|
16
|
+
rel_lead_delay=2,
|
|
17
|
+
seed=0,
|
|
18
|
+
)
|
|
19
|
+
obs, info = env.reset()
|
|
20
|
+
obs, reward, terminated, truncated, info = env.step(action)
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
import numpy as np
|
|
24
|
+
import gymnasium
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class NoisyObservationWrapper(gymnasium.Wrapper):
    """
    Adds sensor noise and delays to a formation-flight gymnasium environment.
    NO Isaac Lab dependency — only imports gymnasium, numpy.

    Three perturbations are applied to each observation:
      1. Gaussian noise on the lateral position (index 15 per follower).
      2. Random zeroing of the upwash sensor (index 12) with probability
         ``sensor_zero_prob`` per step.
      3. Temporal delay of rel_to_lead observations (indices 9–11) by
         ``rel_lead_delay`` timesteps, using a circular buffer initialised to
         zeros at reset. A delay of 0 leaves these indices untouched.

    For batched observations a single noise sample and a single zeroing
    decision are drawn per call and applied to every row.

    Args:
        env: The wrapped gymnasium environment.
        lateral_noise_std: Std of Gaussian noise on lateral position obs (default 0.05).
        sensor_zero_prob: Probability per step of zeroing upwash sensor index 12 (default 0.10).
        rel_lead_delay: Number of timesteps to delay rel_to_lead obs indices 9-11 (default 2).
        seed: RNG seed for reproducibility (default 0).

    Raises:
        ValueError: If ``rel_lead_delay`` is negative.
    """

    UPWASH_IDX: int = 12
    REL_LEAD_SLICE: slice = slice(9, 12)
    LATERAL_IDX: int = 15  # lateral y position index in per-follower obs

    def __init__(
        self,
        env: gymnasium.Env,
        lateral_noise_std: float = 0.05,
        sensor_zero_prob: float = 0.10,
        rel_lead_delay: int = 2,
        seed: int = 0,
    ) -> None:
        super().__init__(env)
        if rel_lead_delay < 0:
            raise ValueError(f"rel_lead_delay must be >= 0, got {rel_lead_delay}")
        self.lateral_noise_std = lateral_noise_std
        self.sensor_zero_prob = sensor_zero_prob
        self.rel_lead_delay = rel_lead_delay
        self._rng = np.random.default_rng(seed)

        # Circular buffer — re-zeroed in reset(), and re-shaped on demand in
        # _apply_noise() when observations turn out to be batched.
        self._delay_buffer = np.zeros((self.rel_lead_delay, 3), dtype=np.float32)
        self._buf_ptr: int = 0

    # ------------------------------------------------------------------
    # Public gymnasium.Wrapper interface
    # ------------------------------------------------------------------

    def reset(self, **kwargs):
        """Reset env, zero delay buffer, apply noise to initial obs."""
        # Zero the delay buffer and reset pointer
        self._delay_buffer = np.zeros((self.rel_lead_delay, 3), dtype=np.float32)
        self._buf_ptr = 0

        result = self.env.reset(**kwargs)

        # reset() may return (obs, info) or just obs depending on the env
        if isinstance(result, tuple):
            obs, *rest = result
            obs = self._apply_noise(self._to_numpy(obs))
            return (obs, *rest)

        return self._apply_noise(self._to_numpy(result))

    def step(self, action):
        """Step env, apply noise/delay to obs, return modified obs."""
        # The first element of the step result is the observation; the
        # remaining elements are passed through untouched.
        obs, *rest = self.env.step(action)
        obs = self._apply_noise(self._to_numpy(obs))
        return (obs, *rest)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _to_numpy(self, obs) -> np.ndarray:
        """Convert obs to a writable numpy float32 array if needed."""
        try:
            import torch
            if isinstance(obs, torch.Tensor):
                return obs.detach().cpu().numpy().astype(np.float32)
        except ImportError:
            # torch is optional; without it obs cannot be a tensor.
            pass
        arr = np.asarray(obs, dtype=np.float32)
        return arr.copy()  # ensure writeable

    def _apply_noise(self, obs: np.ndarray) -> np.ndarray:
        """
        Apply all three perturbations to a single observation array, in place.

        Modifies ONLY indices 9–12 and index 15 (per follower).
        All other indices are left unchanged.

        Args:
            obs: writable numpy float32 array of shape (obs_dim,) or
                (batch, obs_dim).

        Returns:
            The same obs array, modified (same shape and dtype).
        """
        rel = self.REL_LEAD_SLICE

        if self.rel_lead_delay > 0:
            current = obs[..., rel]

            # The buffer starts out sized for unbatched obs. If the incoming
            # slice has a different shape (batched obs), start over with a
            # zeroed buffer of the right shape instead of failing to broadcast.
            expected_shape = (self.rel_lead_delay, *current.shape)
            if self._delay_buffer.shape != expected_shape:
                self._delay_buffer = np.zeros(expected_shape, dtype=np.float32)
                self._buf_ptr = 0

            # --- 1. Delay: read oldest entry from circular buffer ---
            delayed = self._delay_buffer[self._buf_ptr].copy()

            # --- 2. Write current rel_to_lead into the buffer slot ---
            self._delay_buffer[self._buf_ptr] = current

            # --- 3. Advance pointer ---
            self._buf_ptr = (self._buf_ptr + 1) % self.rel_lead_delay

            # --- 4. Replace obs[9:12] with the delayed value ---
            obs[..., rel] = delayed

        # --- 5. Add Gaussian noise to lateral y (index 15) ---
        obs[..., self.LATERAL_IDX] += self._rng.normal(0, self.lateral_noise_std)

        # --- 6. Zero upwash sensor (index 12) with prob sensor_zero_prob ---
        if self._rng.random() < self.sensor_zero_prob:
            obs[..., self.UPWASH_IDX] = 0.0

        return obs
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""LLM Polarity Inversion Validation Module
|
|
2
|
+
|
|
3
|
+
This module implements cross-domain validation of the Polarity Inversion Discriminant (PID)
|
|
4
|
+
applied to Large Language Models. The PID tests whether a feature is causally wired into
|
|
5
|
+
a network's decision-making by inverting its polarity via hyperplane reflection and measuring
|
|
6
|
+
the change in probe R².
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from rl_interrogate.llm_polarity.model_loader import ModelLoader
|
|
10
|
+
from rl_interrogate.llm_polarity.dataset_builder import DatasetBuilder
|
|
11
|
+
from rl_interrogate.llm_polarity.direction_extractor import DirectionExtractor
|
|
12
|
+
from rl_interrogate.llm_polarity.probe_trainer import ProbeTrainer
|
|
13
|
+
from rl_interrogate.llm_polarity.hyperplane_reflector import HyperplaneReflector
|
|
14
|
+
from rl_interrogate.llm_polarity.output_analyzer import OutputAnalyzer
|
|
15
|
+
from rl_interrogate.llm_polarity.control_runner import ControlRunner
|
|
16
|
+
from rl_interrogate.llm_polarity.results_writer import ResultsWriter
|
|
17
|
+
from rl_interrogate.llm_polarity.data_models import (
|
|
18
|
+
ExperimentResult,
|
|
19
|
+
ControlResult,
|
|
20
|
+
TransferResult,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
# Public API of the llm_polarity subpackage: pipeline components first,
# then the result data models.
__all__ = [
    # pipeline components
    "ModelLoader",
    "DatasetBuilder",
    "DirectionExtractor",
    "ProbeTrainer",
    "HyperplaneReflector",
    "OutputAnalyzer",
    "ControlRunner",
    "ResultsWriter",
    # data models
    "ExperimentResult",
    "ControlResult",
    "TransferResult",
]
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
"""ControlRunner component for executing control experiments to validate PID signal."""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from typing import Dict
|
|
5
|
+
from transformers import PreTrainedModel
|
|
6
|
+
|
|
7
|
+
from .probe_trainer import ProbeTrainer
|
|
8
|
+
from .hyperplane_reflector import HyperplaneReflector
|
|
9
|
+
from .model_loader import ModelLoader
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ControlRunner:
    """Execute control experiments to validate PID signal.

    This class implements control experiments that test whether the PID signal
    is spurious by using random directions, shuffled labels, and random
    (stand-in for untrained-model) activations.

    Every control draws its randomness from a private, seeded
    ``np.random.RandomState`` so that the process-wide NumPy RNG is left
    untouched.
    """

    @staticmethod
    def _post_reflection_r2(trainer: ProbeTrainer, direction: np.ndarray) -> float:
        """Reflect the trainer's held-out activations across ``direction`` and re-score the probe."""
        X_test_inv = HyperplaneReflector.reflect(trainer.X_test, direction)
        return trainer.evaluate(X_test_inv, trainer.y_test)

    @staticmethod
    def _summarize(delta_r2_values: list, probe_r2_values: list) -> Dict[str, float]:
        """Collapse per-trial values into the summary dictionary returned by the controls."""
        return {
            'mean_delta_r2': float(np.mean(delta_r2_values)),
            'std_delta_r2': float(np.std(delta_r2_values)),
            'mean_probe_r2': float(np.mean(probe_r2_values)),
        }

    @staticmethod
    def run_random_direction_control(
        model: PreTrainedModel,
        layer_idx: int,
        X: np.ndarray,
        y: np.ndarray,
        n_trials: int = 10,
        alpha: float = 1.0,
        test_size: float = 0.2,
        random_state: int = 42,
    ) -> Dict[str, float]:
        """Run PID with random unit directions.

        For each trial a fresh probe is fitted (seeded with
        ``random_state + trial``), the held-out activations are reflected
        across a random unit direction, and the change in test R² is recorded.

        Args:
            model: Pre-trained language model (not used, kept for API consistency)
            layer_idx: Layer index (not used, kept for API consistency)
            X: Activations of shape (n_samples, d_model)
            y: Labels of shape (n_samples,)
            n_trials: Number of random direction trials
            alpha: Ridge regression regularization parameter
            test_size: Fraction of data for testing
            random_state: Random seed for reproducibility

        Returns:
            Dictionary with mean_delta_r2, std_delta_r2, mean_probe_r2
        """
        delta_r2_values = []
        probe_r2_values = []

        # Local legacy RNG: same stream as np.random.seed(random_state) would
        # give, without mutating global state.
        rng = np.random.RandomState(random_state)
        d_model = X.shape[1]

        for trial in range(n_trials):
            # Train probe
            trainer = ProbeTrainer(alpha=alpha, test_size=test_size, random_state=random_state + trial)
            trainer.fit(X, y)

            # Evaluate pre-inversion (only the test-set R² is used)
            _, r2_pre = trainer.evaluate_test_set()
            probe_r2_values.append(r2_pre)

            # Generate random unit direction
            random_direction = rng.randn(d_model)
            random_direction = random_direction / np.linalg.norm(random_direction)

            # Reflect the test set and compute ΔR²
            r2_post = ControlRunner._post_reflection_r2(trainer, random_direction)
            delta_r2_values.append(r2_post - r2_pre)

        return ControlRunner._summarize(delta_r2_values, probe_r2_values)

    @staticmethod
    def run_shuffled_labels_control(
        model: PreTrainedModel,
        layer_idx: int,
        X: np.ndarray,
        y: np.ndarray,
        n_trials: int = 10,
        alpha: float = 1.0,
        test_size: float = 0.2,
        random_state: int = 42,
    ) -> Dict[str, float]:
        """Run PID with shuffled labels.

        For each trial the labels are permuted, a probe is fitted on the
        permuted labels, and the held-out activations are reflected across
        that probe's own direction.

        Args:
            model: Pre-trained language model (not used, kept for API consistency)
            layer_idx: Layer index (not used, kept for API consistency)
            X: Activations of shape (n_samples, d_model)
            y: Labels of shape (n_samples,)
            n_trials: Number of shuffled label trials
            alpha: Ridge regression regularization parameter
            test_size: Fraction of data for testing
            random_state: Random seed for reproducibility

        Returns:
            Dictionary with mean_delta_r2, std_delta_r2, mean_probe_r2
        """
        delta_r2_values = []
        probe_r2_values = []

        # Local legacy RNG: same stream as np.random.seed(random_state) would
        # give, without mutating global state.
        rng = np.random.RandomState(random_state)

        for trial in range(n_trials):
            # Shuffle a copy so the caller's labels are never modified
            y_shuffled = y.copy()
            rng.shuffle(y_shuffled)

            # Train probe with shuffled labels
            trainer = ProbeTrainer(alpha=alpha, test_size=test_size, random_state=random_state + trial)
            trainer.fit(X, y_shuffled)

            # Evaluate pre-inversion (only the test-set R² is used)
            _, r2_pre = trainer.evaluate_test_set()
            probe_r2_values.append(r2_pre)

            # Reflect across the probe's own direction and compute ΔR²
            probe_direction = trainer.get_probe_direction()
            r2_post = ControlRunner._post_reflection_r2(trainer, probe_direction)
            delta_r2_values.append(r2_post - r2_pre)

        return ControlRunner._summarize(delta_r2_values, probe_r2_values)

    @staticmethod
    def run_untrained_model_control(
        model_name: str,
        layer_idx: int,
        X: np.ndarray,
        y: np.ndarray,
        alpha: float = 1.0,
        test_size: float = 0.2,
        random_state: int = 42,
    ) -> Dict[str, float]:
        """Run PID on random activations standing in for an untrained model.

        Note: No untrained model is loaded or run here. The values in ``X``
        are not used; only ``X.shape`` is, to draw standard-normal random
        activations of the same shape. The probe is trained on those random
        activations against the real labels ``y``. The expectation is that
        with such representations the probe has low R² and ΔR² is near zero.

        Args:
            model_name: Model name (not used in the computation)
            layer_idx: Layer index (not used in the computation)
            X: Activations of shape (n_samples, d_model); only the shape is used
            y: Labels of shape (n_samples,)
            alpha: Ridge regression regularization parameter
            test_size: Fraction of data for testing
            random_state: Random seed for reproducibility

        Returns:
            Dictionary with mean_delta_r2, std_delta_r2, mean_probe_r2.
            This is a single trial, so std_delta_r2 is always 0.0.
        """
        # Local legacy RNG: same stream as np.random.seed(random_state) would
        # give, without mutating global state.
        rng = np.random.RandomState(random_state)
        X_random = rng.randn(*X.shape)

        # Train probe on random activations
        trainer = ProbeTrainer(alpha=alpha, test_size=test_size, random_state=random_state)
        trainer.fit(X_random, y)

        # Evaluate pre-inversion (only the test-set R² is used)
        _, r2_pre = trainer.evaluate_test_set()

        # Reflect across the probe's own direction and compute ΔR²
        probe_direction = trainer.get_probe_direction()
        r2_post = ControlRunner._post_reflection_r2(trainer, probe_direction)
        delta_r2 = r2_post - r2_pre

        return {
            'mean_delta_r2': float(delta_r2),
            'std_delta_r2': 0.0,  # Single trial
            'mean_probe_r2': float(r2_pre),
        }
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Data models for LLM Polarity Inversion Validation experiments."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class ExperimentResult:
    """Record of one PID experiment for a (model, direction, layer) combination.

    Fields are grouped as: experiment identity, probe metrics before and
    after inversion, output-level metrics, and bookkeeping.
    """

    # --- identity ---
    model_name: str  # e.g. "gpt2", "gpt2-medium", "llama-3.2-1b"
    direction: str  # semantic direction, e.g. "sentiment", "factual", "odd_even", "toxicity"
    layer_idx: int  # absolute layer index
    layer_depth_pct: float  # layer depth as a percentage (0.0, 0.25, 0.50, 0.75, 1.0)

    # --- probe metrics ---
    probe_r2: float  # R² before inversion
    probe_accuracy: float  # classification accuracy before inversion
    post_inversion_r2: float  # R² after inversion
    post_inversion_accuracy: float  # classification accuracy after inversion
    delta_r2: float  # R²_post - R²_pre

    # --- output-level metrics ---
    top_token_change_rate: float  # fraction of inputs whose top token changed
    kl_divergence: float  # mean KL divergence
    perplexity_delta: float  # perplexity change

    # --- bookkeeping ---
    n_samples: int  # dataset size
    runtime_seconds: float  # experiment duration
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class ControlResult:
    """Aggregated outcome of one control experiment."""

    control_type: str  # "random_direction", "shuffled_labels" or "untrained"
    model_name: str  # name of the model
    direction: str  # semantic direction
    layer_idx: int  # absolute layer index
    mean_delta_r2: float  # mean ΔR² across trials
    std_delta_r2: float  # standard deviation of ΔR² across trials
    mean_probe_r2: float  # mean probe R² across trials
    n_trials: int  # number of trials
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass
class TransferResult:
    """Outcome of applying a probe trained on one model to another model."""

    source_model: str  # model the probe was trained on
    target_model: str  # model the probe was tested on
    direction: str  # semantic direction
    layer_idx: int  # absolute layer index (matched by depth percentage)
    source_r2: float  # R² on the source model
    transfer_r2: float  # R² on the target model
    transfer_ratio: float  # transfer_r2 / source_r2
|