PyPI - rl-interrogate - Versions diffs - 0.1.0__tar.gz - Mend

rl-interrogate 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

rl_interrogate-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,22 @@
+# Python
+__pycache__/
+*.py[cod]
+# Environments
+/.venv
+/.env
+miniconda/
+miniconda3/
+Miniconda3-latest-Linux-x86_64.sh
+# Jupyter
+.ipynb_checkpoints/
+# OS
+.DS_Store
+Thumbs.db
+# IDEs
+.vscode/
+.idea/

rl_interrogate-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2025 Ansh Arora
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

rl_interrogate-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,153 @@
+Metadata-Version: 2.4
+Name: rl-interrogate
+Version: 0.1.0
+Summary: Mechanistic interpretability toolkit for RL policies
+Project-URL: Homepage, https://github.com/aroransh/rl_interrogate
+Project-URL: Repository, https://github.com/aroransh/rl_interrogate
+Project-URL: Documentation, https://github.com/aroransh/rl_interrogate#readme
+Author: Ansh Arora
+License-Expression: MIT
+License-File: LICENSE
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.9
+Requires-Dist: gymnasium
+Requires-Dist: matplotlib
+Requires-Dist: numpy
+Requires-Dist: scikit-learn
+Requires-Dist: seaborn
+Requires-Dist: stable-baselines3
+Requires-Dist: torch>=2.0
+Provides-Extra: dev
+Requires-Dist: hypothesis; extra == 'dev'
+Requires-Dist: pytest; extra == 'dev'
+Provides-Extra: mujoco
+Requires-Dist: mujoco; extra == 'mujoco'
+Description-Content-Type: text/markdown
+# rl_interrogate
+Mechanistic interpretability toolkit for RL policies. Probe, ablate, and interrogate
+what your policy has learned — not just how well it performs.
+## Installation
+```bash
+pip install -e .
+```
+Dependencies: `torch`, `numpy`, `scikit-learn`, `matplotlib`, `seaborn`, `gymnasium`,
+`stable-baselines3`. MuJoCo environments require the `mujoco` extra:
+```bash
+pip install -e ".[mujoco]"
+```
+## Minimal Example
+Load a checkpoint, run a probe, run ablation:
+```python
+import torch
+import numpy as np
+from rl_interrogate import LinearProbe, AblationHook
+# 1. Load your policy network (any torch.nn.Sequential)
+policy_net = torch.load("my_policy.pt")
+policy_net.eval()
+# 2. Build a synthetic observation grid
+obs_grid = np.random.randn(500, 28).astype(np.float32)
+labels = obs_grid[:, 10]  # probe for lateral position
+# 3. Linear probe at layer 5
+probe = LinearProbe()
+probe.fit(policy_net, layer_idx=5, obs_dataset=obs_grid, labels=labels)
+print(f"Layer 5 R² = {probe.score():.4f}")
+# 4. Ablation: zero the probe direction, measure performance change
+hook = AblationHook(policy_net, layer_idx=5, direction=probe._probe.coef_)
+with hook.apply(alpha=0.0):
+    # run your environment here — the probe direction is zeroed
+    pass
+```
+## Experiments
+The library was developed for the WakeRider paper (TMLR submission). Key experiments:
+- **Formation flight probe** (`examples/formation_flight_probe.py`): Reproduces
+  Actor L5 R²=0.973 from the seed-42 checkpoint.
+- **HalfCheetah ablation** (`examples/halfcheetah_ablation.py`): Runs ablation on
+  a HalfCheetah-v4 policy, showing PC1 ablation degrades performance by ~10%.
+## API Reference
+### Probing
+```python
+from rl_interrogate import LinearProbe, MLPProbe, LassoProbe
+# Ridge regression probe (recommended)
+probe = LinearProbe()
+probe.fit(model, layer_idx=5, obs_dataset=obs, labels=y)
+r2 = probe.score()
+# MLP probe (non-linear)
+mlp_probe = MLPProbe()
+mlp_probe.fit(model, layer_idx=5, obs_dataset=obs, labels=y)
+# Sparse Lasso probe
+lasso = LassoProbe()
+lasso.fit(model, layer_idx=5, obs_dataset=obs, labels=y)
+r2, n_nonzero = lasso.score()
+```
+### Ablation
+```python
+from rl_interrogate import AblationHook
+hook = AblationHook(policy_net, layer_idx=5, direction=probe_direction)
+with hook.apply(alpha=0.0):   # alpha=0 zeros the direction
+    rewards = run_episodes(model, env, n=100)
+```
+### PCA Utilities
+```python
+from rl_interrogate import fit_pca, project_subspace
+pca = fit_pca(activations, n_components=20)
+acts_k = project_subspace(activations, pca, k=1)  # rank-1 projection
+```
+### Visualization
+```python
+from rl_interrogate import plot_probe_heatmap, plot_ablation_curve
+plot_probe_heatmap(activations, labels, title="Layer 5 probe")
+plot_ablation_curve(alphas=[0.0, 0.5, 1.0], means=[1.05, 0.97, 0.90])
+```
+## Running Tests
+```bash
+pytest rl_interrogate/tests/ -v
+```
+## Link to Paper
+This library implements the interrogation protocol described in:
+> *WakeRider: Emergent V-Formation Flight via Wake Exploitation*
+> Section 3.3: The rl_interrogate Library
+The protocol consists of four steps:
+1. **Linear probing** — fit Ridge regression from hidden activations to a field label
+2. **Polarity inversion** — negate the sensor; verify R² drops (causal, not correlational)
+3. **Single-direction ablation** — zero the probe direction; measure performance change
+4. **Subspace variance** — greedy PCA selection to find the minimal sufficient subspace

rl_interrogate-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,124 @@
+# rl_interrogate
+Mechanistic interpretability toolkit for RL policies. Probe, ablate, and interrogate
+what your policy has learned — not just how well it performs.
+## Installation
+```bash
+pip install -e .
+```
+Dependencies: `torch`, `numpy`, `scikit-learn`, `matplotlib`, `seaborn`, `gymnasium`,
+`stable-baselines3`. MuJoCo environments require the `mujoco` extra:
+```bash
+pip install -e ".[mujoco]"
+```
+## Minimal Example
+Load a checkpoint, run a probe, run ablation:
+```python
+import torch
+import numpy as np
+from rl_interrogate import LinearProbe, AblationHook
+# 1. Load your policy network (any torch.nn.Sequential)
+policy_net = torch.load("my_policy.pt")
+policy_net.eval()
+# 2. Build a synthetic observation grid
+obs_grid = np.random.randn(500, 28).astype(np.float32)
+labels = obs_grid[:, 10]  # probe for lateral position
+# 3. Linear probe at layer 5
+probe = LinearProbe()
+probe.fit(policy_net, layer_idx=5, obs_dataset=obs_grid, labels=labels)
+print(f"Layer 5 R² = {probe.score():.4f}")
+# 4. Ablation: zero the probe direction, measure performance change
+hook = AblationHook(policy_net, layer_idx=5, direction=probe._probe.coef_)
+with hook.apply(alpha=0.0):
+    # run your environment here — the probe direction is zeroed
+    pass
+```
+## Experiments
+The library was developed for the WakeRider paper (TMLR submission). Key experiments:
+- **Formation flight probe** (`examples/formation_flight_probe.py`): Reproduces
+  Actor L5 R²=0.973 from the seed-42 checkpoint.
+- **HalfCheetah ablation** (`examples/halfcheetah_ablation.py`): Runs ablation on
+  a HalfCheetah-v4 policy, showing PC1 ablation degrades performance by ~10%.
+## API Reference
+### Probing
+```python
+from rl_interrogate import LinearProbe, MLPProbe, LassoProbe
+# Ridge regression probe (recommended)
+probe = LinearProbe()
+probe.fit(model, layer_idx=5, obs_dataset=obs, labels=y)
+r2 = probe.score()
+# MLP probe (non-linear)
+mlp_probe = MLPProbe()
+mlp_probe.fit(model, layer_idx=5, obs_dataset=obs, labels=y)
+# Sparse Lasso probe
+lasso = LassoProbe()
+lasso.fit(model, layer_idx=5, obs_dataset=obs, labels=y)
+r2, n_nonzero = lasso.score()
+```
+### Ablation
+```python
+from rl_interrogate import AblationHook
+hook = AblationHook(policy_net, layer_idx=5, direction=probe_direction)
+with hook.apply(alpha=0.0):   # alpha=0 zeros the direction
+    rewards = run_episodes(model, env, n=100)
+```
+### PCA Utilities
+```python
+from rl_interrogate import fit_pca, project_subspace
+pca = fit_pca(activations, n_components=20)
+acts_k = project_subspace(activations, pca, k=1)  # rank-1 projection
+```
+### Visualization
+```python
+from rl_interrogate import plot_probe_heatmap, plot_ablation_curve
+plot_probe_heatmap(activations, labels, title="Layer 5 probe")
+plot_ablation_curve(alphas=[0.0, 0.5, 1.0], means=[1.05, 0.97, 0.90])
+```
+## Running Tests
+```bash
+pytest rl_interrogate/tests/ -v
+```
+## Link to Paper
+This library implements the interrogation protocol described in:
+> *WakeRider: Emergent V-Formation Flight via Wake Exploitation*
+> Section 3.3: The rl_interrogate Library
+The protocol consists of four steps:
+1. **Linear probing** — fit Ridge regression from hidden activations to a field label
+2. **Polarity inversion** — negate the sensor; verify R² drops (causal, not correlational)
+3. **Single-direction ablation** — zero the probe direction; measure performance change
+4. **Subspace variance** — greedy PCA selection to find the minimal sufficient subspace

rl_interrogate-0.1.0/configs/default_probe_config.yaml ADDED Viewed

@@ -0,0 +1,9 @@
+probe_alpha: 1.0
+test_split: 0.2
+random_state: 42
+n_boot: 1000
+pca_k:
+  - 1
+  - 5
+  - 10
+  - 20

rl_interrogate-0.1.0/examples/formation_flight_probe.py ADDED Viewed

@@ -0,0 +1,160 @@
+"""Example: Actor L5 probe on formation flight seed-42 checkpoint.
+Demonstrates reproducing R²=0.9730 from the seed-42 checkpoint using
+LinearProbe from rl_interrogate.
+Usage::
+    python rl_interrogate/examples/formation_flight_probe.py
+"""
+from __future__ import annotations
+import os
+import sys
+import numpy as np
+import torch
+import torch.nn as nn
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
+from rl_interrogate import LinearProbe
+# ── Constants ─────────────────────────────────────────────────────────────────
+# Checkpoint file loaded by main(); the relative path is resolved against the
+# current working directory.
+CHECKPOINT_PATH = "runs/formation/formation_seed_42/checkpoints/best_agent.pt"
+# Layer index handed to LinearProbe.fit() in main().
+LAYER_IDX = 5
+# R² this example is expected to reproduce, and the absolute deviation main()
+# accepts before printing a warning.
+EXPECTED_R2 = 0.9730
+TOLERANCE = 0.01
+# Fallback observation width used when the checkpoint has no "net.0.weight".
+OBS_PER_FOLLOWER = 28
+# Output width of the actor network's final Linear layer.
+ACT_PER_FOLLOWER = 4
+# Arguments forwarded to compute_upwash_field_batched() in _build_obs_grid().
+DRONE_WINGSPAN_M = 1.5
+DRONE_WEIGHT_N = 2.0 * 9.81  # presumably a 2 kg mass times g — TODO confirm
+AIRSPEED_MS = 15.0
+# z-coordinate shared by the leader and every follower grid point.
+TARGET_ALTITUDE = 10.0
+# Use the GPU when CUDA is available, otherwise fall back to the CPU.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+def _build_actor_net(obs_dim: int, state_dict: dict) -> nn.Sequential:
+    net = nn.Sequential(
+        nn.Linear(obs_dim, 256), nn.ELU(),
+        nn.Linear(256, 256),     nn.ELU(),
+        nn.Linear(256, 128),     nn.ELU(),
+        nn.Linear(128, ACT_PER_FOLLOWER),
+    ).to(DEVICE)
+    net_sd = {k[len("net."):]: v for k, v in state_dict.items() if k.startswith("net.")}
+    if net_sd:
+        net.load_state_dict(net_sd, strict=True)
+    net.eval()
+    return net
+def _build_obs_grid(obs_dim: int, raw_ckpt: dict):
+    """Build the 80×120 synthetic observation grid and true upwash labels."""
+    try:
+        from source.drone_formation.physics.wake_model_gpu import compute_upwash_field_batched
+    except ImportError:
+        raise ImportError(
+            "source.drone_formation is required to build the obs grid. "
+            "Run this script from the workspace root."
+        )
+    X_RANGE = np.linspace(-6.0, 0.0, 80)
+    Y_RANGE = np.linspace(-3.0, 3.0, 120)
+    Z_VAL = TARGET_ALTITUDE
+    lead_pos_np = np.array([0.0, 0.0, Z_VAL])
+    grid_x, grid_y = np.meshgrid(X_RANGE, Y_RANGE, indexing="ij")
+    N = grid_x.size
+    foll_pos_t = torch.tensor(
+        np.stack([
+            grid_x.ravel().astype(np.float32),
+            grid_y.ravel().astype(np.float32),
+            np.full(N, Z_VAL, dtype=np.float32),
+        ], axis=1),
+        device=DEVICE,
+    ).unsqueeze(1)
+    lead_pos_t = torch.tensor(lead_pos_np, dtype=torch.float32, device=DEVICE).unsqueeze(0).expand(N, -1)
+    with torch.no_grad():
+        upwash_t = compute_upwash_field_batched(
+            foll_pos_t, lead_pos_t,
+            torch.full((N,), DRONE_WINGSPAN_M, device=DEVICE),
+            torch.full((N,), DRONE_WEIGHT_N, device=DEVICE),
+            airspeed=AIRSPEED_MS, device=DEVICE,
+        )
+    true_upwash = upwash_t[:, 0].cpu().numpy()
+    # Load scaler stats
+    scaler_mean = scaler_var = None
+    if "state_preprocessor" in raw_ckpt:
+        sp = raw_ckpt["state_preprocessor"]
+        if "running_mean" in sp and "running_variance" in sp:
+            scaler_mean = sp["running_mean"][:obs_dim].to(DEVICE)
+            scaler_var = sp["running_variance"][:obs_dim].to(DEVICE)
+    def _normalize(x: torch.Tensor) -> torch.Tensor:
+        if scaler_mean is not None:
+            return ((x - scaler_mean) / (torch.sqrt(scaler_var) + 1e-8)).float()
+        return x.float()
+    obs_batch = torch.zeros(N, obs_dim, dtype=torch.float32, device=DEVICE)
+    obs_batch[:, 9]  = foll_pos_t[:, 0, 0] - lead_pos_t[:, 0]
+    obs_batch[:, 10] = foll_pos_t[:, 0, 1] - lead_pos_t[:, 1]
+    obs_batch[:, 11] = foll_pos_t[:, 0, 2] - lead_pos_t[:, 2]
+    obs_batch[:, 14] = foll_pos_t[:, 0, 0]
+    obs_batch[:, 15] = foll_pos_t[:, 0, 1]
+    obs_batch[:, 16] = foll_pos_t[:, 0, 2]
+    obs_norm = _normalize(obs_batch)
+    return obs_norm, true_upwash
+def main():
+    # ── Load checkpoint ───────────────────────────────────────────────────────
+    if not os.path.exists(CHECKPOINT_PATH):
+        raise FileNotFoundError(
+            f"Checkpoint not found: {CHECKPOINT_PATH}\n"
+            "Please ensure the formation flight training has completed for seed 42.\n"
+            "Run: python scripts/train_sac.py --seed 42  (or the PPO training script)"
+        )
+    print(f"Loading checkpoint: {CHECKPOINT_PATH}")
+    raw_ckpt = torch.load(CHECKPOINT_PATH, map_location=DEVICE, weights_only=False)
+    policy_sd = raw_ckpt.get("policy", raw_ckpt)
+    # Detect obs dim from checkpoint
+    w0 = policy_sd.get("net.0.weight")
+    obs_dim = int(w0.shape[1]) if w0 is not None else OBS_PER_FOLLOWER
+    print(f"Detected obs dim: {obs_dim}")
+    # ── Build actor network ───────────────────────────────────────────────────
+    actor_net = _build_actor_net(obs_dim, policy_sd)
+    # ── Build obs grid ────────────────────────────────────────────────────────
+    print("Building 80×120 observation grid (9600 points)...")
+    obs_norm, true_upwash = _build_obs_grid(obs_dim, raw_ckpt)
+    # ── Run LinearProbe at Actor L5 ───────────────────────────────────────────
+    print(f"Running LinearProbe at layer {LAYER_IDX}...")
+    probe = LinearProbe()
+    probe.fit(actor_net, LAYER_IDX, obs_norm, true_upwash, test_size=0.2, seed=42)
+    r2 = probe.score()
+    print(f"\nActor L5 Ridge R² = {r2:.4f}")
+    print(f"Expected R²       = {EXPECTED_R2:.4f}")
+    print(f"Difference        = {abs(r2 - EXPECTED_R2):.4f}")
+    if abs(r2 - EXPECTED_R2) <= TOLERANCE:
+        print(f"✓ R² matches expected value within tolerance ({TOLERANCE})")
+    else:
+        print(f"✗ WARNING: R² differs from expected by more than {TOLERANCE}")
+    return r2
+# Run the example only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()