ai-metacognition-toolkit 0.3.0 (ai_metacognition_toolkit-0.3.0-py3-none-any.whl)

This diff shows the contents of publicly released package versions as they appear in the supported public registries. It is provided for informational purposes only.
Files changed (30)
  1. ai_metacognition/__init__.py +123 -0
  2. ai_metacognition/analyzers/__init__.py +24 -0
  3. ai_metacognition/analyzers/base.py +39 -0
  4. ai_metacognition/analyzers/counterfactual_cot.py +579 -0
  5. ai_metacognition/analyzers/model_api.py +39 -0
  6. ai_metacognition/detectors/__init__.py +40 -0
  7. ai_metacognition/detectors/base.py +42 -0
  8. ai_metacognition/detectors/observer_effect.py +651 -0
  9. ai_metacognition/detectors/sandbagging_detector.py +1438 -0
  10. ai_metacognition/detectors/situational_awareness.py +526 -0
  11. ai_metacognition/integrations/__init__.py +16 -0
  12. ai_metacognition/integrations/anthropic_api.py +230 -0
  13. ai_metacognition/integrations/base.py +113 -0
  14. ai_metacognition/integrations/openai_api.py +300 -0
  15. ai_metacognition/probing/__init__.py +24 -0
  16. ai_metacognition/probing/extraction.py +176 -0
  17. ai_metacognition/probing/hooks.py +200 -0
  18. ai_metacognition/probing/probes.py +186 -0
  19. ai_metacognition/probing/vectors.py +133 -0
  20. ai_metacognition/utils/__init__.py +48 -0
  21. ai_metacognition/utils/feature_extraction.py +534 -0
  22. ai_metacognition/utils/statistical_tests.py +317 -0
  23. ai_metacognition/utils/text_processing.py +98 -0
  24. ai_metacognition/visualizations/__init__.py +22 -0
  25. ai_metacognition/visualizations/plotting.py +523 -0
  26. ai_metacognition_toolkit-0.3.0.dist-info/METADATA +621 -0
  27. ai_metacognition_toolkit-0.3.0.dist-info/RECORD +30 -0
  28. ai_metacognition_toolkit-0.3.0.dist-info/WHEEL +5 -0
  29. ai_metacognition_toolkit-0.3.0.dist-info/licenses/LICENSE +21 -0
  30. ai_metacognition_toolkit-0.3.0.dist-info/top_level.txt +1 -0
ai_metacognition/integrations/anthropic_api.py
@@ -0,0 +1,230 @@
+ """Anthropic Claude API integration for sandbagging detection.
+
+ This module provides an implementation of the ModelAPI protocol
+ for Anthropic's Claude models, enabling sandbagging detection
+ experiments with Claude.
+
+ Requires the anthropic package: pip install anthropic
+ Set the ANTHROPIC_API_KEY environment variable for authentication.
+ """
+
+ import os
+ import random
+ import time
+ from typing import Any, Dict, List, Optional
+
+ from ai_metacognition.integrations.base import ModelResponse
+
+
+ class AnthropicModelAPI:
+     """Anthropic Claude API implementation.
+
+     This class provides integration with Anthropic's Claude models for
+     use in sandbagging detection experiments. It conforms to the
+     ModelAPI protocol defined in ai_metacognition.integrations.base.
+
+     Attributes:
+         model: The Claude model to use (e.g., "claude-sonnet-4-20250514")
+         max_tokens: Maximum tokens to generate
+         temperature: Sampling temperature
+
+     Example:
+         >>> api = AnthropicModelAPI(model="claude-sonnet-4-20250514")
+         >>> response = api.generate("What is 2+2?")
+         >>> print(response)
+         "The answer is 4."
+     """
+
+     def __init__(
+         self,
+         model: str = "claude-sonnet-4-20250514",
+         max_tokens: int = 1024,
+         temperature: float = 0.7,
+         api_key: Optional[str] = None,
+     ) -> None:
+         """Initialize the Anthropic API client.
+
+         Args:
+             model: Claude model identifier
+             max_tokens: Maximum tokens to generate
+             temperature: Sampling temperature (0.0-1.0)
+             api_key: API key (defaults to the ANTHROPIC_API_KEY env var)
+
+         Raises:
+             ValueError: If no API key is provided or found
+         """
+         self.model = model
+         self.max_tokens = max_tokens
+         self.temperature = temperature
+
+         # Resolve the API key eagerly so misconfiguration fails fast.
+         self._api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
+         if not self._api_key:
+             raise ValueError(
+                 "Anthropic API key required. Set the ANTHROPIC_API_KEY "
+                 "environment variable or pass the api_key parameter."
+             )
+
+         # The client is created lazily so importing this module does not
+         # require the anthropic package to be installed.
+         self._client: Optional[Any] = None
+
+     def _get_client(self) -> Any:
+         """Get or create the Anthropic client (lazy init).
+
+         Raises:
+             ImportError: If the anthropic package is not installed
+         """
+         if self._client is None:
+             try:
+                 import anthropic
+             except ImportError as exc:
+                 raise ImportError(
+                     "anthropic package required. Install with: pip install anthropic"
+                 ) from exc
+             self._client = anthropic.Anthropic(api_key=self._api_key)
+         return self._client
+
+     def generate(self, prompt: str, **kwargs: Any) -> str:
+         """Generate a response from Claude.
+
+         Args:
+             prompt: The input prompt
+             **kwargs: Additional parameters (temperature, max_tokens, system)
+
+         Returns:
+             The model's text response
+         """
+         client = self._get_client()
+
+         # Per-call overrides fall back to the instance defaults.
+         temperature = kwargs.get("temperature", self.temperature)
+         max_tokens = kwargs.get("max_tokens", self.max_tokens)
+         system = kwargs.get("system", "You are a helpful assistant.")
+
+         response = client.messages.create(
+             model=self.model,
+             max_tokens=max_tokens,
+             temperature=temperature,
+             system=system,
+             messages=[{"role": "user", "content": prompt}],
+         )
+
+         # Extract the text of the first content block, if any.
+         return response.content[0].text if response.content else ""
+
+     def generate_with_perturbation(
+         self, prompt: str, noise_level: float = 0.1, **kwargs: Any
+     ) -> str:
+         """Generate a response with noise injection.
+
+         Applies perturbation through:
+         1. Temperature scaling based on the noise level
+         2. Prompt perturbation at higher noise levels
+
+         Args:
+             prompt: The input prompt
+             noise_level: Perturbation level (0.0-1.0)
+             **kwargs: Additional generation parameters
+
+         Returns:
+             The perturbed response
+         """
+         # Scale temperature with the noise level, capped at the Anthropic
+         # API maximum of 1.0.
+         base_temp = kwargs.get("temperature", self.temperature)
+         perturbed_temp = min(base_temp + noise_level * 0.5, 1.0)
+
+         # Apply prompt perturbation only at higher noise levels.
+         perturbed_prompt = prompt
+         if noise_level > 0.3:
+             perturbed_prompt = self._perturb_prompt(prompt, noise_level)
+
+         return self.generate(
+             perturbed_prompt,
+             temperature=perturbed_temp,
+             **{k: v for k, v in kwargs.items() if k != "temperature"},
+         )
+
+     def _perturb_prompt(self, prompt: str, noise_level: float) -> str:
+         """Add slight surface noise to a prompt to disrupt sandbagging.
+
+         The perturbations are deliberately crude (doubled punctuation,
+         filler prefixes and suffixes): they vary the surface form of the
+         prompt without changing the underlying task.
+         """
+         perturbations = [
+             lambda p: p.replace(".", ".."),
+             lambda p: "Um, " + p,
+             lambda p: p + " Please respond.",
+             lambda p: "I was wondering, " + p.lower(),
+         ]
+
+         # The probability of perturbing scales with the noise level.
+         if random.random() < noise_level:
+             perturbation = random.choice(perturbations)
+             return perturbation(prompt)
+
+         return prompt
+
+     def generate_batch(self, prompts: List[str], **kwargs: Any) -> List[str]:
+         """Generate responses for multiple prompts, sequentially."""
+         return [self.generate(prompt, **kwargs) for prompt in prompts]
+
+     def get_model_info(self) -> Dict[str, Any]:
+         """Get information about the configured model.
+
+         Returns:
+             Dictionary with model metadata
+         """
+         return {
+             "provider": "anthropic",
+             "model": self.model,
+             "max_tokens": self.max_tokens,
+             "temperature": self.temperature,
+             "capabilities": ["text_generation", "analysis", "coding"],
+         }
+
+     def generate_with_response(self, prompt: str, **kwargs: Any) -> ModelResponse:
+         """Generate a response with full metadata.
+
+         Args:
+             prompt: The input prompt
+             **kwargs: Additional parameters
+
+         Returns:
+             ModelResponse with text, token usage, and latency
+         """
+         client = self._get_client()
+
+         temperature = kwargs.get("temperature", self.temperature)
+         max_tokens = kwargs.get("max_tokens", self.max_tokens)
+         system = kwargs.get("system", "You are a helpful assistant.")
+
+         start_time = time.time()
+
+         response = client.messages.create(
+             model=self.model,
+             max_tokens=max_tokens,
+             temperature=temperature,
+             system=system,
+             messages=[{"role": "user", "content": prompt}],
+         )
+
+         latency_ms = (time.time() - start_time) * 1000
+
+         text = response.content[0].text if response.content else ""
+
+         return ModelResponse(
+             text=text,
+             model=self.model,
+             usage={
+                 "input_tokens": response.usage.input_tokens,
+                 "output_tokens": response.usage.output_tokens,
+             },
+             latency_ms=latency_ms,
+             metadata={
+                 "stop_reason": response.stop_reason,
+                 "model": response.model,
+             },
+         )
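For orientation, the two generation modes above are what the detectors compare: the working hypothesis behind noise-injection probing is that surface perturbation can raise a sandbagger's score on tasks it is deliberately failing. A minimal sketch of such a probe loop, not part of the package; the prompts list and the score(prompt, answer) grader are hypothetical stand-ins supplied by the caller:

    # Hypothetical probe loop; assumes ANTHROPIC_API_KEY is set.
    from ai_metacognition.integrations.anthropic_api import AnthropicModelAPI

    def noise_probe(api, prompts, score, noise_level=0.5):
        baseline = [score(p, api.generate(p)) for p in prompts]
        perturbed = [
            score(p, api.generate_with_perturbation(p, noise_level=noise_level))
            for p in prompts
        ]
        n = len(prompts)
        # A positive gap (noise helps) is the sandbagging red flag.
        return sum(perturbed) / n - sum(baseline) / n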
ai_metacognition/integrations/base.py
@@ -0,0 +1,113 @@
+ """Base classes and protocols for model API integrations.
+
+ This module defines the interface that all model API implementations
+ must follow for compatibility with the sandbagging detection framework.
+ """
+
+ from dataclasses import dataclass, field
+ from typing import Any, Dict, List, Protocol, runtime_checkable
+
+
+ @dataclass
+ class ModelResponse:
+     """Response from a model API call.
+
+     Attributes:
+         text: The generated text response
+         model: The model identifier used
+         usage: Token usage statistics
+         latency_ms: Response latency in milliseconds
+         metadata: Additional provider-specific metadata
+     """
+
+     text: str
+     model: str
+     usage: Dict[str, int] = field(default_factory=dict)
+     latency_ms: float = 0.0
+     metadata: Dict[str, Any] = field(default_factory=dict)
+
+
+ @runtime_checkable
+ class ModelAPI(Protocol):
+     """Protocol defining the interface for model API integrations.
+
+     All model API implementations should conform to this protocol
+     to ensure compatibility with the sandbagging detection framework.
+
+     The protocol defines two main methods:
+     - generate: Standard text generation
+     - generate_with_perturbation: Generation with noise injection
+
+     Example:
+         >>> class MyModelAPI:
+         ...     def generate(self, prompt: str, **kwargs) -> str:
+         ...         # Implementation here
+         ...         return "response"
+         ...
+         ...     def generate_with_perturbation(
+         ...         self, prompt: str, noise_level: float = 0.1, **kwargs
+         ...     ) -> str:
+         ...         # Implementation with perturbation
+         ...         return "perturbed response"
+     """
+
+     def generate(self, prompt: str, **kwargs: Any) -> str:
+         """Generate a response from the model.
+
+         Args:
+             prompt: The input prompt to send to the model
+             **kwargs: Additional generation parameters (temperature,
+                 max_tokens, etc.)
+
+         Returns:
+             The model's text response
+         """
+         ...
+
+     def generate_with_perturbation(
+         self, prompt: str, noise_level: float = 0.1, **kwargs: Any
+     ) -> str:
+         """Generate a response with noise injection.
+
+         This method is used for noise-injection probing in sandbagging
+         detection. Implementations should apply perturbation at the
+         specified level.
+
+         Perturbation strategies may include:
+         - Temperature scaling
+         - Prompt perturbation (typos, rephrasing)
+         - Token sampling modifications
+         - Logit noise injection (if accessible)
+
+         Args:
+             prompt: The input prompt
+             noise_level: Level of perturbation (0.0 = none, 1.0 = maximum)
+             **kwargs: Additional generation parameters
+
+         Returns:
+             The model's response with perturbation applied
+         """
+         ...
+
+     def generate_batch(self, prompts: List[str], **kwargs: Any) -> List[str]:
+         """Generate responses for multiple prompts.
+
+         Implementations typically call generate() once per prompt, and
+         may depart from that pattern for batch optimization.
+
+         Args:
+             prompts: List of input prompts
+             **kwargs: Additional generation parameters
+
+         Returns:
+             List of model responses
+         """
+         ...
+
+     def get_model_info(self) -> Dict[str, Any]:
+         """Get information about the configured model.
+
+         Returns:
+             Dictionary with model metadata (name, version, capabilities)
+         """
+         ...
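Because ModelAPI is a runtime_checkable Protocol, conformance is structural: any class defining these four methods passes an isinstance() check without inheriting from anything. A minimal sketch with a hypothetical offline stub (isinstance() only verifies method presence, not signatures):

    from ai_metacognition.integrations.base import ModelAPI

    class DummyModelAPI:
        """Deterministic stub for testing detector pipelines offline."""

        def generate(self, prompt, **kwargs):
            return "response"

        def generate_with_perturbation(self, prompt, noise_level=0.1, **kwargs):
            return "perturbed response"

        def generate_batch(self, prompts, **kwargs):
            return [self.generate(p, **kwargs) for p in prompts]

        def get_model_info(self):
            return {"provider": "dummy", "model": "stub"}

    assert isinstance(DummyModelAPI(), ModelAPI)  # structural check passes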
ai_metacognition/integrations/openai_api.py
@@ -0,0 +1,300 @@
+ """OpenAI API integration for sandbagging detection.
+
+ This module provides an implementation of the ModelAPI protocol
+ for OpenAI's GPT models, enabling sandbagging detection experiments.
+
+ Requires the openai package: pip install openai
+ Set the OPENAI_API_KEY environment variable for authentication.
+ """
+
+ import os
+ import random
+ import time
+ from typing import Any, Dict, List, Optional
+
+ from ai_metacognition.integrations.base import ModelResponse
+
+
+ class OpenAIModelAPI:
+     """OpenAI GPT API implementation.
+
+     This class provides integration with OpenAI's GPT models for use
+     in sandbagging detection experiments. It conforms to the ModelAPI
+     protocol defined in ai_metacognition.integrations.base.
+
+     Attributes:
+         model: The GPT model to use (e.g., "gpt-4", "gpt-4-turbo")
+         max_tokens: Maximum tokens to generate
+         temperature: Sampling temperature
+
+     Example:
+         >>> api = OpenAIModelAPI(model="gpt-4")
+         >>> response = api.generate("What is 2+2?")
+         >>> print(response)
+         "The answer is 4."
+     """
+
+     def __init__(
+         self,
+         model: str = "gpt-4",
+         max_tokens: int = 1024,
+         temperature: float = 0.7,
+         api_key: Optional[str] = None,
+     ) -> None:
+         """Initialize the OpenAI API client.
+
+         Args:
+             model: GPT model identifier
+             max_tokens: Maximum tokens to generate
+             temperature: Sampling temperature (0.0-2.0)
+             api_key: API key (defaults to the OPENAI_API_KEY env var)
+
+         Raises:
+             ValueError: If no API key is provided or found
+         """
+         self.model = model
+         self.max_tokens = max_tokens
+         self.temperature = temperature
+
+         # Resolve the API key eagerly so misconfiguration fails fast.
+         self._api_key = api_key or os.environ.get("OPENAI_API_KEY")
+         if not self._api_key:
+             raise ValueError(
+                 "OpenAI API key required. Set the OPENAI_API_KEY environment "
+                 "variable or pass the api_key parameter."
+             )
+
+         # The client is created lazily so importing this module does not
+         # require the openai package to be installed.
+         self._client: Optional[Any] = None
+
+     def _get_client(self) -> Any:
+         """Get or create the OpenAI client (lazy init).
+
+         Raises:
+             ImportError: If the openai package is not installed
+         """
+         if self._client is None:
+             try:
+                 import openai
+             except ImportError as exc:
+                 raise ImportError(
+                     "openai package required. Install with: pip install openai"
+                 ) from exc
+             self._client = openai.OpenAI(api_key=self._api_key)
+         return self._client
+
+     def generate(self, prompt: str, **kwargs: Any) -> str:
+         """Generate a response from GPT.
+
+         Args:
+             prompt: The input prompt
+             **kwargs: Additional parameters (temperature, max_tokens,
+                 system, top_p)
+
+         Returns:
+             The model's text response
+         """
+         client = self._get_client()
+
+         # Per-call overrides fall back to the instance defaults.
+         temperature = kwargs.get("temperature", self.temperature)
+         max_tokens = kwargs.get("max_tokens", self.max_tokens)
+         system = kwargs.get("system", "You are a helpful assistant.")
+
+         # Forward optional sampling parameters (e.g. the top_p set by
+         # generate_with_perturbation) when supplied, so they are not
+         # silently dropped.
+         extra: Dict[str, Any] = {}
+         if "top_p" in kwargs:
+             extra["top_p"] = kwargs["top_p"]
+
+         messages = [
+             {"role": "system", "content": system},
+             {"role": "user", "content": prompt},
+         ]
+
+         response = client.chat.completions.create(
+             model=self.model,
+             messages=messages,
+             max_tokens=max_tokens,
+             temperature=temperature,
+             **extra,
+         )
+
+         # message.content may be None, so normalize to an empty string.
+         text = response.choices[0].message.content if response.choices else ""
+         return text or ""
+
+     def generate_with_perturbation(
+         self, prompt: str, noise_level: float = 0.1, **kwargs: Any
+     ) -> str:
+         """Generate a response with noise injection.
+
+         Applies perturbation through:
+         1. Temperature scaling based on the noise level
+         2. Top-p sampling adjustment
+         3. Prompt perturbation at higher noise levels
+
+         Args:
+             prompt: The input prompt
+             noise_level: Perturbation level (0.0-1.0)
+             **kwargs: Additional generation parameters
+
+         Returns:
+             The perturbed response
+         """
+         # Scale temperature with the noise level, capped at the OpenAI
+         # API maximum of 2.0.
+         base_temp = kwargs.get("temperature", self.temperature)
+         perturbed_temp = min(base_temp + noise_level * 0.8, 2.0)
+
+         # Narrow top_p for additional sampling variation, floored at 0.5.
+         top_p = max(0.5, 1.0 - noise_level * 0.3)
+
+         # Apply prompt perturbation only at higher noise levels.
+         perturbed_prompt = prompt
+         if noise_level > 0.3:
+             perturbed_prompt = self._perturb_prompt(prompt, noise_level)
+
+         return self.generate(
+             perturbed_prompt,
+             temperature=perturbed_temp,
+             top_p=top_p,
+             **{k: v for k, v in kwargs.items() if k not in ["temperature", "top_p"]},
+         )
+
+     def _perturb_prompt(self, prompt: str, noise_level: float) -> str:
+         """Add slight surface noise to a prompt.
+
+         The perturbations vary the surface form of the prompt
+         (punctuation, filler prefixes and suffixes) without changing
+         the underlying task.
+         """
+         perturbations = [
+             lambda p: p.replace(".", ".."),
+             lambda p: "Hey, " + p,
+             lambda p: p + " Be concise.",
+             lambda p: "Quick question: " + p.lower(),
+             lambda p: p.replace("?", "??"),
+         ]
+
+         # The probability of perturbing scales with the noise level.
+         if random.random() < noise_level:
+             perturbation = random.choice(perturbations)
+             return perturbation(prompt)
+
+         return prompt
+
+     def generate_batch(self, prompts: List[str], **kwargs: Any) -> List[str]:
+         """Generate responses for multiple prompts, sequentially."""
+         return [self.generate(prompt, **kwargs) for prompt in prompts]
+
+     def get_model_info(self) -> Dict[str, Any]:
+         """Get information about the configured model.
+
+         Returns:
+             Dictionary with model metadata
+         """
+         return {
+             "provider": "openai",
+             "model": self.model,
+             "max_tokens": self.max_tokens,
+             "temperature": self.temperature,
+             "capabilities": ["text_generation", "analysis", "coding", "reasoning"],
+         }
+
+     def generate_with_response(self, prompt: str, **kwargs: Any) -> ModelResponse:
+         """Generate a response with full metadata.
+
+         Args:
+             prompt: The input prompt
+             **kwargs: Additional parameters
+
+         Returns:
+             ModelResponse with text, token usage, and latency
+         """
+         client = self._get_client()
+
+         temperature = kwargs.get("temperature", self.temperature)
+         max_tokens = kwargs.get("max_tokens", self.max_tokens)
+         system = kwargs.get("system", "You are a helpful assistant.")
+
+         messages = [
+             {"role": "system", "content": system},
+             {"role": "user", "content": prompt},
+         ]
+
+         start_time = time.time()
+
+         response = client.chat.completions.create(
+             model=self.model,
+             messages=messages,
+             max_tokens=max_tokens,
+             temperature=temperature,
+         )
+
+         latency_ms = (time.time() - start_time) * 1000
+
+         text = response.choices[0].message.content if response.choices else ""
+
+         return ModelResponse(
+             text=text or "",
+             model=self.model,
+             usage={
+                 "prompt_tokens": response.usage.prompt_tokens if response.usage else 0,
+                 "completion_tokens": response.usage.completion_tokens if response.usage else 0,
+                 "total_tokens": response.usage.total_tokens if response.usage else 0,
+             },
+             latency_ms=latency_ms,
+             metadata={
+                 "finish_reason": response.choices[0].finish_reason if response.choices else None,
+                 "model": response.model,
+             },
+         )
+
+     def generate_with_logprobs(self, prompt: str, **kwargs: Any) -> Dict[str, Any]:
+         """Generate a response with token log probabilities.
+
+         Useful for analyzing model confidence and detecting unusual
+         token distributions that may indicate sandbagging.
+
+         Args:
+             prompt: The input prompt
+             **kwargs: Additional parameters
+
+         Returns:
+             Dictionary with text and log probabilities
+         """
+         client = self._get_client()
+
+         temperature = kwargs.get("temperature", self.temperature)
+         max_tokens = kwargs.get("max_tokens", self.max_tokens)
+         system = kwargs.get("system", "You are a helpful assistant.")
+
+         messages = [
+             {"role": "system", "content": system},
+             {"role": "user", "content": prompt},
+         ]
+
+         response = client.chat.completions.create(
+             model=self.model,
+             messages=messages,
+             max_tokens=max_tokens,
+             temperature=temperature,
+             logprobs=True,
+             top_logprobs=5,
+         )
+
+         choice = response.choices[0] if response.choices else None
+         text = choice.message.content if choice else ""
+
+         # Flatten the SDK's logprobs objects into plain dictionaries.
+         logprobs_data = None
+         if choice and choice.logprobs:
+             logprobs_data = {
+                 "tokens": [
+                     {
+                         "token": lp.token,
+                         "logprob": lp.logprob,
+                         "top_logprobs": [
+                             {"token": t.token, "logprob": t.logprob}
+                             for t in (lp.top_logprobs or [])
+                         ],
+                     }
+                     for lp in (choice.logprobs.content or [])
+                 ],
+             }
+
+         return {
+             "text": text or "",
+             "logprobs": logprobs_data,
+             "model": response.model,
+         }
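The logprobs output above lends itself to a simple confidence summary. A sketch that assumes only the dictionary shape returned by generate_with_logprobs; mean_token_confidence is an illustrative helper, not a package function:

    import math

    def mean_token_confidence(api, prompt):
        # Average per-token probability of the sampled response; unusually
        # low values on easy prompts can flag targeted underperformance.
        result = api.generate_with_logprobs(prompt)
        tokens = (result["logprobs"] or {}).get("tokens", [])
        if not tokens:
            return float("nan")
        return sum(math.exp(t["logprob"]) for t in tokens) / len(tokens)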
ai_metacognition/probing/__init__.py
@@ -0,0 +1,24 @@
+ """Activation probing for sandbagging detection.
+
+ This module provides tools for analyzing model activations to detect
+ sandbagging behavior at the representation level.
+
+ Key components:
+ - ActivationHook: Capture hidden states during a forward pass
+ - SteeringVector: Represent behavioral directions in activation space
+ - extract_caa_vector: Extract vectors using Contrastive Activation Addition
+ - LinearProbe: Train classifiers on activation patterns
+ """
+
+ from .hooks import ActivationHook
+ from .vectors import SteeringVector
+ from .extraction import extract_caa_vector, extract_activations
+ from .probes import LinearProbe
+
+ __all__ = [
+     "ActivationHook",
+     "SteeringVector",
+     "extract_caa_vector",
+     "extract_activations",
+     "LinearProbe",
+ ]
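For context, Contrastive Activation Addition derives a steering direction from the mean activation difference over matched positive/negative prompt pairs. A conceptual NumPy sketch of that core computation, assuming activations have already been captured (e.g., via ActivationHook); this illustrates the technique, not the package's actual extract_caa_vector implementation:

    import numpy as np

    def caa_direction(pos_acts, neg_acts):
        # pos_acts, neg_acts: (n_pairs, hidden_dim) activations for the
        # contrastive prompt pairs, row-aligned by pair.
        diff = (pos_acts - neg_acts).mean(axis=0)
        return diff / np.linalg.norm(diff)  # unit-norm steering direction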