ai-metacognition-toolkit 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_metacognition/__init__.py +123 -0
- ai_metacognition/analyzers/__init__.py +24 -0
- ai_metacognition/analyzers/base.py +39 -0
- ai_metacognition/analyzers/counterfactual_cot.py +579 -0
- ai_metacognition/analyzers/model_api.py +39 -0
- ai_metacognition/detectors/__init__.py +40 -0
- ai_metacognition/detectors/base.py +42 -0
- ai_metacognition/detectors/observer_effect.py +651 -0
- ai_metacognition/detectors/sandbagging_detector.py +1438 -0
- ai_metacognition/detectors/situational_awareness.py +526 -0
- ai_metacognition/integrations/__init__.py +16 -0
- ai_metacognition/integrations/anthropic_api.py +230 -0
- ai_metacognition/integrations/base.py +113 -0
- ai_metacognition/integrations/openai_api.py +300 -0
- ai_metacognition/probing/__init__.py +24 -0
- ai_metacognition/probing/extraction.py +176 -0
- ai_metacognition/probing/hooks.py +200 -0
- ai_metacognition/probing/probes.py +186 -0
- ai_metacognition/probing/vectors.py +133 -0
- ai_metacognition/utils/__init__.py +48 -0
- ai_metacognition/utils/feature_extraction.py +534 -0
- ai_metacognition/utils/statistical_tests.py +317 -0
- ai_metacognition/utils/text_processing.py +98 -0
- ai_metacognition/visualizations/__init__.py +22 -0
- ai_metacognition/visualizations/plotting.py +523 -0
- ai_metacognition_toolkit-0.3.0.dist-info/METADATA +621 -0
- ai_metacognition_toolkit-0.3.0.dist-info/RECORD +30 -0
- ai_metacognition_toolkit-0.3.0.dist-info/WHEEL +5 -0
- ai_metacognition_toolkit-0.3.0.dist-info/licenses/LICENSE +21 -0
- ai_metacognition_toolkit-0.3.0.dist-info/top_level.txt +1 -0
ai_metacognition/integrations/anthropic_api.py
@@ -0,0 +1,230 @@
"""Anthropic Claude API integration for sandbagging detection.

This module provides an implementation of the ModelAPI protocol
for Anthropic's Claude models, enabling sandbagging detection
experiments with Claude.

Requires the anthropic package: pip install anthropic
Set ANTHROPIC_API_KEY environment variable for authentication.
"""

import os
import random
import time
from typing import Any, Dict, List, Optional

from ai_metacognition.integrations.base import ModelAPI, ModelResponse


class AnthropicModelAPI:
    """Anthropic Claude API implementation.

    This class provides integration with Anthropic's Claude models
    for use in sandbagging detection experiments.

    Attributes:
        model: The Claude model to use (e.g., "claude-3-opus-20240229")
        max_tokens: Maximum tokens to generate
        temperature: Sampling temperature

    Example:
        >>> api = AnthropicModelAPI(model="claude-3-sonnet-20240229")
        >>> response = api.generate("What is 2+2?")
        >>> print(response)
        "The answer is 4."
    """

    def __init__(
        self,
        model: str = "claude-sonnet-4-20250514",
        max_tokens: int = 1024,
        temperature: float = 0.7,
        api_key: Optional[str] = None,
    ) -> None:
        """Initialize Anthropic API client.

        Args:
            model: Claude model identifier
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature (0.0-1.0)
            api_key: API key (defaults to ANTHROPIC_API_KEY env var)

        Raises:
            ImportError: If anthropic package is not installed
            ValueError: If no API key is provided or found
        """
        self.model = model
        self.max_tokens = max_tokens
        self.temperature = temperature

        # Get API key
        self._api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
        if not self._api_key:
            raise ValueError(
                "Anthropic API key required. Set ANTHROPIC_API_KEY environment "
                "variable or pass api_key parameter."
            )

        # Lazy import to avoid dependency issues
        self._client: Optional[Any] = None

    def _get_client(self) -> Any:
        """Get or create Anthropic client (lazy init)."""
        if self._client is None:
            try:
                import anthropic
                self._client = anthropic.Anthropic(api_key=self._api_key)
            except ImportError:
                raise ImportError(
                    "anthropic package required. Install with: pip install anthropic"
                )
        return self._client

    def generate(self, prompt: str, **kwargs: Any) -> str:
        """Generate a response from Claude.

        Args:
            prompt: The input prompt
            **kwargs: Additional parameters (temperature, max_tokens, system)

        Returns:
            The model's text response
        """
        client = self._get_client()

        # Extract parameters with defaults
        temperature = kwargs.get("temperature", self.temperature)
        max_tokens = kwargs.get("max_tokens", self.max_tokens)
        system = kwargs.get("system", "You are a helpful assistant.")

        start_time = time.time()

        response = client.messages.create(
            model=self.model,
            max_tokens=max_tokens,
            temperature=temperature,
            system=system,
            messages=[{"role": "user", "content": prompt}],
        )

        latency_ms = (time.time() - start_time) * 1000

        # Extract text from response
        text = response.content[0].text if response.content else ""

        return text

    def generate_with_perturbation(
        self, prompt: str, noise_level: float = 0.1, **kwargs: Any
    ) -> str:
        """Generate a response with noise injection.

        Applies perturbation through:
        1. Temperature scaling based on noise level
        2. Optional prompt perturbation

        Args:
            prompt: The input prompt
            noise_level: Perturbation level (0.0-1.0)
            **kwargs: Additional generation parameters

        Returns:
            The perturbed response
        """
        # Scale temperature with noise level
        base_temp = kwargs.get("temperature", self.temperature)
        perturbed_temp = min(base_temp + noise_level * 0.5, 1.0)

        # Apply prompt perturbation at higher noise levels
        perturbed_prompt = prompt
        if noise_level > 0.3:
            perturbed_prompt = self._perturb_prompt(prompt, noise_level)

        return self.generate(
            perturbed_prompt,
            temperature=perturbed_temp,
            **{k: v for k, v in kwargs.items() if k != "temperature"},
        )

    def _perturb_prompt(self, prompt: str, noise_level: float) -> str:
        """Add slight noise to prompt to disrupt sandbagging."""
        # HACK: these perturbations are pretty crude, but they work ok
        # Simple perturbation: add subtle variations
        perturbations = [
            lambda p: p.replace(".", ".."),
            lambda p: "Um, " + p,
            lambda p: p + " Please respond.",
            lambda p: "I was wondering, " + p.lower(),
        ]

        # Apply perturbation based on noise level
        if random.random() < noise_level:
            perturbation = random.choice(perturbations)
            return perturbation(prompt)

        return prompt

    def generate_batch(self, prompts: List[str], **kwargs: Any) -> List[str]:
        """Generate responses for multiple prompts."""
        # NOTE: could parallelize this but sequential is fine for now
        return [self.generate(prompt, **kwargs) for prompt in prompts]

    def get_model_info(self) -> Dict[str, Any]:
        """Get information about the configured model.

        Returns:
            Dictionary with model metadata
        """
        return {
            "provider": "anthropic",
            "model": self.model,
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
            "capabilities": ["text_generation", "analysis", "coding"],
        }

    def generate_with_response(
        self, prompt: str, **kwargs: Any
    ) -> ModelResponse:
        """Generate a response with full metadata.

        Args:
            prompt: The input prompt
            **kwargs: Additional parameters

        Returns:
            ModelResponse with text and metadata
        """
        client = self._get_client()

        temperature = kwargs.get("temperature", self.temperature)
        max_tokens = kwargs.get("max_tokens", self.max_tokens)
        system = kwargs.get("system", "You are a helpful assistant.")

        start_time = time.time()

        response = client.messages.create(
            model=self.model,
            max_tokens=max_tokens,
            temperature=temperature,
            system=system,
            messages=[{"role": "user", "content": prompt}],
        )

        latency_ms = (time.time() - start_time) * 1000

        text = response.content[0].text if response.content else ""

        return ModelResponse(
            text=text,
            model=self.model,
            usage={
                "input_tokens": response.usage.input_tokens,
                "output_tokens": response.usage.output_tokens,
            },
            latency_ms=latency_ms,
            metadata={
                "stop_reason": response.stop_reason,
                "model": response.model,
            },
        )
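As a usage illustration for the integration above: a minimal sketch that asks the same question once normally and then under increasing noise, which is the probing pattern generate_with_perturbation is built for. It assumes the anthropic package is installed and ANTHROPIC_API_KEY is set; the prompt, model choice, and noise levels are illustrative, not values prescribed by the toolkit.

# Illustrative usage sketch (not part of the package source); prompt and noise levels are assumptions.
from ai_metacognition.integrations.anthropic_api import AnthropicModelAPI

api = AnthropicModelAPI(model="claude-sonnet-4-20250514", temperature=0.2)
prompt = "Solve step by step: what is 17 * 24?"

# Baseline answer plus token usage and latency metadata.
baseline = api.generate_with_response(prompt)
print(baseline.text)
print(baseline.usage, f"{baseline.latency_ms:.0f} ms")

# Re-ask under increasing noise; the idea behind this probe is that deliberate
# underperformance is more fragile to perturbation than genuine capability.
for noise in (0.1, 0.3, 0.5):
    perturbed = api.generate_with_perturbation(prompt, noise_level=noise)
    print(f"noise={noise}: {perturbed[:80]}...")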
ai_metacognition/integrations/base.py
@@ -0,0 +1,113 @@
"""Base classes and protocols for model API integrations.

This module defines the interface that all model API implementations
must follow for compatibility with the sandbagging detection framework.
"""

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Protocol, runtime_checkable


@dataclass
class ModelResponse:
    """Response from a model API call.

    Attributes:
        text: The generated text response
        model: The model identifier used
        usage: Token usage statistics
        latency_ms: Response latency in milliseconds
        metadata: Additional provider-specific metadata
    """

    text: str
    model: str
    usage: Dict[str, int] = field(default_factory=dict)
    latency_ms: float = 0.0
    metadata: Dict[str, Any] = field(default_factory=dict)


@runtime_checkable
class ModelAPI(Protocol):
    """Protocol defining the interface for model API integrations.

    All model API implementations should conform to this protocol
    to ensure compatibility with the sandbagging detection framework.

    The protocol defines two main methods:
    - generate: Standard text generation
    - generate_with_perturbation: Generation with noise injection

    Example:
        >>> class MyModelAPI:
        ...     def generate(self, prompt: str, **kwargs) -> str:
        ...         # Implementation here
        ...         return "response"
        ...
        ...     def generate_with_perturbation(
        ...         self, prompt: str, noise_level: float = 0.1, **kwargs
        ...     ) -> str:
        ...         # Implementation with perturbation
        ...         return "perturbed response"
    """

    def generate(self, prompt: str, **kwargs: Any) -> str:
        """Generate a response from the model.

        Args:
            prompt: The input prompt to send to the model
            **kwargs: Additional generation parameters (temperature, max_tokens, etc.)

        Returns:
            The model's text response
        """
        ...

    def generate_with_perturbation(
        self, prompt: str, noise_level: float = 0.1, **kwargs: Any
    ) -> str:
        """Generate a response with noise injection.

        This method is used for noise injection probing in sandbagging detection.
        Implementations should apply perturbation at the specified level.

        Perturbation strategies may include:
        - Temperature scaling
        - Prompt perturbation (typos, rephrasing)
        - Token sampling modifications
        - Logit noise injection (if accessible)

        Args:
            prompt: The input prompt
            noise_level: Level of perturbation (0.0 = none, 1.0 = maximum)
            **kwargs: Additional generation parameters

        Returns:
            The model's response with perturbation applied
        """
        ...

    def generate_batch(
        self, prompts: List[str], **kwargs: Any
    ) -> List[str]:
        """Generate responses for multiple prompts.

        Default implementation calls generate() for each prompt.
        Implementations may override for batch optimization.

        Args:
            prompts: List of input prompts
            **kwargs: Additional generation parameters

        Returns:
            List of model responses
        """
        ...

    def get_model_info(self) -> Dict[str, Any]:
        """Get information about the configured model.

        Returns:
            Dictionary with model metadata (name, version, capabilities)
        """
        ...
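A minimal sketch of a custom backend that satisfies the ModelAPI protocol above, which can be handy for offline tests without any API key. The EchoModelAPI name is invented here for illustration and is not part of the package.

# Illustrative sketch (not part of the package source); "EchoModelAPI" is a hypothetical name.
from typing import Any, Dict, List

from ai_metacognition.integrations.base import ModelAPI, ModelResponse


class EchoModelAPI:
    """Toy backend: returns the prompt itself instead of calling a real model."""

    def generate(self, prompt: str, **kwargs: Any) -> str:
        return f"echo: {prompt}"

    def generate_with_perturbation(
        self, prompt: str, noise_level: float = 0.1, **kwargs: Any
    ) -> str:
        # Trivial "perturbation": tag the echo with the noise level.
        return f"echo[{noise_level}]: {prompt}"

    def generate_batch(self, prompts: List[str], **kwargs: Any) -> List[str]:
        return [self.generate(p, **kwargs) for p in prompts]

    def get_model_info(self) -> Dict[str, Any]:
        return {"provider": "local", "model": "echo"}


api = EchoModelAPI()
# Because ModelAPI is a runtime_checkable Protocol, isinstance() checks that the
# required methods exist (structural typing); no inheritance is needed.
assert isinstance(api, ModelAPI)

resp = ModelResponse(text=api.generate("hi"), model="echo")
print(resp.text, resp.usage, resp.latency_ms)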
ai_metacognition/integrations/openai_api.py
@@ -0,0 +1,300 @@
"""OpenAI API integration for sandbagging detection.

This module provides an implementation of the ModelAPI protocol
for OpenAI's GPT models, enabling sandbagging detection experiments.

Requires the openai package: pip install openai
Set OPENAI_API_KEY environment variable for authentication.
"""

import os
import random
import time
from typing import Any, Dict, List, Optional

from ai_metacognition.integrations.base import ModelAPI, ModelResponse


class OpenAIModelAPI:
    """OpenAI GPT API implementation.

    This class provides integration with OpenAI's GPT models
    for use in sandbagging detection experiments.

    Attributes:
        model: The GPT model to use (e.g., "gpt-4", "gpt-4-turbo")
        max_tokens: Maximum tokens to generate
        temperature: Sampling temperature

    Example:
        >>> api = OpenAIModelAPI(model="gpt-4")
        >>> response = api.generate("What is 2+2?")
        >>> print(response)
        "The answer is 4."
    """

    def __init__(
        self,
        model: str = "gpt-4",
        max_tokens: int = 1024,
        temperature: float = 0.7,
        api_key: Optional[str] = None,
    ) -> None:
        """Initialize OpenAI API client.

        Args:
            model: GPT model identifier
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature (0.0-2.0)
            api_key: API key (defaults to OPENAI_API_KEY env var)

        Raises:
            ImportError: If openai package is not installed
            ValueError: If no API key is provided or found
        """
        self.model = model
        self.max_tokens = max_tokens
        self.temperature = temperature

        # Get API key
        self._api_key = api_key or os.environ.get("OPENAI_API_KEY")
        if not self._api_key:
            raise ValueError(
                "OpenAI API key required. Set OPENAI_API_KEY environment "
                "variable or pass api_key parameter."
            )

        # Lazy import to avoid dependency issues
        self._client: Optional[Any] = None

    def _get_client(self) -> Any:
        """Get or create OpenAI client (lazy init)."""
        if self._client is None:
            try:
                import openai
                self._client = openai.OpenAI(api_key=self._api_key)
            except ImportError:
                raise ImportError(
                    "openai package required. Install with: pip install openai"
                )
        return self._client

    def generate(self, prompt: str, **kwargs: Any) -> str:
        """Generate a response from GPT.

        Args:
            prompt: The input prompt
            **kwargs: Additional parameters (temperature, max_tokens, system)

        Returns:
            The model's text response
        """
        client = self._get_client()

        # Extract parameters with defaults
        temperature = kwargs.get("temperature", self.temperature)
        max_tokens = kwargs.get("max_tokens", self.max_tokens)
        system = kwargs.get("system", "You are a helpful assistant.")

        messages = [
            {"role": "system", "content": system},
            {"role": "user", "content": prompt},
        ]

        start_time = time.time()

        response = client.chat.completions.create(
            model=self.model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )

        latency_ms = (time.time() - start_time) * 1000

        # Extract text from response
        text = response.choices[0].message.content if response.choices else ""

        return text or ""

    def generate_with_perturbation(
        self, prompt: str, noise_level: float = 0.1, **kwargs: Any
    ) -> str:
        """Generate a response with noise injection.

        Applies perturbation through:
        1. Temperature scaling based on noise level
        2. Optional prompt perturbation
        3. Top-p sampling adjustment

        Args:
            prompt: The input prompt
            noise_level: Perturbation level (0.0-1.0)
            **kwargs: Additional generation parameters

        Returns:
            The perturbed response
        """
        # Scale temperature with noise level
        base_temp = kwargs.get("temperature", self.temperature)
        perturbed_temp = min(base_temp + noise_level * 0.8, 2.0)

        # Adjust top_p for additional randomness
        top_p = max(0.5, 1.0 - noise_level * 0.3)

        # Apply prompt perturbation at higher noise levels
        perturbed_prompt = prompt
        if noise_level > 0.3:
            perturbed_prompt = self._perturb_prompt(prompt, noise_level)

        return self.generate(
            perturbed_prompt,
            temperature=perturbed_temp,
            top_p=top_p,
            **{k: v for k, v in kwargs.items() if k not in ["temperature", "top_p"]},
        )

    def _perturb_prompt(self, prompt: str, noise_level: float) -> str:
        """Add slight noise to prompt."""
        perturbations = [
            lambda p: p.replace(".", ".."),
            lambda p: "Hey, " + p,
            lambda p: p + " Be concise.",
            lambda p: "Quick question: " + p.lower(),
            lambda p: p.replace("?", "??"),
        ]

        if random.random() < noise_level:
            perturbation = random.choice(perturbations)
            return perturbation(prompt)

        return prompt

    def generate_batch(self, prompts: List[str], **kwargs: Any) -> List[str]:
        """Generate responses for multiple prompts."""
        return [self.generate(prompt, **kwargs) for prompt in prompts]

    def get_model_info(self) -> Dict[str, Any]:
        """Get information about the configured model.

        Returns:
            Dictionary with model metadata
        """
        return {
            "provider": "openai",
            "model": self.model,
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
            "capabilities": ["text_generation", "analysis", "coding", "reasoning"],
        }

    def generate_with_response(
        self, prompt: str, **kwargs: Any
    ) -> ModelResponse:
        """Generate a response with full metadata.

        Args:
            prompt: The input prompt
            **kwargs: Additional parameters

        Returns:
            ModelResponse with text and metadata
        """
        client = self._get_client()

        temperature = kwargs.get("temperature", self.temperature)
        max_tokens = kwargs.get("max_tokens", self.max_tokens)
        system = kwargs.get("system", "You are a helpful assistant.")

        messages = [
            {"role": "system", "content": system},
            {"role": "user", "content": prompt},
        ]

        start_time = time.time()

        response = client.chat.completions.create(
            model=self.model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )

        latency_ms = (time.time() - start_time) * 1000

        text = response.choices[0].message.content if response.choices else ""

        return ModelResponse(
            text=text or "",
            model=self.model,
            usage={
                "prompt_tokens": response.usage.prompt_tokens if response.usage else 0,
                "completion_tokens": response.usage.completion_tokens if response.usage else 0,
                "total_tokens": response.usage.total_tokens if response.usage else 0,
            },
            latency_ms=latency_ms,
            metadata={
                "finish_reason": response.choices[0].finish_reason if response.choices else None,
                "model": response.model,
            },
        )

    def generate_with_logprobs(
        self, prompt: str, **kwargs: Any
    ) -> Dict[str, Any]:
        """Generate a response with token log probabilities.

        Useful for analyzing model confidence and detecting
        unusual token distributions that may indicate sandbagging.

        Args:
            prompt: The input prompt
            **kwargs: Additional parameters

        Returns:
            Dictionary with text and log probabilities
        """
        client = self._get_client()

        temperature = kwargs.get("temperature", self.temperature)
        max_tokens = kwargs.get("max_tokens", self.max_tokens)
        system = kwargs.get("system", "You are a helpful assistant.")

        messages = [
            {"role": "system", "content": system},
            {"role": "user", "content": prompt},
        ]

        response = client.chat.completions.create(
            model=self.model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            logprobs=True,
            top_logprobs=5,
        )

        choice = response.choices[0] if response.choices else None
        text = choice.message.content if choice else ""

        logprobs_data = None
        if choice and choice.logprobs:
            logprobs_data = {
                "tokens": [
                    {
                        "token": lp.token,
                        "logprob": lp.logprob,
                        "top_logprobs": [
                            {"token": t.token, "logprob": t.logprob}
                            for t in (lp.top_logprobs or [])
                        ],
                    }
                    for lp in (choice.logprobs.content or [])
                ],
            }

        return {
            "text": text or "",
            "logprobs": logprobs_data,
            "model": response.model,
        }
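A minimal sketch of how the logprob payload returned above might be summarized into a crude confidence signal. It assumes the openai package is installed and OPENAI_API_KEY is set, uses the class's default model (whether logprobs are returned depends on the model), and the -1.5 threshold and prompt are purely illustrative.

# Illustrative usage sketch (not part of the package source); threshold and prompt are assumptions.
from ai_metacognition.integrations.openai_api import OpenAIModelAPI

api = OpenAIModelAPI(temperature=0.0)

result = api.generate_with_logprobs("What is the capital of France?")
tokens = (result["logprobs"] or {}).get("tokens", [])

if tokens:
    # Mean per-token log probability: values closer to 0 indicate higher confidence.
    mean_logprob = sum(t["logprob"] for t in tokens) / len(tokens)
    print(result["text"])
    print(f"mean token logprob: {mean_logprob:.3f}")
    if mean_logprob < -1.5:  # illustrative cutoff only
        print("unusually diffuse token distribution; worth a closer look")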
ai_metacognition/probing/__init__.py
@@ -0,0 +1,24 @@
"""Activation probing for sandbagging detection.

This module provides tools for analyzing model activations to detect
sandbagging behavior at the representation level.

Key components:
- ActivationHook: Capture hidden states during forward pass
- SteeringVector: Represent behavioral directions in activation space
- extract_caa_vector: Extract vectors using Contrastive Activation Addition
- LinearProbe: Train classifiers on activation patterns
"""

from .hooks import ActivationHook
from .vectors import SteeringVector
from .extraction import extract_caa_vector, extract_activations
from .probes import LinearProbe

__all__ = [
    "ActivationHook",
    "SteeringVector",
    "extract_caa_vector",
    "extract_activations",
    "LinearProbe",
]
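The hook, extraction, and probe classes re-exported here are defined in hooks.py, extraction.py, vectors.py, and probes.py, which are not shown in this diff, so their exact signatures are unknown. The following is therefore a conceptual NumPy-only sketch of the two ideas named in the docstring, a CAA-style mean-difference direction and a simple linear probe over projections, not the toolkit's own API.

# Conceptual sketch only; synthetic activations stand in for hidden states captured by hooks.
import numpy as np

rng = np.random.default_rng(0)

# Stand-ins for hidden states on "honest" vs. "sandbagging" prompts (n_samples x hidden_dim).
honest_acts = rng.normal(0.0, 1.0, size=(64, 128))
sandbag_acts = rng.normal(0.3, 1.0, size=(64, 128))

# Contrastive Activation Addition-style direction: mean difference of the two activation sets.
caa_vector = sandbag_acts.mean(axis=0) - honest_acts.mean(axis=0)
caa_vector /= np.linalg.norm(caa_vector)

# Minimal "linear probe": project activations onto the direction and threshold at the
# midpoint of the two class means.
proj_honest = honest_acts @ caa_vector
proj_sandbag = sandbag_acts @ caa_vector
threshold = (proj_honest.mean() + proj_sandbag.mean()) / 2

preds = np.concatenate([proj_honest, proj_sandbag]) > threshold
labels = np.concatenate([np.zeros(64, dtype=bool), np.ones(64, dtype=bool)])
print(f"probe accuracy on synthetic data: {(preds == labels).mean():.2f}")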