wisent 0.5.13__py3-none-any.whl → 0.5.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wisent might be problematic. Click here for more details.
- wisent/__init__.py +1 -1
- wisent/cli.py +114 -0
- wisent/core/activations/activations_collector.py +19 -11
- wisent/core/agent/__init__.py +1 -18
- wisent/core/agent/diagnose/__init__.py +1 -55
- wisent/core/cli/__init__.py +3 -1
- wisent/core/cli/create_steering_vector.py +60 -18
- wisent/core/cli/evaluate_responses.py +14 -8
- wisent/core/cli/generate_pairs_from_task.py +18 -5
- wisent/core/cli/get_activations.py +1 -1
- wisent/core/cli/multi_steer.py +108 -0
- wisent/core/cli/optimize_classification.py +187 -285
- wisent/core/cli/optimize_sample_size.py +78 -0
- wisent/core/cli/optimize_steering.py +354 -53
- wisent/core/cli/tasks.py +274 -9
- wisent/core/errors/__init__.py +0 -0
- wisent/core/errors/error_handler.py +134 -0
- wisent/core/evaluators/benchmark_specific/log_likelihoods_evaluator.py +152 -295
- wisent/core/evaluators/rotator.py +22 -8
- wisent/core/main.py +5 -1
- wisent/core/model_persistence.py +4 -19
- wisent/core/models/wisent_model.py +11 -3
- wisent/core/parser.py +4 -3
- wisent/core/parser_arguments/main_parser.py +1 -1
- wisent/core/parser_arguments/multi_steer_parser.py +4 -3
- wisent/core/parser_arguments/optimize_steering_parser.py +4 -0
- wisent/core/sample_size_optimizer_v2.py +1 -1
- wisent/core/steering_optimizer.py +2 -2
- wisent/tests/__init__.py +0 -0
- wisent/tests/examples/__init__.py +0 -0
- wisent/tests/examples/cli/__init__.py +0 -0
- wisent/tests/examples/cli/activations/__init__.py +0 -0
- wisent/tests/examples/cli/activations/test_get_activations.py +127 -0
- wisent/tests/examples/cli/classifier/__init__.py +0 -0
- wisent/tests/examples/cli/classifier/test_classifier_examples.py +141 -0
- wisent/tests/examples/cli/contrastive_pairs/__init__.py +0 -0
- wisent/tests/examples/cli/contrastive_pairs/test_generate_pairs.py +89 -0
- wisent/tests/examples/cli/evaluation/__init__.py +0 -0
- wisent/tests/examples/cli/evaluation/test_evaluation_examples.py +117 -0
- wisent/tests/examples/cli/generate/__init__.py +0 -0
- wisent/tests/examples/cli/generate/test_generate_with_classifier.py +146 -0
- wisent/tests/examples/cli/generate/test_generate_with_steering.py +149 -0
- wisent/tests/examples/cli/generate/test_only_generate.py +110 -0
- wisent/tests/examples/cli/multi_steering/__init__.py +0 -0
- wisent/tests/examples/cli/multi_steering/test_multi_steer_from_trained_vectors.py +210 -0
- wisent/tests/examples/cli/multi_steering/test_multi_steer_with_different_parameters.py +205 -0
- wisent/tests/examples/cli/multi_steering/test_train_and_multi_steer.py +174 -0
- wisent/tests/examples/cli/optimizer/__init__.py +0 -0
- wisent/tests/examples/cli/optimizer/test_optimize_sample_size.py +102 -0
- wisent/tests/examples/cli/optimizer/test_optimizer_examples.py +59 -0
- wisent/tests/examples/cli/steering/__init__.py +0 -0
- wisent/tests/examples/cli/steering/test_create_steering_vectors.py +135 -0
- wisent/tests/examples/cli/synthetic/__init__.py +0 -0
- wisent/tests/examples/cli/synthetic/test_synthetic_pairs.py +45 -0
- {wisent-0.5.13.dist-info → wisent-0.5.15.dist-info}/METADATA +3 -1
- {wisent-0.5.13.dist-info → wisent-0.5.15.dist-info}/RECORD +61 -31
- wisent/core/agent/diagnose/test_synthetic_classifier.py +0 -71
- /wisent/core/parser_arguments/{test_nonsense_parser.py → nonsense_parser.py} +0 -0
- {wisent-0.5.13.dist-info → wisent-0.5.15.dist-info}/WHEEL +0 -0
- {wisent-0.5.13.dist-info → wisent-0.5.15.dist-info}/entry_points.txt +0 -0
- {wisent-0.5.13.dist-info → wisent-0.5.15.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.5.13.dist-info → wisent-0.5.15.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Test for multi_steer_with_different_parameters.sh example.
|
|
3
|
+
|
|
4
|
+
This test validates different parameter combinations for multi-steering:
|
|
5
|
+
normalized vs unnormalized weights, target norm scaling, and different methods.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import subprocess
|
|
9
|
+
import pytest
|
|
10
|
+
import tempfile
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def setup_test_vectors(tmpdir):
    """Helper function to create test vectors for multi-steering tests.

    Runs the ``generate-vector-from-task`` CLI command twice (trait labels
    ``test1`` and ``test2``) so that ``vector1.pt`` and ``vector2.pt`` are
    written into *tmpdir*.

    Args:
        tmpdir: Directory in which the two vector files are created.

    Returns:
        Tuple ``(vector1_path, vector2_path)``.

    Raises:
        AssertionError: If a CLI run exits non-zero or leaves no vector file.
        subprocess.TimeoutExpired: If a CLI run exceeds 300 seconds.
    """
    # Function-scope import: the module itself does not import sys.
    import sys

    vector_paths = []
    for index in (1, 2):
        vector_path = os.path.join(tmpdir, f"vector{index}.pt")

        result = subprocess.run(
            [
                # Use the interpreter that runs the tests; a bare "python"
                # may be absent from PATH or point at another environment.
                sys.executable, "-m", "wisent.core.main", "generate-vector-from-task",
                "--task", "boolq",
                "--trait-label", f"test{index}",
                "--output", vector_path,
                "--model", "meta-llama/Llama-3.2-1B-Instruct",
                "--num-pairs", "10",
                "--layers", "3",
                "--token-aggregation", "average",
                "--method", "caa",
                "--normalize",
                "--device", "cpu",
            ],
            capture_output=True,
            text=True,
            timeout=300,
        )

        # Fail fast: the second vector is only trained once the first succeeded.
        assert result.returncode == 0, f"Failed to create vector{index}: {result.stderr}"
        assert os.path.exists(vector_path), f"Vector{index} was not saved"
        vector_paths.append(vector_path)

    vector1_path, vector2_path = vector_paths
    return vector1_path, vector2_path
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_normalized_weights():
    """Test multi-steer with normalized weights (sum to 1.0).

    Trains two vectors with ``setup_test_vectors`` and combines them with
    weights 0.6 / 0.4 plus ``--normalize-weights``. The test passes when the
    ``multi-steer`` command exits with code 0.
    """
    # Function-scope import: the module itself does not import sys.
    import sys

    with tempfile.TemporaryDirectory() as tmpdir:
        vector1_path, vector2_path = setup_test_vectors(tmpdir)

        result = subprocess.run(
            [
                # Run the CLI under the same interpreter as the test session.
                sys.executable, "-m", "wisent.core.main", "multi-steer",
                "--vector", f"{vector1_path}:0.6",
                "--vector", f"{vector2_path}:0.4",
                "--model", "meta-llama/Llama-3.2-1B-Instruct",
                "--layer", "3",
                "--method", "CAA",
                "--prompt", "Explain AI.",
                "--max-new-tokens", "50",
                "--normalize-weights",
                "--device", "cpu",
                "--verbose",
            ],
            capture_output=True,
            text=True,
            timeout=180,
        )

        assert result.returncode == 0, f"Command failed: {result.stderr}"
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def test_unnormalized_weights():
    """Test multi-steer with unnormalized weights for stronger effect.

    Combines two freshly trained vectors with weights 2.0 / 1.5 and the
    ``--allow-unnormalized`` flag. The test passes when the ``multi-steer``
    command exits with code 0.
    """
    # Function-scope import: the module itself does not import sys.
    import sys

    with tempfile.TemporaryDirectory() as tmpdir:
        vector1_path, vector2_path = setup_test_vectors(tmpdir)

        result = subprocess.run(
            [
                # Run the CLI under the same interpreter as the test session.
                sys.executable, "-m", "wisent.core.main", "multi-steer",
                "--vector", f"{vector1_path}:2.0",
                "--vector", f"{vector2_path}:1.5",
                "--model", "meta-llama/Llama-3.2-1B-Instruct",
                "--layer", "3",
                "--method", "CAA",
                "--prompt", "Explain AI.",
                "--max-new-tokens", "50",
                "--allow-unnormalized",
                "--device", "cpu",
                "--verbose",
            ],
            capture_output=True,
            text=True,
            timeout=180,
        )

        assert result.returncode == 0, f"Command failed: {result.stderr}"
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def test_target_norm_scaling():
    """Test multi-steer with target norm scaling.

    Combines two freshly trained vectors with equal weights (0.5 / 0.5) and
    ``--target-norm 10.0``. The test passes when the ``multi-steer`` command
    exits with code 0.
    """
    # Function-scope import: the module itself does not import sys.
    import sys

    with tempfile.TemporaryDirectory() as tmpdir:
        vector1_path, vector2_path = setup_test_vectors(tmpdir)

        result = subprocess.run(
            [
                # Run the CLI under the same interpreter as the test session.
                sys.executable, "-m", "wisent.core.main", "multi-steer",
                "--vector", f"{vector1_path}:0.5",
                "--vector", f"{vector2_path}:0.5",
                "--model", "meta-llama/Llama-3.2-1B-Instruct",
                "--layer", "3",
                "--method", "CAA",
                "--prompt", "Explain AI.",
                "--max-new-tokens", "50",
                "--target-norm", "10.0",
                "--device", "cpu",
                "--verbose",
            ],
            capture_output=True,
            text=True,
            timeout=180,
        )

        assert result.returncode == 0, f"Command failed: {result.stderr}"
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def test_subtle_steering_with_small_norm():
    """Test subtle steering with small target norm.

    Same setup as the target-norm test but with ``--target-norm 2.0``. The
    test passes when the ``multi-steer`` command exits with code 0.
    """
    # Function-scope import: the module itself does not import sys.
    import sys

    with tempfile.TemporaryDirectory() as tmpdir:
        vector1_path, vector2_path = setup_test_vectors(tmpdir)

        result = subprocess.run(
            [
                # Run the CLI under the same interpreter as the test session.
                sys.executable, "-m", "wisent.core.main", "multi-steer",
                "--vector", f"{vector1_path}:0.5",
                "--vector", f"{vector2_path}:0.5",
                "--model", "meta-llama/Llama-3.2-1B-Instruct",
                "--layer", "3",
                "--method", "CAA",
                "--prompt", "Explain AI.",
                "--max-new-tokens", "50",
                "--target-norm", "2.0",
                "--device", "cpu",
                "--verbose",
            ],
            capture_output=True,
            text=True,
            timeout=180,
        )

        assert result.returncode == 0, f"Command failed: {result.stderr}"
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def test_save_combined_vector():
    """Test saving combined vector for reuse.

    Combines two freshly trained vectors with ``--normalize-weights`` and
    ``--save-combined``. The test passes when the command exits with code 0
    and the combined vector file exists afterwards.
    """
    # Function-scope import: the module itself does not import sys.
    import sys

    with tempfile.TemporaryDirectory() as tmpdir:
        vector1_path, vector2_path = setup_test_vectors(tmpdir)
        combined_path = os.path.join(tmpdir, "combined.pt")

        result = subprocess.run(
            [
                # Run the CLI under the same interpreter as the test session.
                sys.executable, "-m", "wisent.core.main", "multi-steer",
                "--vector", f"{vector1_path}:0.5",
                "--vector", f"{vector2_path}:0.5",
                "--model", "meta-llama/Llama-3.2-1B-Instruct",
                "--layer", "3",
                "--method", "CAA",
                "--prompt", "Explain AI.",
                "--max-new-tokens", "50",
                "--normalize-weights",
                "--save-combined", combined_path,
                "--device", "cpu",
                "--verbose",
            ],
            capture_output=True,
            text=True,
            timeout=180,
        )

        assert result.returncode == 0, f"Command failed: {result.stderr}"
        assert os.path.exists(combined_path), "Combined vector was not saved"
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
if __name__ == "__main__":
    # Propagate pytest's exit code so running this file directly reports
    # failure to the shell instead of always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Test for train_and_multi_steer.sh example.
|
|
3
|
+
|
|
4
|
+
This test validates training multiple steering vectors and combining them
|
|
5
|
+
with different weights during inference using the multi-steer command.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import subprocess
|
|
9
|
+
import pytest
|
|
10
|
+
import tempfile
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_train_multiple_vectors_and_combine():
    """Test training multiple vectors and combining them with multi-steer.

    Trains two vectors (trait labels ``truthfulness`` and ``helpfulness``)
    with ``generate-vector-from-task``, then combines them with equal weights
    via ``multi-steer --save-combined``. Checks that every command exits with
    code 0, that all three vector files exist, and that the combine step
    wrote something to stdout.
    """
    # Function-scope import: the module itself does not import sys.
    import sys

    with tempfile.TemporaryDirectory() as tmpdir:
        vector1_path = os.path.join(tmpdir, "vector1.pt")
        vector2_path = os.path.join(tmpdir, "vector2.pt")
        combined_path = os.path.join(tmpdir, "combined.pt")

        # Train both vectors; the runs differ only in trait label and output.
        training_jobs = (
            (1, "truthfulness", vector1_path),
            (2, "helpfulness", vector2_path),
        )
        for index, trait_label, vector_path in training_jobs:
            train_result = subprocess.run(
                [
                    # Run the CLI under the same interpreter as the tests.
                    sys.executable, "-m", "wisent.core.main", "generate-vector-from-task",
                    "--task", "boolq",
                    "--trait-label", trait_label,
                    "--output", vector_path,
                    "--model", "meta-llama/Llama-3.2-1B-Instruct",
                    "--num-pairs", "10",
                    "--layers", "3",
                    "--token-aggregation", "average",
                    "--method", "caa",
                    "--normalize",
                    "--device", "cpu",
                    "--verbose",
                ],
                capture_output=True,
                text=True,
                timeout=300,
            )

            assert train_result.returncode == 0, (
                f"Training vector{index} failed: {train_result.stderr}"
            )
            assert os.path.exists(vector_path), f"Vector{index} was not saved"

        # Combine vectors with equal weights
        combine_result = subprocess.run(
            [
                sys.executable, "-m", "wisent.core.main", "multi-steer",
                "--vector", f"{vector1_path}:0.5",
                "--vector", f"{vector2_path}:0.5",
                "--model", "meta-llama/Llama-3.2-1B-Instruct",
                "--layer", "3",
                "--method", "CAA",
                "--prompt", "What is AI?",
                "--max-new-tokens", "50",
                "--normalize-weights",
                "--save-combined", combined_path,
                "--device", "cpu",
                "--verbose",
            ],
            capture_output=True,
            text=True,
            timeout=180,
        )

        # Should complete without error
        assert combine_result.returncode == 0, f"Combining failed: {combine_result.stderr}"

        # Combined vector should be saved
        assert os.path.exists(combined_path), "Combined vector was not saved"

        # Output should contain generated text
        assert len(combine_result.stdout) > 0, "No output generated"
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def test_multi_steer_with_different_emphasis():
    """Test combining vectors with different weight emphasis.

    Trains two vectors (trait labels ``trait1`` and ``trait2``) and combines
    them with weights 0.7 / 0.3 plus ``--normalize-weights``. The test passes
    when every command exits with code 0.
    """
    # Function-scope import: the module itself does not import sys.
    import sys

    with tempfile.TemporaryDirectory() as tmpdir:
        vector1_path = os.path.join(tmpdir, "vector1.pt")
        vector2_path = os.path.join(tmpdir, "vector2.pt")

        # Train vectors (abbreviated for test)
        training_jobs = (
            (1, "trait1", vector1_path),
            (2, "trait2", vector2_path),
        )
        for index, trait_label, vector_path in training_jobs:
            train_result = subprocess.run(
                [
                    # Run the CLI under the same interpreter as the tests.
                    sys.executable, "-m", "wisent.core.main", "generate-vector-from-task",
                    "--task", "boolq",
                    "--trait-label", trait_label,
                    "--output", vector_path,
                    "--model", "meta-llama/Llama-3.2-1B-Instruct",
                    "--num-pairs", "10",
                    "--layers", "3",
                    "--token-aggregation", "average",
                    "--method", "caa",
                    "--normalize",
                    "--device", "cpu",
                ],
                capture_output=True,
                text=True,
                timeout=300,
            )

            # Include stderr so a training failure is diagnosable from the report.
            assert train_result.returncode == 0, (
                f"Training vector{index} failed: {train_result.stderr}"
            )

        # Emphasize first vector (70% vs 30%)
        combine_result = subprocess.run(
            [
                sys.executable, "-m", "wisent.core.main", "multi-steer",
                "--vector", f"{vector1_path}:0.7",
                "--vector", f"{vector2_path}:0.3",
                "--model", "meta-llama/Llama-3.2-1B-Instruct",
                "--layer", "3",
                "--method", "CAA",
                "--prompt", "Explain quantum computing.",
                "--max-new-tokens", "50",
                "--normalize-weights",
                "--device", "cpu",
                "--verbose",
            ],
            capture_output=True,
            text=True,
            timeout=180,
        )

        # Should complete without error
        assert combine_result.returncode == 0, f"Combining failed: {combine_result.stderr}"
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
if __name__ == "__main__":
    # Propagate pytest's exit code so running this file directly reports
    # failure to the shell instead of always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
|
File without changes
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Test for optimize_sample_size.sh example.
|
|
3
|
+
|
|
4
|
+
This test validates that the optimize-sample-size command works correctly
|
|
5
|
+
with minimal parameters to verify the example script would work.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import subprocess
|
|
9
|
+
import pytest
|
|
10
|
+
import tempfile
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_optimize_sample_size_basic():
    """Test basic sample size optimization command.

    Runs ``optimize-sample-size`` on the ``boolq`` task with sample sizes 5
    and 10. The test passes when the command exits with code 0 and both
    numbers appear somewhere in its stdout or stderr.
    """
    # Function-scope import: the module itself does not import sys.
    import sys

    # No temporary directory is created: the command line references no
    # output path, so the directory the original opened was never used.
    result = subprocess.run(
        [
            # Run the CLI under the same interpreter as the test session.
            sys.executable, "-m", "wisent.core.main", "optimize-sample-size",
            "meta-llama/Llama-3.2-1B-Instruct",  # Small model for testing
            "--task", "boolq",
            "--layer", "3",
            "--token-aggregation", "average",
            "--sample-sizes", "5", "10",
            "--test-size", "20",
            "--limit", "30",
            "--device", "cpu",
            "--verbose",
        ],
        capture_output=True,
        text=True,
        timeout=300,
    )

    # Should complete without error
    assert result.returncode == 0, f"Command failed with: {result.stderr}"

    # Should mention sample sizes in output
    # NOTE(review): these are plain substring checks, so any "5" or "10" in
    # the output satisfies them; tighten once the report format is known.
    assert "5" in result.stdout or "5" in result.stderr
    assert "10" in result.stdout or "10" in result.stderr
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_optimize_sample_size_with_steering():
    """Test sample size optimization in steering mode.

    Runs ``optimize-sample-size`` with ``--steering-mode``, method ``CAA`` and
    strength 1.0. The test passes when the command exits with code 0 and the
    word "steering" appears (case-insensitively) in stdout or stderr.
    """
    # Function-scope import: the module itself does not import sys.
    import sys

    # No temporary directory is created: the command line references no
    # output path, so the directory the original opened was never used.
    result = subprocess.run(
        [
            # Run the CLI under the same interpreter as the test session.
            sys.executable, "-m", "wisent.core.main", "optimize-sample-size",
            "meta-llama/Llama-3.2-1B-Instruct",
            "--task", "boolq",
            "--layer", "3",
            "--token-aggregation", "final",
            "--steering-mode",
            "--steering-method", "CAA",
            "--steering-strength", "1.0",
            "--sample-sizes", "5", "10",
            "--test-size", "15",
            "--limit", "25",
            "--device", "cpu",
            "--verbose",
        ],
        capture_output=True,
        text=True,
        timeout=300,
    )

    # Should complete without error
    assert result.returncode == 0, f"Command failed with: {result.stderr}"

    # Should mention steering in output
    assert "steering" in result.stdout.lower() or "steering" in result.stderr.lower()
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def test_optimize_sample_size_custom_threshold():
    """Test sample size optimization with custom threshold.

    Runs ``optimize-sample-size`` with ``--threshold 0.6``, ``max`` token
    aggregation and a fixed ``--seed 123``. The test passes when the command
    exits with code 0.
    """
    # Function-scope import: the module itself does not import sys.
    import sys

    # No temporary directory is created: the command line references no
    # output path, so the directory the original opened was never used.
    result = subprocess.run(
        [
            # Run the CLI under the same interpreter as the test session.
            sys.executable, "-m", "wisent.core.main", "optimize-sample-size",
            "meta-llama/Llama-3.2-1B-Instruct",
            "--task", "boolq",
            "--layer", "3",
            "--token-aggregation", "max",
            "--threshold", "0.6",
            "--sample-sizes", "5", "10",
            "--test-size", "15",
            "--limit", "25",
            "--device", "cpu",
            "--seed", "123",
            "--verbose",
        ],
        capture_output=True,
        text=True,
        timeout=300,
    )

    # Should complete without error
    assert result.returncode == 0, f"Command failed with: {result.stderr}"
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
if __name__ == "__main__":
    # Propagate pytest's exit code so running this file directly reports
    # failure to the shell instead of always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tests for optimizer examples (classification and steering).
|
|
3
|
+
|
|
4
|
+
Validates parameter optimization commands.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import subprocess
|
|
8
|
+
import pytest
|
|
9
|
+
import tempfile
|
|
10
|
+
import os
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def test_optimize_classification_parameters():
    """Test optimizing classification parameters.

    Runs ``optimize-classification`` with the ``f1`` metric over layers 2-4,
    two aggregation methods and a 0.5-0.6 threshold range. The test passes
    when the command exits with code 0.
    """
    # Function-scope import: the module itself does not import sys.
    import sys

    # No temporary directory is created: the command line references no
    # output path, so the directory the original opened was never used.
    result = subprocess.run(
        [
            # Run the CLI under the same interpreter as the test session.
            sys.executable, "-m", "wisent.core.main", "optimize-classification",
            "meta-llama/Llama-3.2-1B-Instruct",
            "--limit", "10",
            "--optimization-metric", "f1",
            "--max-time-per-task", "5.0",
            "--layer-range", "2-4",
            "--aggregation-methods", "average", "final",
            "--threshold-range", "0.5", "0.6",
            "--device", "cpu",
        ],
        capture_output=True,
        text=True,
        timeout=300,
    )

    assert result.returncode == 0, f"Command failed: {result.stderr}"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_optimize_steering_parameters():
    """Test optimizing steering parameters.

    Runs ``optimize-steering comprehensive`` for the ``boolq`` task with the
    ``CAA`` method. The test passes when the command exits with code 0.
    """
    # Function-scope import: the module itself does not import sys.
    import sys

    # No temporary directory is created: the command line references no
    # output path, so the directory the original opened was never used.
    result = subprocess.run(
        [
            # Run the CLI under the same interpreter as the test session.
            sys.executable, "-m", "wisent.core.main", "optimize-steering", "comprehensive",
            "meta-llama/Llama-3.2-1B-Instruct",
            "--tasks", "boolq",
            "--methods", "CAA",
            "--limit", "10",
            "--max-time-per-task", "5.0",
            "--device", "cpu",
            "--verbose",
        ],
        capture_output=True,
        text=True,
        timeout=300,
    )

    assert result.returncode == 0, f"Command failed: {result.stderr}"
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
if __name__ == "__main__":
    # Propagate pytest's exit code so running this file directly reports
    # failure to the shell instead of always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
|
File without changes
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tests for steering vector creation examples.
|
|
3
|
+
|
|
4
|
+
Validates creating steering vectors from tasks, synthetic pairs, and activations.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import subprocess
|
|
8
|
+
import pytest
|
|
9
|
+
import tempfile
|
|
10
|
+
import os
|
|
11
|
+
import json
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_create_vector_from_task():
    """Test creating steering vector from lm-eval task.

    Runs ``generate-vector-from-task`` on ``boolq`` with trait label
    ``correctness``. The test passes when the command exits with code 0 and
    the requested output file exists.
    """
    # Function-scope import: the module itself does not import sys.
    import sys

    with tempfile.TemporaryDirectory() as tmpdir:
        output_file = os.path.join(tmpdir, "vector.pt")

        result = subprocess.run(
            [
                # Run the CLI under the same interpreter as the test session.
                sys.executable, "-m", "wisent.core.main", "generate-vector-from-task",
                "--task", "boolq",
                "--trait-label", "correctness",
                "--output", output_file,
                "--model", "meta-llama/Llama-3.2-1B-Instruct",
                "--num-pairs", "10",
                "--layers", "3",
                "--token-aggregation", "average",
                "--method", "caa",
                "--normalize",
                "--device", "cpu",
                "--verbose",
            ],
            capture_output=True,
            text=True,
            timeout=300,
        )

        assert result.returncode == 0, f"Command failed: {result.stderr}"
        assert os.path.exists(output_file), "Vector file not created"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_create_vector_from_synthetic():
    """Test creating steering vector from synthetic pairs.

    Runs ``generate-vector-from-synthetic`` for the ``helpfulness`` trait with
    five pairs. The test passes when the command exits with code 0 and the
    requested output file exists.
    """
    # Function-scope import: the module itself does not import sys.
    import sys

    with tempfile.TemporaryDirectory() as tmpdir:
        output_file = os.path.join(tmpdir, "synthetic_vector.pt")

        result = subprocess.run(
            [
                # Run the CLI under the same interpreter as the test session.
                sys.executable, "-m", "wisent.core.main", "generate-vector-from-synthetic",
                "--trait", "helpfulness",
                "--output", output_file,
                "--model", "meta-llama/Llama-3.2-1B-Instruct",
                "--num-pairs", "5",
                "--layers", "3",
                "--token-aggregation", "average",
                "--method", "caa",
                "--normalize",
                "--device", "cpu",
                "--verbose",
            ],
            capture_output=True,
            text=True,
            timeout=300,
        )

        assert result.returncode == 0, f"Command failed: {result.stderr}"
        assert os.path.exists(output_file), "Vector file not created"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def test_create_vector_from_activations():
    """Test creating steering vector from enriched pairs.

    Three-step pipeline:

    1. Write two hand-made contrastive pairs to ``pairs.json``.
    2. Run ``get-activations`` to produce ``enriched_pairs.json``.
    3. Run ``create-steering-vector`` on the enriched file.

    The test passes when both commands exit with code 0 and the vector file
    exists.
    """
    # Function-scope import: the module itself does not import sys.
    import sys

    with tempfile.TemporaryDirectory() as tmpdir:
        # First create pairs with activations
        pairs_file = os.path.join(tmpdir, "pairs.json")
        enriched_file = os.path.join(tmpdir, "enriched_pairs.json")
        vector_file = os.path.join(tmpdir, "vector.pt")

        # Create test pairs
        pairs = [
            {
                "prompt": "How should I respond?",
                "positive_response": {"model_response": "Good answer."},
                "negative_response": {"model_response": "Bad answer."},
                "label": "quality",
            },
            {
                "prompt": "What information is best?",
                "positive_response": {"model_response": "Helpful info."},
                "negative_response": {"model_response": "Unhelpful info."},
                "label": "quality",
            },
        ]
        # Explicit encoding keeps the fixture independent of the platform default.
        with open(pairs_file, "w", encoding="utf-8") as f:
            json.dump(pairs, f)

        # Enrich with activations
        enrich_result = subprocess.run(
            [
                # Run the CLI under the same interpreter as the test session.
                sys.executable, "-m", "wisent.core.main", "get-activations",
                pairs_file,
                "--output", enriched_file,
                "--model", "meta-llama/Llama-3.2-1B-Instruct",
                "--layers", "3",
                "--token-aggregation", "average",
                "--device", "cpu",
            ],
            capture_output=True,
            text=True,
            timeout=300,
        )

        # Include stderr so a failure in this step is diagnosable from the report.
        assert enrich_result.returncode == 0, (
            f"Activation extraction failed: {enrich_result.stderr}"
        )

        # Create steering vector from enriched pairs
        vector_result = subprocess.run(
            [
                sys.executable, "-m", "wisent.core.main", "create-steering-vector",
                enriched_file,
                "--output", vector_file,
                "--method", "caa",
                "--normalize",
                "--verbose",
            ],
            capture_output=True,
            text=True,
            timeout=300,
        )

        assert vector_result.returncode == 0, f"Vector creation failed: {vector_result.stderr}"
        assert os.path.exists(vector_file), "Vector file not created"
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
if __name__ == "__main__":
    # Propagate pytest's exit code so running this file directly reports
    # failure to the shell instead of always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
|
File without changes
|