wisent 0.5.14__py3-none-any.whl → 0.5.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wisent might be problematic. Click here for more details.
- wisent/__init__.py +1 -1
- wisent/cli.py +114 -0
- wisent/core/activations/activations_collector.py +19 -11
- wisent/core/cli/__init__.py +3 -1
- wisent/core/cli/create_steering_vector.py +60 -18
- wisent/core/cli/evaluate_responses.py +14 -8
- wisent/core/cli/generate_pairs_from_task.py +18 -5
- wisent/core/cli/get_activations.py +1 -1
- wisent/core/cli/multi_steer.py +108 -0
- wisent/core/cli/optimize_classification.py +187 -285
- wisent/core/cli/optimize_sample_size.py +78 -0
- wisent/core/cli/optimize_steering.py +354 -53
- wisent/core/cli/tasks.py +274 -9
- wisent/core/errors/__init__.py +0 -0
- wisent/core/errors/error_handler.py +134 -0
- wisent/core/evaluators/benchmark_specific/log_likelihoods_evaluator.py +152 -295
- wisent/core/evaluators/rotator.py +22 -8
- wisent/core/main.py +5 -1
- wisent/core/model_persistence.py +4 -19
- wisent/core/models/wisent_model.py +11 -3
- wisent/core/parser.py +4 -3
- wisent/core/parser_arguments/main_parser.py +1 -1
- wisent/core/parser_arguments/multi_steer_parser.py +4 -3
- wisent/core/parser_arguments/optimize_steering_parser.py +4 -0
- wisent/core/sample_size_optimizer_v2.py +1 -1
- wisent/core/steering_optimizer.py +2 -2
- wisent/tests/__init__.py +0 -0
- wisent/tests/examples/__init__.py +0 -0
- wisent/tests/examples/cli/__init__.py +0 -0
- wisent/tests/examples/cli/activations/__init__.py +0 -0
- wisent/tests/examples/cli/activations/test_get_activations.py +127 -0
- wisent/tests/examples/cli/classifier/__init__.py +0 -0
- wisent/tests/examples/cli/classifier/test_classifier_examples.py +141 -0
- wisent/tests/examples/cli/contrastive_pairs/__init__.py +0 -0
- wisent/tests/examples/cli/contrastive_pairs/test_generate_pairs.py +89 -0
- wisent/tests/examples/cli/evaluation/__init__.py +0 -0
- wisent/tests/examples/cli/evaluation/test_evaluation_examples.py +117 -0
- wisent/tests/examples/cli/generate/__init__.py +0 -0
- wisent/tests/examples/cli/generate/test_generate_with_classifier.py +146 -0
- wisent/tests/examples/cli/generate/test_generate_with_steering.py +149 -0
- wisent/tests/examples/cli/generate/test_only_generate.py +110 -0
- wisent/tests/examples/cli/multi_steering/__init__.py +0 -0
- wisent/tests/examples/cli/multi_steering/test_multi_steer_from_trained_vectors.py +210 -0
- wisent/tests/examples/cli/multi_steering/test_multi_steer_with_different_parameters.py +205 -0
- wisent/tests/examples/cli/multi_steering/test_train_and_multi_steer.py +174 -0
- wisent/tests/examples/cli/optimizer/__init__.py +0 -0
- wisent/tests/examples/cli/optimizer/test_optimize_sample_size.py +102 -0
- wisent/tests/examples/cli/optimizer/test_optimizer_examples.py +59 -0
- wisent/tests/examples/cli/steering/__init__.py +0 -0
- wisent/tests/examples/cli/steering/test_create_steering_vectors.py +135 -0
- wisent/tests/examples/cli/synthetic/__init__.py +0 -0
- wisent/tests/examples/cli/synthetic/test_synthetic_pairs.py +45 -0
- {wisent-0.5.14.dist-info → wisent-0.5.15.dist-info}/METADATA +3 -1
- {wisent-0.5.14.dist-info → wisent-0.5.15.dist-info}/RECORD +59 -29
- wisent/core/agent/diagnose/test_synthetic_classifier.py +0 -71
- /wisent/core/parser_arguments/{test_nonsense_parser.py → nonsense_parser.py} +0 -0
- {wisent-0.5.14.dist-info → wisent-0.5.15.dist-info}/WHEEL +0 -0
- {wisent-0.5.14.dist-info → wisent-0.5.15.dist-info}/entry_points.txt +0 -0
- {wisent-0.5.14.dist-info → wisent-0.5.15.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.5.14.dist-info → wisent-0.5.15.dist-info}/top_level.txt +0 -0
|
@@ -4,6 +4,7 @@ import sys
|
|
|
4
4
|
import json
|
|
5
5
|
import time
|
|
6
6
|
import numpy as np
|
|
7
|
+
from wisent.core.evaluators.rotator import EvaluatorRotator
|
|
7
8
|
|
|
8
9
|
def execute_optimize_steering(args):
|
|
9
10
|
"""
|
|
@@ -40,17 +41,17 @@ def execute_optimize_steering(args):
|
|
|
40
41
|
# Initialize data loader
|
|
41
42
|
loader = LMEvalDataLoader()
|
|
42
43
|
|
|
43
|
-
# Execute based on subcommand
|
|
44
|
+
# Execute based on subcommand and return results
|
|
44
45
|
if args.steering_action == 'comprehensive':
|
|
45
|
-
execute_comprehensive(args, model, loader)
|
|
46
|
+
return execute_comprehensive(args, model, loader)
|
|
46
47
|
elif args.steering_action == 'compare-methods':
|
|
47
|
-
execute_compare_methods(args, model, loader)
|
|
48
|
+
return execute_compare_methods(args, model, loader)
|
|
48
49
|
elif args.steering_action == 'optimize-layer':
|
|
49
|
-
execute_optimize_layer(args, model, loader)
|
|
50
|
+
return execute_optimize_layer(args, model, loader)
|
|
50
51
|
elif args.steering_action == 'optimize-strength':
|
|
51
|
-
execute_optimize_strength(args, model, loader)
|
|
52
|
+
return execute_optimize_strength(args, model, loader)
|
|
52
53
|
elif args.steering_action == 'auto':
|
|
53
|
-
execute_auto(args, model, loader)
|
|
54
|
+
return execute_auto(args, model, loader)
|
|
54
55
|
else:
|
|
55
56
|
print(f"\n✗ Unknown steering action: {args.steering_action}")
|
|
56
57
|
sys.exit(1)
|
|
@@ -107,17 +108,29 @@ def execute_comprehensive(args, model, loader):
|
|
|
107
108
|
|
|
108
109
|
train_pairs = result['train_qa_pairs']
|
|
109
110
|
test_pairs = result['test_qa_pairs']
|
|
110
|
-
|
|
111
|
+
|
|
111
112
|
print(f" ✓ Loaded {len(train_pairs.pairs)} train, {len(test_pairs.pairs)} test pairs")
|
|
112
|
-
|
|
113
|
+
|
|
114
|
+
# Initialize evaluator for this task (auto-select based on task_name)
|
|
115
|
+
EvaluatorRotator.discover_evaluators('wisent.core.evaluators.benchmark_specific')
|
|
116
|
+
evaluator = EvaluatorRotator(evaluator=None, task_name=task_name) # None = auto-select
|
|
117
|
+
print(f" ✓ Using evaluator: {evaluator._evaluator.name} (auto-selected for {task_name})")
|
|
118
|
+
|
|
113
119
|
print(f"\n 🔍 Testing CAA method across layers, strengths, AND strategies...")
|
|
114
120
|
print(f" Total configurations: {len(layers_to_test)} layers × {len(strengths_to_test)} strengths × {len(strategies_to_test)} strategies = {len(layers_to_test) * len(strengths_to_test) * len(strategies_to_test)}")
|
|
115
|
-
|
|
121
|
+
|
|
116
122
|
best_score = 0
|
|
117
123
|
best_config = None
|
|
118
124
|
method_results = {}
|
|
119
125
|
configs_tested = 0
|
|
120
|
-
|
|
126
|
+
all_generation_examples = [] # Store generation examples for all configs
|
|
127
|
+
|
|
128
|
+
# Prepare test prompts if generating examples for all configs
|
|
129
|
+
if args.save_all_generation_examples or args.save_generation_examples:
|
|
130
|
+
num_examples = min(args.num_generation_examples, len(test_pairs.pairs))
|
|
131
|
+
example_pairs = test_pairs.pairs[:num_examples]
|
|
132
|
+
print(f" 📝 Will generate {num_examples} example responses per configuration")
|
|
133
|
+
|
|
121
134
|
for layer in layers_to_test:
|
|
122
135
|
for strength in strengths_to_test:
|
|
123
136
|
for strategy in strategies_to_test:
|
|
@@ -161,40 +174,108 @@ def execute_comprehensive(args, model, loader):
|
|
|
161
174
|
caa_method = CAAMethod(kwargs={"normalize": True})
|
|
162
175
|
steering_vector = caa_method.train_for_layer(pos_acts, neg_acts)
|
|
163
176
|
|
|
164
|
-
# Step 2: Evaluate with
|
|
165
|
-
#
|
|
166
|
-
|
|
177
|
+
# Step 2: Evaluate with ACTUAL GENERATION and task evaluator
|
|
178
|
+
# Create steering plan
|
|
179
|
+
from wisent.core.models.core.atoms import SteeringVector, SteeringPlan
|
|
180
|
+
steering_vec = SteeringVector(vector=steering_vector, scale=strength)
|
|
181
|
+
steering_plan = SteeringPlan(
|
|
182
|
+
layers={layer_str: steering_vec},
|
|
183
|
+
layers_description=[f"CAA steering layer={layer}, strength={strength}, strategy={strategy}"]
|
|
184
|
+
)
|
|
185
|
+
|
|
186
|
+
# Apply steering to model
|
|
187
|
+
model.apply_steering(steering_plan)
|
|
188
|
+
|
|
167
189
|
test_scores = []
|
|
168
|
-
|
|
190
|
+
|
|
169
191
|
for pair in test_pairs.pairs:
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
192
|
+
try:
|
|
193
|
+
# Prepare choices for multiple choice evaluation
|
|
194
|
+
choices = [pair.negative_response.content, pair.positive_response.content]
|
|
195
|
+
expected = pair.positive_response.content
|
|
196
|
+
|
|
197
|
+
# Use the Wisent evaluator to check correctness
|
|
198
|
+
# The evaluator will use log likelihood if possible,
|
|
199
|
+
# otherwise fall back to generation
|
|
200
|
+
eval_result = evaluator.evaluate(
|
|
201
|
+
response="", # Not used for log likelihood eval
|
|
202
|
+
expected=expected,
|
|
203
|
+
model=model,
|
|
204
|
+
question=pair.question,
|
|
205
|
+
choices=choices,
|
|
206
|
+
steering_plan=steering_plan
|
|
207
|
+
)
|
|
208
|
+
|
|
209
|
+
# Convert TRUTHFUL/UNTRUTHFUL to 1.0/0.0
|
|
210
|
+
is_correct = eval_result.ground_truth == "TRUTHFUL"
|
|
211
|
+
test_scores.append(1.0 if is_correct else 0.0)
|
|
212
|
+
|
|
213
|
+
except Exception as e:
|
|
214
|
+
# NO FALLBACK - raise the error immediately
|
|
215
|
+
print(f"\n❌ Evaluation failed for test pair:")
|
|
216
|
+
print(f" Question: {pair.question[:100]}")
|
|
217
|
+
print(f" Error: {e}")
|
|
218
|
+
raise
|
|
219
|
+
|
|
220
|
+
# Clear steering
|
|
221
|
+
model.clear_steering()
|
|
194
222
|
|
|
195
223
|
if len(test_scores) > 0:
|
|
196
224
|
avg_score = np.mean(test_scores)
|
|
197
|
-
|
|
225
|
+
|
|
226
|
+
# Generate examples for this configuration if requested
|
|
227
|
+
if args.save_all_generation_examples:
|
|
228
|
+
config_examples = []
|
|
229
|
+
for idx, pair in enumerate(example_pairs):
|
|
230
|
+
prompt = pair.question
|
|
231
|
+
try:
|
|
232
|
+
# Generate without steering (only once per prompt, reuse if already generated)
|
|
233
|
+
unsteered_response = model.generate(
|
|
234
|
+
[[{"role": "user", "content": prompt}]],
|
|
235
|
+
max_new_tokens=100,
|
|
236
|
+
temperature=0.7,
|
|
237
|
+
use_steering=False
|
|
238
|
+
)[0]
|
|
239
|
+
|
|
240
|
+
# Create steering plan for this config
|
|
241
|
+
from wisent.core.models.core.atoms import SteeringVector, SteeringPlan
|
|
242
|
+
steering_vec = SteeringVector(vector=steering_vector, scale=strength)
|
|
243
|
+
steering_plan = SteeringPlan(
|
|
244
|
+
layers={layer_str: steering_vec},
|
|
245
|
+
layers_description=[f"CAA steering layer={layer}, strength={strength}, strategy={strategy}"]
|
|
246
|
+
)
|
|
247
|
+
|
|
248
|
+
# Generate with steering
|
|
249
|
+
model.apply_steering(steering_plan)
|
|
250
|
+
steered_response = model.generate(
|
|
251
|
+
[[{"role": "user", "content": prompt}]],
|
|
252
|
+
max_new_tokens=100,
|
|
253
|
+
temperature=0.7,
|
|
254
|
+
use_steering=True,
|
|
255
|
+
steering_plan=steering_plan
|
|
256
|
+
)[0]
|
|
257
|
+
model.clear_steering()
|
|
258
|
+
|
|
259
|
+
config_examples.append({
|
|
260
|
+
'question': prompt,
|
|
261
|
+
'correct_answer': pair.positive_response.content,
|
|
262
|
+
'incorrect_answer': pair.negative_response.content,
|
|
263
|
+
'unsteered_generation': unsteered_response,
|
|
264
|
+
'steered_generation': steered_response
|
|
265
|
+
})
|
|
266
|
+
except Exception as e:
|
|
267
|
+
if args.verbose:
|
|
268
|
+
print(f" ⚠️ Failed to generate example for config layer={layer}, strength={strength}, strategy={strategy}: {e}")
|
|
269
|
+
|
|
270
|
+
# Store this config's examples
|
|
271
|
+
all_generation_examples.append({
|
|
272
|
+
'layer': layer,
|
|
273
|
+
'strength': strength,
|
|
274
|
+
'strategy': strategy,
|
|
275
|
+
'accuracy': avg_score,
|
|
276
|
+
'examples': config_examples
|
|
277
|
+
})
|
|
278
|
+
|
|
198
279
|
if avg_score > best_score:
|
|
199
280
|
best_score = avg_score
|
|
200
281
|
best_config = {
|
|
@@ -203,14 +284,18 @@ def execute_comprehensive(args, model, loader):
|
|
|
203
284
|
'strategy': strategy,
|
|
204
285
|
'accuracy': avg_score
|
|
205
286
|
}
|
|
206
|
-
|
|
287
|
+
|
|
207
288
|
if configs_tested % 10 == 0 and args.verbose:
|
|
208
289
|
print(f" Tested {configs_tested} configurations...", end='\r')
|
|
209
|
-
|
|
290
|
+
|
|
210
291
|
except Exception as e:
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
292
|
+
# NO FALLBACK - raise the error immediately
|
|
293
|
+
print(f"\n❌ Configuration test failed:")
|
|
294
|
+
print(f" Layer: {layer}")
|
|
295
|
+
print(f" Strength: {strength}")
|
|
296
|
+
print(f" Strategy: {strategy}")
|
|
297
|
+
print(f" Error: {e}")
|
|
298
|
+
raise
|
|
214
299
|
|
|
215
300
|
if best_config:
|
|
216
301
|
print(f"\n ✅ Best configuration found:")
|
|
@@ -219,7 +304,7 @@ def execute_comprehensive(args, model, loader):
|
|
|
219
304
|
print(f" Strength: {best_config['strength']}")
|
|
220
305
|
print(f" Strategy: {best_config['strategy']} ⭐")
|
|
221
306
|
print(f" Accuracy: {best_config['accuracy']:.3f}")
|
|
222
|
-
|
|
307
|
+
|
|
223
308
|
method_results['CAA'] = {
|
|
224
309
|
'optimal_layer': best_config['layer'],
|
|
225
310
|
'optimal_strength': best_config['strength'],
|
|
@@ -227,6 +312,179 @@ def execute_comprehensive(args, model, loader):
|
|
|
227
312
|
'accuracy': best_config['accuracy'],
|
|
228
313
|
'f1': best_config['accuracy']
|
|
229
314
|
}
|
|
315
|
+
|
|
316
|
+
# Save best steering vector if requested
|
|
317
|
+
if args.save_best_vector:
|
|
318
|
+
import os
|
|
319
|
+
vector_dir = args.save_best_vector
|
|
320
|
+
os.makedirs(vector_dir, exist_ok=True)
|
|
321
|
+
|
|
322
|
+
# Recreate the best steering vector
|
|
323
|
+
best_layer_str = str(best_config['layer'])
|
|
324
|
+
pos_acts_best = []
|
|
325
|
+
neg_acts_best = []
|
|
326
|
+
|
|
327
|
+
for pair in train_pairs.pairs:
|
|
328
|
+
updated_pair = collector.collect_for_pair(
|
|
329
|
+
pair,
|
|
330
|
+
layers=[best_layer_str],
|
|
331
|
+
aggregation=ActivationAggregationStrategy.MEAN_POOLING,
|
|
332
|
+
return_full_sequence=False,
|
|
333
|
+
normalize_layers=False
|
|
334
|
+
)
|
|
335
|
+
|
|
336
|
+
if updated_pair.positive_response.layers_activations and best_layer_str in updated_pair.positive_response.layers_activations:
|
|
337
|
+
act = updated_pair.positive_response.layers_activations[best_layer_str]
|
|
338
|
+
if act is not None:
|
|
339
|
+
pos_acts_best.append(act)
|
|
340
|
+
|
|
341
|
+
if updated_pair.negative_response.layers_activations and best_layer_str in updated_pair.negative_response.layers_activations:
|
|
342
|
+
act = updated_pair.negative_response.layers_activations[best_layer_str]
|
|
343
|
+
if act is not None:
|
|
344
|
+
neg_acts_best.append(act)
|
|
345
|
+
|
|
346
|
+
# Create and save steering vector
|
|
347
|
+
caa_method = CAAMethod(kwargs={"normalize": True})
|
|
348
|
+
best_steering_vector = caa_method.train_for_layer(pos_acts_best, neg_acts_best)
|
|
349
|
+
|
|
350
|
+
vector_path = os.path.join(vector_dir, f"{task_name}_layer{best_config['layer']}.pt")
|
|
351
|
+
torch.save({
|
|
352
|
+
'steering_vector': best_steering_vector,
|
|
353
|
+
'vector': best_steering_vector, # Legacy key
|
|
354
|
+
'layer': best_config['layer'],
|
|
355
|
+
'layer_index': best_config['layer'], # Legacy key
|
|
356
|
+
'strength': best_config['strength'],
|
|
357
|
+
'strategy': best_config['strategy'],
|
|
358
|
+
'method': 'CAA',
|
|
359
|
+
'task': task_name,
|
|
360
|
+
'model': args.model,
|
|
361
|
+
'accuracy': best_config['accuracy']
|
|
362
|
+
}, vector_path)
|
|
363
|
+
print(f" 💾 Saved steering vector to: {vector_path}")
|
|
364
|
+
|
|
365
|
+
# Save generation examples
|
|
366
|
+
if args.save_all_generation_examples:
|
|
367
|
+
# Save examples for ALL configurations
|
|
368
|
+
examples_path = os.path.join(
|
|
369
|
+
args.save_best_vector if args.save_best_vector else "./optimization_results",
|
|
370
|
+
f"{task_name}_all_generation_examples.json"
|
|
371
|
+
)
|
|
372
|
+
os.makedirs(os.path.dirname(examples_path), exist_ok=True)
|
|
373
|
+
|
|
374
|
+
with open(examples_path, 'w') as f:
|
|
375
|
+
json.dump({
|
|
376
|
+
'task': task_name,
|
|
377
|
+
'model': args.model,
|
|
378
|
+
'best_config': best_config,
|
|
379
|
+
'configurations': all_generation_examples
|
|
380
|
+
}, f, indent=2)
|
|
381
|
+
|
|
382
|
+
print(f"\n 💾 Saved generation examples for {len(all_generation_examples)} configurations to: {examples_path}")
|
|
383
|
+
|
|
384
|
+
elif args.save_generation_examples:
|
|
385
|
+
# Save examples only for the best configuration
|
|
386
|
+
print(f"\n 📝 Generating example responses for best configuration...")
|
|
387
|
+
|
|
388
|
+
# Get a few test examples to generate from
|
|
389
|
+
num_examples = min(args.num_generation_examples, len(test_pairs.pairs))
|
|
390
|
+
example_pairs = test_pairs.pairs[:num_examples]
|
|
391
|
+
|
|
392
|
+
generation_examples = []
|
|
393
|
+
|
|
394
|
+
for idx, pair in enumerate(example_pairs):
|
|
395
|
+
# Create prompt from the question
|
|
396
|
+
prompt = pair.question
|
|
397
|
+
|
|
398
|
+
try:
|
|
399
|
+
# Generate without steering
|
|
400
|
+
unsteered_response = model.generate(
|
|
401
|
+
[[{"role": "user", "content": prompt}]],
|
|
402
|
+
max_new_tokens=100,
|
|
403
|
+
temperature=0.7,
|
|
404
|
+
use_steering=False
|
|
405
|
+
)[0]
|
|
406
|
+
|
|
407
|
+
# Recreate best steering vector for generation
|
|
408
|
+
best_layer_str = str(best_config['layer'])
|
|
409
|
+
pos_acts_gen = []
|
|
410
|
+
neg_acts_gen = []
|
|
411
|
+
|
|
412
|
+
# Collect activations again for steering
|
|
413
|
+
for train_pair in train_pairs.pairs[:20]: # Use subset for speed
|
|
414
|
+
updated_pair = collector.collect_for_pair(
|
|
415
|
+
train_pair,
|
|
416
|
+
layers=[best_layer_str],
|
|
417
|
+
aggregation=ActivationAggregationStrategy.MEAN_POOLING,
|
|
418
|
+
return_full_sequence=False,
|
|
419
|
+
normalize_layers=False
|
|
420
|
+
)
|
|
421
|
+
|
|
422
|
+
if updated_pair.positive_response.layers_activations and best_layer_str in updated_pair.positive_response.layers_activations:
|
|
423
|
+
act = updated_pair.positive_response.layers_activations[best_layer_str]
|
|
424
|
+
if act is not None:
|
|
425
|
+
pos_acts_gen.append(act)
|
|
426
|
+
|
|
427
|
+
if updated_pair.negative_response.layers_activations and best_layer_str in updated_pair.negative_response.layers_activations:
|
|
428
|
+
act = updated_pair.negative_response.layers_activations[best_layer_str]
|
|
429
|
+
if act is not None:
|
|
430
|
+
neg_acts_gen.append(act)
|
|
431
|
+
|
|
432
|
+
# Create steering vector
|
|
433
|
+
caa_method_gen = CAAMethod(kwargs={"normalize": True})
|
|
434
|
+
steering_vector_gen = caa_method_gen.train_for_layer(pos_acts_gen, neg_acts_gen)
|
|
435
|
+
|
|
436
|
+
# Create SteeringPlan
|
|
437
|
+
from wisent.core.models.core.atoms import SteeringVector, SteeringPlan
|
|
438
|
+
steering_vec = SteeringVector(vector=steering_vector_gen, scale=best_config['strength'])
|
|
439
|
+
steering_plan = SteeringPlan(
|
|
440
|
+
layers={best_layer_str: steering_vec},
|
|
441
|
+
layers_description=[f"CAA steering for {task_name}"]
|
|
442
|
+
)
|
|
443
|
+
|
|
444
|
+
# Generate with steering
|
|
445
|
+
model.attach(steering_plan)
|
|
446
|
+
steered_response = model.generate(
|
|
447
|
+
[[{"role": "user", "content": prompt}]],
|
|
448
|
+
max_new_tokens=100,
|
|
449
|
+
temperature=0.7,
|
|
450
|
+
use_steering=True,
|
|
451
|
+
steering_plan=steering_plan
|
|
452
|
+
)[0]
|
|
453
|
+
model.detach()
|
|
454
|
+
|
|
455
|
+
generation_examples.append({
|
|
456
|
+
'question': prompt,
|
|
457
|
+
'correct_answer': pair.positive_response.content,
|
|
458
|
+
'incorrect_answer': pair.negative_response.content,
|
|
459
|
+
'unsteered_generation': unsteered_response,
|
|
460
|
+
'steered_generation': steered_response
|
|
461
|
+
})
|
|
462
|
+
|
|
463
|
+
print(f" Generated example {idx+1}/{num_examples}")
|
|
464
|
+
|
|
465
|
+
except Exception as e:
|
|
466
|
+
print(f" ⚠️ Failed to generate example {idx+1}: {e}")
|
|
467
|
+
if args.verbose:
|
|
468
|
+
import traceback
|
|
469
|
+
traceback.print_exc()
|
|
470
|
+
|
|
471
|
+
# Save examples to JSON
|
|
472
|
+
examples_path = os.path.join(
|
|
473
|
+
args.save_best_vector if args.save_best_vector else "./optimization_results",
|
|
474
|
+
f"{task_name}_generation_examples.json"
|
|
475
|
+
)
|
|
476
|
+
os.makedirs(os.path.dirname(examples_path), exist_ok=True)
|
|
477
|
+
|
|
478
|
+
with open(examples_path, 'w') as f:
|
|
479
|
+
json.dump({
|
|
480
|
+
'task': task_name,
|
|
481
|
+
'model': args.model,
|
|
482
|
+
'best_config': best_config,
|
|
483
|
+
'examples': generation_examples
|
|
484
|
+
}, f, indent=2)
|
|
485
|
+
|
|
486
|
+
print(f" 💾 Saved {len(generation_examples)} generation examples to: {examples_path}")
|
|
487
|
+
|
|
230
488
|
else:
|
|
231
489
|
print(f"\n ⚠️ No valid configuration found")
|
|
232
490
|
method_results['CAA'] = {
|
|
@@ -247,12 +505,14 @@ def execute_comprehensive(args, model, loader):
|
|
|
247
505
|
|
|
248
506
|
task_time = time.time() - task_start_time
|
|
249
507
|
print(f"\n ⏱️ Task completed in {task_time:.1f}s (tested {configs_tested} configurations)")
|
|
250
|
-
|
|
508
|
+
|
|
251
509
|
except Exception as e:
|
|
252
|
-
|
|
510
|
+
# NO FALLBACK - raise the error immediately
|
|
511
|
+
print(f"\n❌ Task '{task_name}' optimization failed:")
|
|
512
|
+
print(f" Error: {e}")
|
|
253
513
|
import traceback
|
|
254
514
|
traceback.print_exc()
|
|
255
|
-
|
|
515
|
+
raise
|
|
256
516
|
|
|
257
517
|
# Save results
|
|
258
518
|
print(f"\n{'='*80}")
|
|
@@ -283,6 +543,17 @@ def execute_comprehensive(args, model, loader):
|
|
|
283
543
|
print(f" {task_name:20s} | Method: {config['best_method']:10s} | Layer: {config['best_layer']:2d} | Strength: {config['best_strength']:.2f} | Strategy: {config['best_strategy']:18s}")
|
|
284
544
|
print("-" * 100 + "\n")
|
|
285
545
|
|
|
546
|
+
# Return results for programmatic access
|
|
547
|
+
return {
|
|
548
|
+
"model": args.model,
|
|
549
|
+
"action": "comprehensive",
|
|
550
|
+
"methods_tested": args.methods,
|
|
551
|
+
"tasks_optimized": list(all_results.keys()),
|
|
552
|
+
"results": all_results,
|
|
553
|
+
"results_file": results_file,
|
|
554
|
+
"optimization_dimensions": ['layer', 'strength', 'strategy']
|
|
555
|
+
}
|
|
556
|
+
|
|
286
557
|
|
|
287
558
|
def get_strategy_weight(strategy: str, position: float) -> float:
|
|
288
559
|
"""
|
|
@@ -318,7 +589,7 @@ def execute_compare_methods(args, model, loader):
|
|
|
318
589
|
print(f"🔍 Comparing steering methods for task: {args.task}\n")
|
|
319
590
|
print(f" Methods: {', '.join(args.methods)}")
|
|
320
591
|
print(f" Limit: {args.limit} samples\n")
|
|
321
|
-
|
|
592
|
+
|
|
322
593
|
result = loader._load_one_task(
|
|
323
594
|
task_name=args.task,
|
|
324
595
|
split_ratio=0.8,
|
|
@@ -327,38 +598,68 @@ def execute_compare_methods(args, model, loader):
|
|
|
327
598
|
training_limit=None,
|
|
328
599
|
testing_limit=None
|
|
329
600
|
)
|
|
330
|
-
|
|
601
|
+
|
|
331
602
|
print(f"✅ Loaded {len(result['train_qa_pairs'].pairs)} train pairs\n")
|
|
332
603
|
print("⚠️ Full method comparison requires implementation of HPR, DAC, BiPO, KSteering")
|
|
333
604
|
print(" Currently only CAA is fully implemented")
|
|
334
605
|
|
|
606
|
+
return {
|
|
607
|
+
"action": "compare-methods",
|
|
608
|
+
"task": args.task,
|
|
609
|
+
"methods": args.methods,
|
|
610
|
+
"status": "not_fully_implemented"
|
|
611
|
+
}
|
|
612
|
+
|
|
335
613
|
|
|
336
614
|
def execute_optimize_layer(args, model, loader):
|
|
337
615
|
"""Execute layer optimization."""
|
|
338
616
|
print(f"🎯 Optimizing steering layer for task: {args.task}\n")
|
|
339
617
|
print(f" Method: {args.method}")
|
|
340
618
|
print(f" Strength: {args.strength}\n")
|
|
341
|
-
|
|
619
|
+
|
|
342
620
|
print("⚠️ Layer optimization not yet fully implemented")
|
|
343
621
|
print(f" This would optimize layer for {args.method} method")
|
|
344
622
|
|
|
623
|
+
return {
|
|
624
|
+
"action": "optimize-layer",
|
|
625
|
+
"task": args.task,
|
|
626
|
+
"method": args.method,
|
|
627
|
+
"strength": args.strength,
|
|
628
|
+
"status": "not_fully_implemented"
|
|
629
|
+
}
|
|
630
|
+
|
|
345
631
|
|
|
346
632
|
def execute_optimize_strength(args, model, loader):
|
|
347
633
|
"""Execute strength optimization."""
|
|
348
634
|
print(f"💪 Optimizing steering strength for task: {args.task}\n")
|
|
349
635
|
print(f" Method: {args.method}")
|
|
350
636
|
print(f" Strength range: {args.strength_range[0]} to {args.strength_range[1]}\n")
|
|
351
|
-
|
|
637
|
+
|
|
352
638
|
print("⚠️ Strength optimization not yet fully implemented")
|
|
353
639
|
print(f" This would optimize strength for {args.method} method")
|
|
354
640
|
|
|
641
|
+
return {
|
|
642
|
+
"action": "optimize-strength",
|
|
643
|
+
"task": args.task,
|
|
644
|
+
"method": args.method,
|
|
645
|
+
"strength_range": args.strength_range,
|
|
646
|
+
"status": "not_fully_implemented"
|
|
647
|
+
}
|
|
648
|
+
|
|
355
649
|
|
|
356
650
|
def execute_auto(args, model, loader):
|
|
357
651
|
"""Execute automatic optimization based on classification config."""
|
|
358
652
|
print(f"🤖 Running automatic steering optimization...\n")
|
|
359
653
|
print(f" Methods: {', '.join(args.methods)}")
|
|
360
654
|
print(f" Strength range: {args.strength_range}\n")
|
|
361
|
-
|
|
655
|
+
|
|
362
656
|
print("⚠️ Auto optimization not yet fully implemented")
|
|
363
657
|
print(" This would use classification results to guide steering optimization")
|
|
364
658
|
|
|
659
|
+
return {
|
|
660
|
+
"action": "auto",
|
|
661
|
+
"methods": args.methods,
|
|
662
|
+
"strength_range": args.strength_range,
|
|
663
|
+
"status": "not_fully_implemented"
|
|
664
|
+
}
|
|
665
|
+
|