wisent 0.1.1__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wisent might be problematic.
- wisent/__init__.py +1 -8
- wisent/benchmarks/__init__.py +0 -0
- wisent/benchmarks/coding/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
- wisent/benchmarks/coding/metrics/evaluator.py +275 -0
- wisent/benchmarks/coding/metrics/passk.py +66 -0
- wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
- wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
- wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
- wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
- wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
- wisent/benchmarks/coding/providers/__init__.py +18 -0
- wisent/benchmarks/coding/providers/core/__init__.py +0 -0
- wisent/benchmarks/coding/providers/core/atoms.py +31 -0
- wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
- wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
- wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
- wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
- wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
- wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
- wisent/classifiers/__init__.py +0 -0
- wisent/classifiers/core/__init__.py +0 -0
- wisent/classifiers/core/atoms.py +747 -0
- wisent/classifiers/models/__init__.py +0 -0
- wisent/classifiers/models/logistic.py +29 -0
- wisent/classifiers/models/mlp.py +47 -0
- wisent/cli/__init__.py +0 -0
- wisent/cli/classifiers/__init__.py +0 -0
- wisent/cli/classifiers/classifier_rotator.py +137 -0
- wisent/cli/cli_logger.py +142 -0
- wisent/cli/data_loaders/__init__.py +0 -0
- wisent/cli/data_loaders/data_loader_rotator.py +96 -0
- wisent/cli/evaluators/__init__.py +0 -0
- wisent/cli/evaluators/evaluator_rotator.py +148 -0
- wisent/cli/steering_methods/__init__.py +0 -0
- wisent/cli/steering_methods/steering_rotator.py +110 -0
- wisent/cli/wisent_cli/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
- wisent/cli/wisent_cli/commands/listing.py +154 -0
- wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
- wisent/cli/wisent_cli/main.py +93 -0
- wisent/cli/wisent_cli/shell.py +80 -0
- wisent/cli/wisent_cli/ui.py +69 -0
- wisent/cli/wisent_cli/util/__init__.py +0 -0
- wisent/cli/wisent_cli/util/aggregations.py +43 -0
- wisent/cli/wisent_cli/util/parsing.py +126 -0
- wisent/cli/wisent_cli/version.py +4 -0
- wisent/core/__init__.py +27 -0
- wisent/core/activations/__init__.py +0 -0
- wisent/core/activations/activations_collector.py +338 -0
- wisent/core/activations/core/__init__.py +0 -0
- wisent/core/activations/core/atoms.py +216 -0
- wisent/core/agent/__init__.py +18 -0
- wisent/core/agent/budget.py +638 -0
- wisent/core/agent/device_benchmarks.py +685 -0
- wisent/core/agent/diagnose/__init__.py +55 -0
- wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
- wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
- wisent/core/agent/diagnose/create_classifier.py +1154 -0
- wisent/core/agent/diagnose/response_diagnostics.py +268 -0
- wisent/core/agent/diagnose/select_classifiers.py +506 -0
- wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
- wisent/core/agent/diagnose/tasks/__init__.py +33 -0
- wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
- wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
- wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
- wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
- wisent/core/agent/diagnose.py +242 -0
- wisent/core/agent/steer.py +212 -0
- wisent/core/agent/timeout.py +134 -0
- wisent/core/autonomous_agent.py +1234 -0
- wisent/core/bigcode_integration.py +583 -0
- wisent/core/contrastive_pairs/__init__.py +15 -0
- wisent/core/contrastive_pairs/core/__init__.py +0 -0
- wisent/core/contrastive_pairs/core/atoms.py +45 -0
- wisent/core/contrastive_pairs/core/buliders.py +59 -0
- wisent/core/contrastive_pairs/core/pair.py +178 -0
- wisent/core/contrastive_pairs/core/response.py +152 -0
- wisent/core/contrastive_pairs/core/serialization.py +300 -0
- wisent/core/contrastive_pairs/core/set.py +133 -0
- wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
- wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
- wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
- wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
- wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
- wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
- wisent/core/data_loaders/__init__.py +0 -0
- wisent/core/data_loaders/core/__init__.py +0 -0
- wisent/core/data_loaders/core/atoms.py +98 -0
- wisent/core/data_loaders/loaders/__init__.py +0 -0
- wisent/core/data_loaders/loaders/custom.py +120 -0
- wisent/core/data_loaders/loaders/lm_loader.py +218 -0
- wisent/core/detection_handling.py +257 -0
- wisent/core/download_full_benchmarks.py +1386 -0
- wisent/core/evaluators/__init__.py +0 -0
- wisent/core/evaluators/oracles/__init__.py +0 -0
- wisent/core/evaluators/oracles/interactive.py +73 -0
- wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
- wisent/core/evaluators/oracles/user_specified.py +67 -0
- wisent/core/hyperparameter_optimizer.py +429 -0
- wisent/core/lm_eval_harness_ground_truth.py +1396 -0
- wisent/core/log_likelihoods_evaluator.py +321 -0
- wisent/core/managed_cached_benchmarks.py +595 -0
- wisent/core/mixed_benchmark_sampler.py +364 -0
- wisent/core/model_config_manager.py +330 -0
- wisent/core/model_persistence.py +317 -0
- wisent/core/models/__init__.py +0 -0
- wisent/core/models/core/__init__.py +0 -0
- wisent/core/models/core/atoms.py +460 -0
- wisent/core/models/wisent_model.py +727 -0
- wisent/core/multi_steering.py +316 -0
- wisent/core/optuna/__init__.py +57 -0
- wisent/core/optuna/classifier/__init__.py +25 -0
- wisent/core/optuna/classifier/activation_generator.py +349 -0
- wisent/core/optuna/classifier/classifier_cache.py +509 -0
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
- wisent/core/optuna/steering/__init__.py +0 -0
- wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
- wisent/core/optuna/steering/data_utils.py +342 -0
- wisent/core/optuna/steering/metrics.py +474 -0
- wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
- wisent/core/optuna/steering/steering_optimization.py +1111 -0
- wisent/core/parser.py +1668 -0
- wisent/core/prompts/__init__.py +0 -0
- wisent/core/prompts/core/__init__.py +0 -0
- wisent/core/prompts/core/atom.py +57 -0
- wisent/core/prompts/core/prompt_formater.py +157 -0
- wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
- wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
- wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
- wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
- wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
- wisent/core/representation.py +5 -0
- wisent/core/sample_size_optimizer.py +648 -0
- wisent/core/sample_size_optimizer_v2.py +355 -0
- wisent/core/save_results.py +277 -0
- wisent/core/steering.py +652 -0
- wisent/core/steering_method.py +26 -0
- wisent/core/steering_methods/__init__.py +0 -0
- wisent/core/steering_methods/core/__init__.py +0 -0
- wisent/core/steering_methods/core/atoms.py +153 -0
- wisent/core/steering_methods/methods/__init__.py +0 -0
- wisent/core/steering_methods/methods/caa.py +44 -0
- wisent/core/steering_optimizer.py +1297 -0
- wisent/core/task_interface.py +132 -0
- wisent/core/task_selector.py +189 -0
- wisent/core/tasks/__init__.py +175 -0
- wisent/core/tasks/aime_task.py +141 -0
- wisent/core/tasks/file_task.py +211 -0
- wisent/core/tasks/hle_task.py +180 -0
- wisent/core/tasks/hmmt_task.py +119 -0
- wisent/core/tasks/livecodebench_task.py +201 -0
- wisent/core/tasks/livemathbench_task.py +158 -0
- wisent/core/tasks/lm_eval_task.py +455 -0
- wisent/core/tasks/math500_task.py +84 -0
- wisent/core/tasks/polymath_task.py +146 -0
- wisent/core/tasks/supergpqa_task.py +220 -0
- wisent/core/time_estimator.py +149 -0
- wisent/core/timing_calibration.py +174 -0
- wisent/core/tracking/__init__.py +54 -0
- wisent/core/tracking/latency.py +618 -0
- wisent/core/tracking/memory.py +359 -0
- wisent/core/trainers/__init__.py +0 -0
- wisent/core/trainers/core/__init__.py +11 -0
- wisent/core/trainers/core/atoms.py +45 -0
- wisent/core/trainers/steering_trainer.py +271 -0
- wisent/core/user_model_config.py +158 -0
- wisent/opti/__init__.py +0 -0
- wisent/opti/core/__init__.py +0 -0
- wisent/opti/core/atoms.py +175 -0
- wisent/opti/methods/__init__.py +0 -0
- wisent/opti/methods/opti_classificator.py +172 -0
- wisent/opti/methods/opti_steering.py +138 -0
- wisent/synthetic/__init__.py +0 -0
- wisent/synthetic/cleaners/__init__.py +0 -0
- wisent/synthetic/cleaners/core/__init__.py +0 -0
- wisent/synthetic/cleaners/core/atoms.py +58 -0
- wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
- wisent/synthetic/cleaners/methods/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
- wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
- wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
- wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
- wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
- wisent/synthetic/db_instructions/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/atoms.py +25 -0
- wisent/synthetic/db_instructions/mini_dp.py +37 -0
- wisent/synthetic/generators/__init__.py +0 -0
- wisent/synthetic/generators/core/__init__.py +0 -0
- wisent/synthetic/generators/core/atoms.py +73 -0
- wisent/synthetic/generators/diversities/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/core.py +68 -0
- wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
- wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
- wisent/synthetic/generators/pairs_generator.py +179 -0
- wisent-0.5.1.dist-info/METADATA +67 -0
- wisent-0.5.1.dist-info/RECORD +218 -0
- {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/WHEEL +1 -1
- {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info/licenses}/LICENSE +2 -2
- wisent/activations/__init__.py +0 -9
- wisent/activations/client.py +0 -97
- wisent/activations/extractor.py +0 -251
- wisent/activations/models.py +0 -95
- wisent/client.py +0 -45
- wisent/control_vector/__init__.py +0 -9
- wisent/control_vector/client.py +0 -85
- wisent/control_vector/manager.py +0 -168
- wisent/control_vector/models.py +0 -70
- wisent/inference/__init__.py +0 -9
- wisent/inference/client.py +0 -103
- wisent/inference/inferencer.py +0 -250
- wisent/inference/models.py +0 -66
- wisent/utils/__init__.py +0 -3
- wisent/utils/auth.py +0 -30
- wisent/utils/http.py +0 -228
- wisent/version.py +0 -3
- wisent-0.1.1.dist-info/METADATA +0 -142
- wisent-0.1.1.dist-info/RECORD +0 -23
- {wisent-0.1.1.dist-info ā wisent-0.5.1.dist-info}/top_level.txt +0 -0
wisent/core/steering_optimizer.py
@@ -0,0 +1,1297 @@
"""
Steering Parameter Optimizer for Wisent-Guard.

Optimizes steering-specific parameters including:
1. Optimal steering layer (may differ from classification layer)
2. Optimal steering strength and dynamics
3. Steering method selection and configuration
4. Task-specific steering parameter tuning

This module builds on top of classification optimization to find optimal
steering configurations for each model and task.
"""

import logging
import json
import time
import os
import numpy as np
from datetime import datetime
from typing import Dict, List, Tuple, Optional, Any, Union
from dataclasses import dataclass, asdict
from enum import Enum
from pathlib import Path

from .model_config_manager import ModelConfigManager

logger = logging.getLogger(__name__)


def get_default_steering_configs() -> List['SteeringMethodConfig']:
    """Get default steering method configurations with parameter variations."""
    return [
        # CAA variations
        SteeringMethodConfig(
            name="CAA",
            method=SteeringMethod.CAA,
            params={}
        ),
        SteeringMethodConfig(
            name="CAA_L2",
            method=SteeringMethod.CAA,
            params={"normalization_method": "l2_unit"}
        ),

        # HPR variations
        SteeringMethodConfig(
            name="HPR",
            method=SteeringMethod.HPR,
            params={"hpr_beta": 1.0}
        ),
        SteeringMethodConfig(
            name="HPR_Beta0.5",
            method=SteeringMethod.HPR,
            params={"hpr_beta": 0.5}
        ),

        # BiPO variations
        SteeringMethodConfig(
            name="BiPO",
            method=SteeringMethod.BIPO,
            params={"bipo_beta": 0.1, "bipo_epochs": 50}
        ),
        SteeringMethodConfig(
            name="BiPO_Beta0.05",
            method=SteeringMethod.BIPO,
            params={"bipo_beta": 0.05, "bipo_epochs": 50}
        ),

        # KSteering variations
        SteeringMethodConfig(
            name="KSteering",
            method=SteeringMethod.KSTEERING,
            params={
                "ksteering_alpha": 5.0,
                "ksteering_target_labels": "0",
                "ksteering_avoid_labels": ""
            }
        ),
        SteeringMethodConfig(
            name="KSteering_Alpha3",
            method=SteeringMethod.KSTEERING,
            params={
                "ksteering_alpha": 3.0,
                "ksteering_target_labels": "0",
                "ksteering_avoid_labels": ""
            }
        ),

        # DAC variations
        SteeringMethodConfig(
            name="DAC",
            method=SteeringMethod.DAC,
            params={
                "dac_dynamic_control": True,
                "dac_entropy_threshold": 1.0
            }
        ),
    ]


class SteeringMethod(Enum):
    """Available steering methods for optimization."""
    CAA = "CAA"
    HPR = "HPR"
    DAC = "DAC"
    BIPO = "BiPO"
    KSTEERING = "KSteering"


@dataclass
class SteeringMethodConfig:
    """Configuration for a specific steering method with parameter variations."""
    name: str  # Display name like "CAA_L2"
    method: SteeringMethod
    params: Dict[str, Any]  # Method-specific parameters

    def __post_init__(self):
        """Ensure method is SteeringMethod enum."""
        if isinstance(self.method, str):
            self.method = SteeringMethod(self.method)


@dataclass
class SteeringOptimizationResult:
    """Results from optimizing steering parameters for a single task."""
    task_name: str
    best_steering_layer: int
    best_steering_method: str
    best_steering_strength: float
    optimal_parameters: Dict[str, Any]  # Method-specific parameters
    steering_effectiveness_score: float  # How well steering changes outputs
    classification_accuracy_impact: float  # Impact on classification performance
    optimization_time_seconds: float
    total_configurations_tested: int
    error_message: Optional[str] = None


@dataclass
class SteeringOptimizationSummary:
    """Summary of steering optimization across tasks/methods."""
    model_name: str
    optimization_type: str  # "single_task", "multi_task", "method_comparison"
    total_configurations_tested: int
    optimization_time_minutes: float
    best_overall_method: str
    best_overall_layer: int
    best_overall_strength: float
    method_performance_ranking: Dict[str, float]  # method -> effectiveness score
    layer_effectiveness_analysis: Dict[int, float]  # layer -> avg effectiveness
    task_results: List[SteeringOptimizationResult]
    optimization_date: str


class SteeringOptimizer:
    """
    Framework for optimizing steering parameters.

    This class provides the structure for steering optimization but requires
    implementation of the actual optimization algorithms for each steering method.
    """

    def __init__(self, model_name: str, device: str = None, verbose: bool = False):
        """
        Initialize steering optimizer.

        Args:
            model_name: Name/path of the model to optimize steering for
            device: Device to run optimization on
            verbose: Enable verbose logging
        """
        self.model_name = model_name
        self.device = device
        self.verbose = verbose
        self.config_manager = ModelConfigManager()

        # Load classification parameters if available (steering often builds on classification)
        self.classification_config = self.config_manager.load_model_config(model_name)
        if self.classification_config:
            self.base_classification_layer = self.classification_config.get("optimal_parameters", {}).get("classification_layer")
            logger.info(f"Found existing classification layer: {self.base_classification_layer}")
        else:
            self.base_classification_layer = None
            logger.warning("No existing classification configuration found")

    def optimize_steering_method_comparison(
        self,
        task_name: str,
        methods_to_test: Optional[Union[List[SteeringMethod], List[SteeringMethodConfig]]] = None,
        layer_range: Optional[str] = None,
        strength_range: Optional[List[float]] = None,
        limit: int = 100,
        max_time_minutes: float = 30.0,
        split_ratio: float = 0.8
    ) -> SteeringOptimizationSummary:
        """
        Compare different steering methods to find the best one for a task.

        Args:
            task_name: Task to optimize steering for
            methods_to_test: List of steering methods to compare
            layer_range: Range of layers to test for steering
            strength_range: Range of steering strengths to test
            limit: Maximum samples for testing
            max_time_minutes: Maximum optimization time
            split_ratio: Train/test split ratio

        Returns:
            SteeringOptimizationSummary with method comparison results
        """
        # Handle both old-style method list and new config list
        if methods_to_test is None:
            # Use all default configurations
            method_configs = get_default_steering_configs()
        else:
            method_configs = []
            for item in methods_to_test:
                if isinstance(item, SteeringMethodConfig):
                    method_configs.append(item)
                elif isinstance(item, SteeringMethod):
                    # Convert simple method to config with default params
                    method_configs.append(SteeringMethodConfig(
                        name=item.value,
                        method=item,
                        params={}
                    ))
                elif isinstance(item, str):
                    # Convert string to SteeringMethod enum
                    try:
                        method = SteeringMethod(item)
                        method_configs.append(SteeringMethodConfig(
                            name=method.value,
                            method=method,
                            params={}
                        ))
                    except ValueError:
                        logger.warning(f"Unknown steering method: {item}")
                else:
                    logger.warning(f"Unknown method type: {type(item)}, value: {item}")

        if strength_range is None:
            strength_range = [0.5, 1.0, 1.5, 2.0]

        logger.info(f"Comparing {len(method_configs)} steering method configurations for task: {task_name}")

        start_time = time.time()
        task_results = []
        all_results = {}

        # Determine layer search range
        if layer_range:
            layers_to_test = self._parse_layer_range(layer_range)
        elif self.base_classification_layer:
            # Search around classification layer
            min_layer = max(1, self.base_classification_layer - 2)
            max_layer = min(32, self.base_classification_layer + 2)  # Assume max 32 layers
            layers_to_test = list(range(min_layer, max_layer + 1))
        else:
            # Default range for common models
            layers_to_test = [10, 12, 14, 16, 18, 20]

        configurations_tested = 0
        best_overall_score = 0.0
        best_overall_config = None

        # Test each method configuration
        for method_config in method_configs:
            method_results = []

            for layer in layers_to_test:
                for strength in strength_range:
                    if time.time() - start_time > max_time_minutes * 60:
                        logger.warning(f"Time limit reached, stopping optimization")
                        break

                    try:
                        # Run evaluation for this configuration
                        score = self._evaluate_steering_configuration(
                            task_name=task_name,
                            method=method_config.method,
                            layer=layer,
                            strength=strength,
                            limit=limit,
                            split_ratio=split_ratio,
                            method_params=method_config.params
                        )

                        configurations_tested += 1
                        config_result = {
                            'method': method_config.name,  # Use display name
                            'method_type': method_config.method.value,
                            'layer': layer,
                            'strength': strength,
                            'score': score,
                            'params': method_config.params
                        }
                        method_results.append(config_result)

                        if score > best_overall_score:
                            best_overall_score = score
                            best_overall_config = config_result

                        if self.verbose:
                            logger.info(f" {method_config.name} L{layer} S{strength}: {score:.3f}")

                    except Exception as e:
                        logger.error(f" Error testing {method_config.name} L{layer} S{strength}: {e}")

            all_results[method_config.name] = method_results

        # Analyze results
        method_performance = {}
        layer_effectiveness = {}

        for method, results in all_results.items():
            if results:
                scores = [r['score'] for r in results]
                method_performance[method] = max(scores)

                # Aggregate by layer
                for result in results:
                    layer = result['layer']
                    if layer not in layer_effectiveness:
                        layer_effectiveness[layer] = []
                    layer_effectiveness[layer].append(result['score'])

        # Average layer effectiveness
        for layer in layer_effectiveness:
            layer_effectiveness[layer] = sum(layer_effectiveness[layer]) / len(layer_effectiveness[layer])

        # Create optimization result
        optimization_time = time.time() - start_time

        if best_overall_config:
            result = SteeringOptimizationResult(
                task_name=task_name,
                best_steering_layer=best_overall_config['layer'],
                best_steering_method=best_overall_config['method'],
                best_steering_strength=best_overall_config['strength'],
                optimal_parameters={
                    'split_ratio': split_ratio,
                    'limit': limit,
                    **best_overall_config.get('params', {})
                },
                steering_effectiveness_score=best_overall_config['score'],
                classification_accuracy_impact=0.0,  # Not measured here
                optimization_time_seconds=optimization_time,
                total_configurations_tested=configurations_tested
            )
            task_results.append(result)

        summary = SteeringOptimizationSummary(
            model_name=self.model_name,
            optimization_type="method_comparison",
            total_configurations_tested=configurations_tested,
            optimization_time_minutes=optimization_time / 60,
            best_overall_method=best_overall_config['method'] if best_overall_config else "none",
            best_overall_layer=best_overall_config['layer'] if best_overall_config else 0,
            best_overall_strength=best_overall_config['strength'] if best_overall_config else 0.0,
            method_performance_ranking=method_performance,
            layer_effectiveness_analysis=layer_effectiveness,
            task_results=task_results,
            optimization_date=datetime.now().isoformat()
        )

        # Save the results
        self._save_steering_optimization_results(summary)

        return summary

    def optimize_steering_layer(
        self,
        task_name: str,
        steering_method: SteeringMethod = SteeringMethod.CAA,
        layer_search_range: Optional[Tuple[int, int]] = None,
        strength: float = 1.0,
        limit: int = 100
    ) -> SteeringOptimizationResult:
        """
        Find optimal steering layer for a specific method and task.

        Args:
            task_name: Task to optimize for
            steering_method: Steering method to use
            layer_search_range: (min_layer, max_layer) to search
            strength: Fixed steering strength to use during layer search
            limit: Maximum samples for testing

        Returns:
            SteeringOptimizationResult with optimal layer
        """
        logger.info(f"Optimizing steering layer for {task_name} using {steering_method.value}")

        if layer_search_range is None:
            # Default: search around classification layer if available
            if self.base_classification_layer:
                min_layer = max(1, self.base_classification_layer - 3)
                max_layer = self.base_classification_layer + 3
                layer_search_range = (min_layer, max_layer)
            else:
                # TODO: Auto-detect model layer count and use reasonable range
                layer_search_range = (10, 20)  # Default fallback

        # TODO: Implement layer optimization logic
        raise NotImplementedError(
            "Steering layer optimization not yet implemented. "
            "This requires implementing steering vector training and "
            "effectiveness measurement across different layers."
        )

    def optimize_steering_strength(
        self,
        task_name: str,
        steering_method: SteeringMethod = SteeringMethod.CAA,
        layer: Optional[int] = None,
        strength_range: Optional[Tuple[float, float]] = None,
        strength_steps: int = 10,
        limit: int = 100,
        method_params: Optional[Dict[str, Any]] = None
    ) -> SteeringOptimizationResult:
        """
        Find optimal steering strength for a specific method, layer, and task.

        Args:
            task_name: Task to optimize for
            steering_method: Steering method to use
            layer: Steering layer to use (defaults to classification layer)
            strength_range: (min_strength, max_strength) to search
            strength_steps: Number of strength values to test
            limit: Maximum samples for testing

        Returns:
            SteeringOptimizationResult with optimal strength
        """
        if layer is None:
            layer = self.base_classification_layer or 15  # Default fallback

        if strength_range is None:
            strength_range = (0.1, 2.0)  # Default strength range

        logger.info(f"Optimizing steering strength for {task_name}")
        logger.info(f" Method: {steering_method.value}, Layer: {layer}")
        logger.info(f" Strength range: {strength_range}, Steps: {strength_steps}")

        # Load steering parameters from config
        import json
        import os
        config_path = os.path.join(os.path.dirname(__file__), '..', '..', 'steering_optimization_parameters.json')
        if os.path.exists(config_path):
            with open(config_path, 'r') as f:
                steering_config = json.load(f)
        else:
            steering_config = {}

        # Get default layer if not provided
        if layer is None:
            layer = self._get_classification_layer(task_name)
            logger.info(f" Using classification layer: {layer}")

        # Default strength range from config or fallback
        if strength_range is None:
            default_strengths = steering_config.get('steering_strengths', {}).get('default', [0.5, 1.0, 1.5, 2.0])
            strength_range = (min(default_strengths), max(default_strengths))

        # Generate strength values to test
        import numpy as np
        strengths = np.linspace(strength_range[0], strength_range[1], strength_steps)

        # Get method-specific parameters from config if not provided
        if method_params is None:
            method_configs = steering_config.get('steering_methods', [])
            for config in method_configs:
                if config['method'] == steering_method.value and config['name'] == steering_method.value:
                    method_params = config.get('params', {})
                    break
            if method_params is None:
                method_params = {}

        results = []
        best_score = -float('inf')
        best_strength = 0.0

        logger.info(f" Testing {len(strengths)} strength values...")

        # Test each strength value
        for strength in strengths:
            try:
                # Run evaluation with this strength
                from ..cli import run_task_pipeline

                # Build kwargs for run_task_pipeline
                pipeline_kwargs = {
                    'task_name': task_name,
                    'model_name': self.model_name,
                    'limit': limit,
                    'device': self.device,
                    'verbose': False,
                    'steering_mode': True,
                    'steering_method': steering_method.value,
                    'steering_strength': float(strength),
                    'layer': str(layer),
                    'output_mode': "likelihoods",  # Get likelihoods for evaluation
                    'allow_small_dataset': True
                }

                # Add method-specific parameters
                for param, value in method_params.items():
                    pipeline_kwargs[param] = value

                result = run_task_pipeline(**pipeline_kwargs)

                # Extract score from evaluation results
                score = 0.0
                steering_effect = 0.0
                accuracy = 0.0

                if isinstance(result, dict):
                    # Get evaluation results - try both nested and direct access
                    eval_results = None

                    # First try: result[task_name]['evaluation_results']
                    task_result = result.get(task_name, {})
                    if 'evaluation_results' in task_result:
                        eval_results = task_result['evaluation_results']
                    # Second try: result['evaluation_results'] (direct from run_task_pipeline)
                    elif 'evaluation_results' in result:
                        eval_results = result['evaluation_results']
                    else:
                        eval_results = {}

                    # Get accuracy score (but don't use it as the primary metric)
                    accuracy = eval_results.get('accuracy', 0.0)
                    if isinstance(accuracy, str) or accuracy is None:
                        accuracy = 0.0

                    # Calculate steering effect from likelihood changes
                    baseline_likes = eval_results.get('baseline_likelihoods', [])
                    steered_likes = eval_results.get('steered_likelihoods', [])

                    if self.verbose:
                        logger.debug(f" Found {len(baseline_likes)} baseline and {len(steered_likes)} steered likelihoods")
                        if baseline_likes and len(baseline_likes) > 0:
                            logger.debug(f" First few baseline likes: {baseline_likes[:3]}")
                            logger.debug(f" First few steered likes: {steered_likes[:3]}")
                        logger.debug(f" Full eval_results keys: {list(eval_results.keys())}")
                        logger.debug(f" Accuracy value: {eval_results.get('accuracy')}")

                        # Check if we're getting the right data structure
                        if isinstance(result, dict):
                            logger.debug(f" Result keys: {list(result.keys())}")
                            if 'evaluation_results' in result:
                                logger.debug(f" Direct evaluation_results found")

                    if baseline_likes and steered_likes:
                        # Filter out inf and nan values
                        valid_pairs = []
                        for b, s in zip(baseline_likes, steered_likes):
                            if np.isfinite(b) and np.isfinite(s):
                                valid_pairs.append((b, s))

                        if valid_pairs:
                            changes = [abs(s - b) for b, s in valid_pairs]
                            steering_effect = sum(changes) / len(changes) if changes else 0.0

                            # Cap steering effect to prevent infinity
                            steering_effect = min(steering_effect, 100.0)

                            # Also calculate how many preferences changed
                            preference_changes = 0
                            for i in range(0, len(baseline_likes), 2):  # Assuming binary choices
                                if i+1 < len(baseline_likes):
                                    if np.isfinite(baseline_likes[i]) and np.isfinite(baseline_likes[i+1]) and \
                                       np.isfinite(steered_likes[i]) and np.isfinite(steered_likes[i+1]):
                                        baseline_pref = 0 if baseline_likes[i] > baseline_likes[i+1] else 1
                                        steered_pref = 0 if steered_likes[i] > steered_likes[i+1] else 1
                                        if baseline_pref != steered_pref:
                                            preference_changes += 1

                            # Use steering effect as the primary score
                            score = steering_effect

                            # Add bonus if accuracy is valid and good
                            if np.isfinite(accuracy) and accuracy > 0.5:
                                score += accuracy * 0.5
                        else:
                            # No valid likelihood pairs
                            score = 0.0
                            steering_effect = 0.0
                    else:
                        # Fallback to accuracy if no likelihood data
                        score = accuracy if np.isfinite(accuracy) else 0.0

                results.append({
                    'strength': float(strength),
                    'score': score,
                    'steering_effect': steering_effect,
                    'evaluation_results': eval_results if isinstance(result, dict) else {}
                })

                if score > best_score:
                    best_score = score
                    best_strength = float(strength)

                logger.info(f" Strength {strength:.2f}: score={score:.3f}, effect={steering_effect:.3f}, accuracy={accuracy:.3f}")

            except Exception as e:
                logger.error(f" Error testing strength {strength}: {e}")
                results.append({
                    'strength': float(strength),
                    'score': 0.0,
                    'error': str(e)
                })

        return SteeringOptimizationResult(
            task_name=task_name,
            best_steering_layer=layer,
            best_steering_method=steering_method.value,
            best_steering_strength=best_strength,
            optimal_parameters={'strength': best_strength},
            steering_effectiveness_score=best_score,
            classification_accuracy_impact=best_score,  # Using same score for now
            optimization_time_seconds=0.0,  # TODO: Track actual time
            total_configurations_tested=len(results),
            error_message=None
        )

    def optimize_method_specific_parameters(
        self,
        task_name: str,
        steering_method: SteeringMethod,
        base_layer: Optional[int] = None,
        base_strength: float = 1.0,
        limit: int = 100
    ) -> SteeringOptimizationResult:
        """
        Optimize method-specific parameters for a steering approach.

        Args:
            task_name: Task to optimize for
            steering_method: Specific steering method to optimize
            base_layer: Base steering layer to use
            base_strength: Base steering strength to use
            limit: Maximum samples for testing

        Returns:
            SteeringOptimizationResult with optimized method parameters
        """
        logger.info(f"Optimizing {steering_method.value}-specific parameters for {task_name}")

        if steering_method == SteeringMethod.CAA:
            return self._optimize_caa_parameters(task_name, base_layer, base_strength, limit)
        elif steering_method == SteeringMethod.HPR:
            return self._optimize_hpr_parameters(task_name, base_layer, base_strength, limit)
        elif steering_method == SteeringMethod.DAC:
            return self._optimize_dac_parameters(task_name, base_layer, base_strength, limit)
        elif steering_method == SteeringMethod.BIPO:
            return self._optimize_bipo_parameters(task_name, base_layer, base_strength, limit)
        elif steering_method == SteeringMethod.KSTEERING:
            return self._optimize_ksteering_parameters(task_name, base_layer, base_strength, limit)
        else:
            raise ValueError(f"Unknown steering method: {steering_method}")

    def _optimize_caa_parameters(
        self,
        task_name: str,
        layer: Optional[int],
        strength: float,
        limit: int
    ) -> SteeringOptimizationResult:
        """Optimize CAA (Concept Activation Analysis) specific parameters."""
        # TODO: Implement CAA parameter optimization
        # CAA typically doesn't have many hyperparameters beyond layer/strength
        # but may include normalization options, vector aggregation methods, etc.
        raise NotImplementedError("CAA parameter optimization not yet implemented")

    def _optimize_hpr_parameters(
        self,
        task_name: str,
        layer: Optional[int],
        strength: float,
        limit: int
    ) -> SteeringOptimizationResult:
        """Optimize HPR (Householder Pseudo-Rotation) specific parameters."""
        # TODO: Implement HPR parameter optimization
        # HPR has beta parameter and potentially rotation-specific settings
        raise NotImplementedError("HPR parameter optimization not yet implemented")

    def _optimize_dac_parameters(
        self,
        task_name: str,
        layer: Optional[int],
        strength: float,
        limit: int
    ) -> SteeringOptimizationResult:
        """Optimize DAC (Dynamic Activation Composition) specific parameters."""
        # TODO: Implement DAC parameter optimization
        # DAC has dynamic control settings, entropy thresholds, etc.
        raise NotImplementedError("DAC parameter optimization not yet implemented")

    def _optimize_bipo_parameters(
        self,
        task_name: str,
        layer: Optional[int],
        strength: float,
        limit: int
    ) -> SteeringOptimizationResult:
        """Optimize BiPO (Bi-directional Preference Optimization) specific parameters."""
        # TODO: Implement BiPO parameter optimization
        # BiPO has learning rate, beta, epochs, and other training-specific parameters
        raise NotImplementedError("BiPO parameter optimization not yet implemented")

    def _optimize_ksteering_parameters(
        self,
        task_name: str,
        layer: Optional[int],
        strength: float,
        limit: int
    ) -> SteeringOptimizationResult:
        """Optimize K-Steering specific parameters."""
        # TODO: Implement K-Steering parameter optimization
        # K-Steering has many parameters: num_labels, hidden_dim, learning_rate,
        # classifier_epochs, target/avoid labels, alpha, etc.
        raise NotImplementedError("K-Steering parameter optimization not yet implemented")

    def run_comprehensive_steering_optimization(
        self,
        tasks: Optional[List[str]] = None,
        methods: Optional[List[SteeringMethod]] = None,
        limit: int = 100,
        max_time_per_task_minutes: float = 20.0,
        save_results: bool = True
    ) -> SteeringOptimizationSummary:
        """
        Run comprehensive steering optimization across multiple tasks and methods.

        Args:
            tasks: List of tasks to optimize (if None, uses classification-optimized tasks)
            methods: List of steering methods to test
            limit: Sample limit per task
            max_time_per_task_minutes: Time limit per task
            save_results: Whether to save results to config

        Returns:
            SteeringOptimizationSummary with comprehensive results
        """
        logger.info(f"Starting comprehensive steering optimization")

        if tasks is None:
            # Use tasks that were successfully optimized for classification
            if self.classification_config:
                task_overrides = self.classification_config.get("task_specific_overrides", {})
                tasks = list(task_overrides.keys())
                if not tasks:
                    logger.warning("No classification-optimized tasks found, using default task set")
                    tasks = ["truthfulqa_mc1", "gsm8k", "squad2"]  # Default fallback
            else:
                tasks = ["truthfulqa_mc1", "gsm8k", "squad2"]  # Default fallback

        if methods is None:
            methods = [SteeringMethod.CAA, SteeringMethod.HPR]  # Start with simpler methods

        logger.info(f"Tasks: {tasks}")
        logger.info(f"Methods: {[method.value for method in methods]}")

        # TODO: Implement comprehensive optimization loop
        # This should:
        # 1. For each task and method combination
        # 2. Find optimal layer, strength, and method-specific parameters
        # 3. Measure steering effectiveness vs classification accuracy tradeoff
        # 4. Aggregate results and find best overall parameters
        # 5. Save task-specific steering configurations

        raise NotImplementedError(
            "Comprehensive steering optimization not yet implemented. "
            "This requires implementing all the individual optimization methods "
            "and result aggregation logic."
        )

    def _parse_layer_range(self, layer_range: str) -> List[int]:
        """Parse layer range string like '10-20' or '10,12,14'."""
        if '-' in layer_range:
            start, end = map(int, layer_range.split('-'))
            return list(range(start, end + 1))
        elif ',' in layer_range:
            return [int(x.strip()) for x in layer_range.split(',')]
        else:
            return [int(layer_range)]

    def _evaluate_steering_configuration(
        self,
        task_name: str,
        method: SteeringMethod,
        layer: int,
        strength: float,
        limit: int,
        split_ratio: float,
        method_params: Optional[Dict[str, Any]] = None
    ) -> float:
        """
        Evaluate a single steering configuration and return its effectiveness score.

        Args:
            method_params: Additional method-specific parameters

        Returns:
            Effectiveness score (0.0 to 1.0)
        """
        try:
            # Import CLI runner to test configuration
            from wisent_guard.cli import run_task_pipeline

            # Prepare kwargs with method-specific parameters
            kwargs = {
                'task_name': task_name,
                'model_name': self.model_name,
                'layer': str(layer),
                'limit': limit,
                'steering_mode': True,
                'steering_method': method.value,
                'steering_strength': strength,
                'split_ratio': split_ratio,
                'device': self.device,
                'verbose': False,
                'allow_small_dataset': True
            }

            # Add method-specific parameters
            if method_params:
                # Map parameter names to CLI argument names
                param_mapping = {
                    'normalization_method': 'normalization_method',
                    'hpr_beta': 'hpr_beta',
                    'dac_dynamic_control': 'dac_dynamic_control',
                    'dac_entropy_threshold': 'dac_entropy_threshold',
                    'bipo_beta': 'bipo_beta',
                    'bipo_epochs': 'bipo_epochs',
                    'ksteering_alpha': 'ksteering_alpha',
                    'ksteering_target_labels': 'ksteering_target_labels',
                    'ksteering_avoid_labels': 'ksteering_avoid_labels'
                }

                for param_key, param_value in method_params.items():
                    if param_key in param_mapping:
                        kwargs[param_mapping[param_key]] = param_value

            # Run steering evaluation
            result = run_task_pipeline(**kwargs)

            # Extract evaluation score
            # Priority: accuracy > likelihood change > 0.0
            if 'accuracy' in result and result['accuracy'] != 'N/A':
                return float(result['accuracy'])
            elif 'evaluation_results' in result:
                eval_results = result['evaluation_results']
                if 'accuracy' in eval_results and eval_results['accuracy'] != 'N/A':
                    return float(eval_results['accuracy'])
                # Could also use likelihood changes as a metric

            return 0.0

        except Exception as e:
            logger.error(f"Configuration evaluation failed: {e}")
            return 0.0

    def _save_steering_optimization_results(self, summary: SteeringOptimizationSummary):
        """Save optimization results to configuration."""
        config = self.config_manager.load_model_config(self.model_name) or {
            'model_name': self.model_name,
            'created_date': datetime.now().isoformat(),
            'config_version': '2.0'
        }

        # Add steering optimization results
        if 'steering_optimization' not in config:
            config['steering_optimization'] = {}

        # Save overall best configuration
        config['steering_optimization']['best_method'] = summary.best_overall_method
        config['steering_optimization']['best_layer'] = summary.best_overall_layer
        config['steering_optimization']['best_strength'] = summary.best_overall_strength
        config['steering_optimization']['optimization_date'] = summary.optimization_date
        config['steering_optimization']['method_ranking'] = summary.method_performance_ranking

        # Save task-specific results
        if 'task_specific_steering' not in config:
            config['task_specific_steering'] = {}

        for task_result in summary.task_results:
            config['task_specific_steering'][task_result.task_name] = {
                'method': task_result.best_steering_method,
                'layer': task_result.best_steering_layer,
                'strength': task_result.best_steering_strength,
                'score': task_result.steering_effectiveness_score,
                'parameters': task_result.optimal_parameters
            }

        # Update configuration
        self.config_manager.update_model_config(self.model_name, config)
        logger.info(f"Steering optimization results saved for {self.model_name}")

    def load_optimal_steering_config(self, task_name: Optional[str] = None) -> Optional[Dict[str, Any]]:
        """
        Load optimal steering configuration for a model/task.

        Args:
            task_name: Optional task name for task-specific configuration

        Returns:
            Dictionary with optimal steering parameters or None
        """
        config = self.config_manager.load_model_config(self.model_name)
        if not config:
            return None

        # Check for task-specific configuration first
        if task_name and 'task_specific_steering' in config:
            task_config = config['task_specific_steering'].get(task_name)
            if task_config:
                return task_config

        # Fall back to overall best configuration
        if 'steering_optimization' in config:
            steering_opt = config['steering_optimization']
            return {
                'method': steering_opt.get('best_method'),
                'layer': steering_opt.get('best_layer'),
                'strength': steering_opt.get('best_strength')
            }

        return None

    def evaluate_steering_effectiveness(
        self,
        task_name: str,
        steering_method: SteeringMethod,
        layer: int,
        strength: float,
        method_params: Dict[str, Any],
        test_samples: List[Dict[str, Any]]
    ) -> Dict[str, float]:
        """
        Evaluate how effectively steering changes model outputs.

        Args:
            task_name: Task being evaluated
            steering_method: Steering method being used
            layer: Steering layer
            strength: Steering strength
            method_params: Method-specific parameters
            test_samples: Test samples to evaluate on

        Returns:
            Dictionary with effectiveness metrics
        """
        # Use the internal evaluation method
        score = self._evaluate_steering_configuration(
            task_name=task_name,
            method=steering_method,
            layer=layer,
            strength=strength,
            limit=len(test_samples),
            split_ratio=0.8
        )

        return {
            'effectiveness_score': score,
            'accuracy': score,  # For now, use the same score
            'consistency': 1.0 if score > 0.5 else 0.5,
            'direction_accuracy': score
        }
|
|
970
|
+
|
|
971
|
+
|
|
972
|
+
# Convenience functions for CLI integration
def run_steering_optimization(
    model_name: str,
    optimization_type: str = "auto",
    task_name: Optional[str] = None,
    limit: int = 100,
    device: Optional[str] = None,
    verbose: bool = False,
    use_classification_config: bool = True,
    **kwargs
) -> Union[SteeringOptimizationResult, SteeringOptimizationSummary, Dict[str, Any]]:
    """
    Convenience function to run steering optimization.

    Args:
        model_name: Model to optimize steering for
        optimization_type: Type of optimization ("auto", "method_comparison", "layer", "strength", "comprehensive")
        task_name: Task to optimize for (if None and optimization_type="auto", uses all classification-optimized tasks)
        limit: Sample limit
        device: Device to use
        verbose: Enable verbose logging
        use_classification_config: Whether to use the existing classification config as a starting point
        **kwargs: Additional arguments for specific optimization types

    Returns:
        SteeringOptimizationResult, SteeringOptimizationSummary, or auto-optimization results
    """
    optimizer = SteeringOptimizer(
        model_name=model_name,
        device=device,
        verbose=verbose
    )

    if optimization_type == "auto":
        # Automatic optimization based on the classification config
        return run_auto_steering_optimization(
            model_name=model_name,
            task_name=task_name,
            limit=limit,
            device=device,
            verbose=verbose,
            use_classification_config=use_classification_config,
            **kwargs
        )
    elif optimization_type == "method_comparison":
        if not task_name:
            raise ValueError("task_name required for method comparison")
        return optimizer.optimize_steering_method_comparison(
            task_name=task_name,
            limit=limit,
            **kwargs
        )
    elif optimization_type == "layer":
        if not task_name:
            raise ValueError("task_name required for layer optimization")

        # Convert string steering_method to enum if needed
        if 'steering_method' in kwargs and isinstance(kwargs['steering_method'], str):
            kwargs['steering_method'] = SteeringMethod[kwargs['steering_method']]

        return optimizer.optimize_steering_layer(
            task_name=task_name,
            limit=limit,
            **kwargs
        )
    elif optimization_type == "strength":
        if not task_name:
            raise ValueError("task_name required for strength optimization")

        # Convert string steering_method to enum if needed
        if 'steering_method' in kwargs and isinstance(kwargs['steering_method'], str):
            kwargs['steering_method'] = SteeringMethod[kwargs['steering_method']]

        return optimizer.optimize_steering_strength(
            task_name=task_name,
            limit=limit,
            **kwargs
        )
    elif optimization_type == "comprehensive":
        return optimizer.run_comprehensive_steering_optimization(
            limit=limit,
            **kwargs
        )
    else:
        raise ValueError(f"Unknown optimization type: {optimization_type}")


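# --- Editor's illustrative usage sketch (not part of the wisent package) ----
# A minimal sketch of driving the convenience wrapper above for a layer search.
# The model id and task name below are placeholders, not values shipped with
# the package; substitute whatever model/task pair you actually use.
def _example_layer_search():  # hypothetical helper, documentation only
    # The string steering_method is converted to SteeringMethod.CAA by the
    # wrapper before it is forwarded to optimize_steering_layer().
    return run_steering_optimization(
        model_name="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model id
        optimization_type="layer",
        task_name="truthfulqa_mc1",                     # placeholder task name
        limit=50,
        verbose=True,
        steering_method="CAA",
    )

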
def run_auto_steering_optimization(
    model_name: str,
    task_name: Optional[str] = None,
    limit: int = 100,
    device: Optional[str] = None,
    verbose: bool = False,
    use_classification_config: bool = True,
    max_time_minutes: float = 60.0,
    methods_to_test: Optional[List[str]] = None,
    strength_range: Optional[List[float]] = None,
    layer_range: Optional[str] = None
) -> Dict[str, Any]:
    """
    Automatically optimize the steering configuration.

    This function works either standalone or by building on an existing classification config.

    Args:
        model_name: Model to optimize
        task_name: Specific task to optimize (required if no classification config)
        limit: Sample limit per evaluation
        device: Device to use
        verbose: Enable verbose logging
        use_classification_config: Use the classification layer as a starting point
        max_time_minutes: Maximum time for optimization
        methods_to_test: List of steering method names to test (defaults to all default steering configurations)
        strength_range: List of strengths to test (defaults to [0.5, 1.0, 1.5, 2.0])
        layer_range: Explicit layer range to search (e.g. "0-5" or "0,2,4")

    Returns:
        Dictionary with optimization results and saved configuration paths
    """
    optimizer = SteeringOptimizer(
        model_name=model_name,
        device=device,
        verbose=verbose
    )

    # Load the classification config if requested
    config_manager = ModelConfigManager()
    classification_config = None
    if use_classification_config:
        classification_config = config_manager.load_model_config(model_name)
        if not classification_config:
            logger.info("No classification config found, proceeding with standalone steering optimization")

    # Determine the tasks to optimize
    tasks_to_optimize: List[str] = []  # initialized up front so the fallback checks below are safe
    if task_name:
        tasks_to_optimize = [task_name]
    elif classification_config:
        # First try task-specific overrides
        if 'task_specific_overrides' in classification_config:
            tasks_to_optimize = list(classification_config['task_specific_overrides'].keys())

        # If there are no task-specific overrides, check for tasks from optimization metrics
        if not tasks_to_optimize and 'optimization_metrics' in classification_config:
            # Try to get tasks from the sample sizes
            if 'optimal_sample_sizes' in classification_config:
                tasks_to_optimize = list(classification_config['optimal_sample_sizes'].keys())

        if not tasks_to_optimize:
            return {"error": "No task specified and no classification tasks found in config"}
    else:
        # Require an explicit task name if there is no classification config
        return {"error": "Task name required when not using classification config"}

    # Default methods and strengths
    if methods_to_test is None:
        # Use all default configurations
        method_configs = get_default_steering_configs()
    else:
        # Convert string method names to configs
        method_configs = []
        for method in methods_to_test:
            if method == "CAA":
                # Add both CAA variations
                method_configs.append(SteeringMethodConfig("CAA", SteeringMethod.CAA, {}))
                method_configs.append(SteeringMethodConfig("CAA_L2", SteeringMethod.CAA, {"normalization_method": "l2_unit"}))
            elif method == "HPR":
                # Add both HPR variations
                method_configs.append(SteeringMethodConfig("HPR", SteeringMethod.HPR, {"hpr_beta": 1.0}))
                method_configs.append(SteeringMethodConfig("HPR_Beta0.5", SteeringMethod.HPR, {"hpr_beta": 0.5}))
            elif method == "DAC":
                method_configs.append(SteeringMethodConfig("DAC", SteeringMethod.DAC, {"dac_dynamic_control": True, "dac_entropy_threshold": 1.0}))
            elif method == "BiPO":
                # Add both BiPO variations
                method_configs.append(SteeringMethodConfig("BiPO", SteeringMethod.BIPO, {"bipo_beta": 0.1, "bipo_epochs": 50}))
                method_configs.append(SteeringMethodConfig("BiPO_Beta0.05", SteeringMethod.BIPO, {"bipo_beta": 0.05, "bipo_epochs": 50}))
            elif method == "KSteering":
                # Add both KSteering variations
                method_configs.append(SteeringMethodConfig("KSteering", SteeringMethod.KSTEERING, {"ksteering_alpha": 5.0, "ksteering_target_labels": "0", "ksteering_avoid_labels": ""}))
                method_configs.append(SteeringMethodConfig("KSteering_Alpha3", SteeringMethod.KSTEERING, {"ksteering_alpha": 3.0, "ksteering_target_labels": "0", "ksteering_avoid_labels": ""}))
            else:
                logger.warning(f"Unknown steering method: {method}")

    if strength_range is None:
        strength_range = [0.5, 1.0, 1.5, 2.0]

    if verbose:
        logger.info("Starting automatic steering optimization")
        logger.info(f"   Model: {model_name}")
        logger.info(f"   Tasks: {tasks_to_optimize}")
        logger.info(f"   Method configurations: {[cfg.name for cfg in method_configs]}")
        logger.info(f"   Time limit: {max_time_minutes} minutes")

    results = {
        'model_name': model_name,
        'optimization_date': datetime.now().isoformat(),
        'tasks_optimized': [],
        'overall_best': None,
        'config_saved': False
    }

    # Optimize each task
    time_per_task = max_time_minutes / len(tasks_to_optimize)

    for task in tasks_to_optimize:
        if verbose:
            logger.info(f"\nOptimizing steering for task: {task}")

        # Determine the layer range
        task_layer_range = layer_range  # Use the provided layer range if any
        if not task_layer_range and classification_config and use_classification_config:
            # Only use the classification config if no explicit layer range was provided
            task_overrides = classification_config.get('task_specific_overrides', {}).get(task, {})
            class_layer = task_overrides.get('classification_layer')

            if not class_layer:
                # Use the global classification layer
                class_layer = classification_config.get('optimal_parameters', {}).get('classification_layer')

            if class_layer:
                # Search around the classification layer
                task_layer_range = f"{max(0, class_layer-2)}-{class_layer+2}"
                if verbose:
                    logger.info(f"   Using layer range around classification layer {class_layer}: {task_layer_range}")

        # If there is still no layer range, use a default
        if not task_layer_range:
            # Default to searching early to middle layers
            task_layer_range = "0-5"
            if verbose:
                logger.info(f"   Using default layer range: {task_layer_range}")

        # Run optimization for this task
        try:
            summary = optimizer.optimize_steering_method_comparison(
                task_name=task,
                methods_to_test=method_configs,
                layer_range=task_layer_range,
                strength_range=strength_range,
                limit=limit,
                max_time_minutes=time_per_task
            )

            # Store results
            task_result = {
                'task': task,
                'best_method': summary.best_overall_method,
                'best_layer': summary.best_overall_layer,
                'best_strength': summary.best_overall_strength,
                'score': summary.task_results[0].steering_effectiveness_score if summary.task_results else 0.0
            }
            results['tasks_optimized'].append(task_result)

            # Update the overall best
            if not results['overall_best'] or task_result['score'] > results['overall_best']['score']:
                results['overall_best'] = task_result

        except Exception as e:
            logger.error(f"Failed to optimize task {task}: {e}")
            results['tasks_optimized'].append({
                'task': task,
                'error': str(e)
            })

    # Save configuration
    if results['tasks_optimized'] and not any('error' in r for r in results['tasks_optimized']):
        results['config_saved'] = True
        results['config_path'] = config_manager._get_config_path(model_name)

        if verbose:
            logger.info("\nSteering optimization complete!")
            logger.info(f"   Configuration saved to: {results['config_path']}")
            logger.info(f"   Overall best: {results['overall_best']['best_method']} "
                        f"L{results['overall_best']['best_layer']} "
                        f"S{results['overall_best']['best_strength']}")

    return results


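# --- Editor's illustrative usage sketch (not part of the wisent package) ----
# A sketch of a fully automatic run that restricts the search space explicitly
# instead of deriving it from a saved classification config. All concrete
# values (model id, task, ranges, time budget) are placeholders; the keyword
# names mirror the parameters documented above.
def _example_auto_optimization():  # hypothetical helper, documentation only
    return run_auto_steering_optimization(
        model_name="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model id
        task_name="truthfulqa_mc1",                     # placeholder task name
        limit=100,
        verbose=True,
        use_classification_config=False,   # run standalone, no saved config needed
        max_time_minutes=30.0,
        methods_to_test=["CAA", "HPR"],    # expanded into the CAA/HPR variations above
        strength_range=[0.5, 1.0, 2.0],
        layer_range="8-12",                # explicit range skips the config lookup
    )

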
def get_optimal_steering_params(
    model_name: str,
    task_name: Optional[str] = None
) -> Optional[Dict[str, Any]]:
    """
    Get optimal steering parameters for a model/task.

    Args:
        model_name: Model name
        task_name: Optional task name for task-specific params

    Returns:
        Dictionary with steering parameters or None
    """
    optimizer = SteeringOptimizer(model_name)
    return optimizer.load_optimal_steering_config(task_name)


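# --- Editor's illustrative usage sketch (not part of the wisent package) ----
# Reading back a previously optimized configuration. Returns None when no
# steering parameters have been stored for the model/task; the model id and
# task name below are placeholders.
def _example_load_params():  # hypothetical helper, documentation only
    params = get_optimal_steering_params(
        "meta-llama/Llama-3.1-8B-Instruct",   # placeholder model id
        task_name="truthfulqa_mc1",           # placeholder task name
    )
    if params is None:
        logger.info("No stored steering configuration found")
    return params

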
# TODO: Integration with existing steering methods
#
# The following integration points need to be implemented:
#
# 1. CAA Integration:
#    - Load existing CAA implementation from wisent_guard.core.steering_methods.caa
#    - Implement parameter optimization for CAA vectors
#    - Measure CAA steering effectiveness
#
# 2. HPR Integration:
#    - Load HPR implementation and optimize beta parameter
#    - Test rotation effectiveness across different layers
#
# 3. DAC Integration:
#    - Optimize dynamic control parameters and entropy thresholds
#    - Test adaptive steering strength adjustment
#
# 4. BiPO Integration:
#    - Optimize learning parameters for preference-based steering
#    - Implement bi-directional steering evaluation
#
# 5. K-Steering Integration:
#    - Optimize classifier parameters and label configurations
#    - Test multi-label steering effectiveness
#
# 6. Effectiveness Metrics:
#    - Implement steering strength measurement
#    - Develop steering direction accuracy metrics
#    - Create steering consistency evaluation
#    - Measure classification accuracy preservation
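# --- Editor's illustrative sketch for item 6 (not part of the wisent package) ---
# One possible shape for an effectiveness-metric helper, mirroring the keys
# ('accuracy', 'consistency', 'direction_accuracy') already emitted earlier in
# this module. The 0.5 threshold is an assumption, not a package default.
def _sketch_effectiveness_metrics(score: float) -> Dict[str, float]:
    return {
        'accuracy': score,
        'consistency': 1.0 if score > 0.5 else 0.5,
        'direction_accuracy': score,
    }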