wisent 0.1.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wisent might be problematic. Click here for more details.

Files changed (237) hide show
  1. wisent/__init__.py +1 -8
  2. wisent/benchmarks/__init__.py +0 -0
  3. wisent/benchmarks/coding/__init__.py +0 -0
  4. wisent/benchmarks/coding/metrics/__init__.py +0 -0
  5. wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
  6. wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
  7. wisent/benchmarks/coding/metrics/evaluator.py +275 -0
  8. wisent/benchmarks/coding/metrics/passk.py +66 -0
  9. wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
  10. wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
  11. wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
  12. wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  13. wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
  14. wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
  15. wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
  16. wisent/benchmarks/coding/providers/__init__.py +18 -0
  17. wisent/benchmarks/coding/providers/core/__init__.py +0 -0
  18. wisent/benchmarks/coding/providers/core/atoms.py +31 -0
  19. wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
  20. wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
  21. wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
  22. wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
  23. wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
  24. wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
  25. wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
  26. wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
  27. wisent/classifiers/__init__.py +0 -0
  28. wisent/classifiers/core/__init__.py +0 -0
  29. wisent/classifiers/core/atoms.py +747 -0
  30. wisent/classifiers/models/__init__.py +0 -0
  31. wisent/classifiers/models/logistic.py +29 -0
  32. wisent/classifiers/models/mlp.py +47 -0
  33. wisent/cli/__init__.py +0 -0
  34. wisent/cli/classifiers/__init__.py +0 -0
  35. wisent/cli/classifiers/classifier_rotator.py +137 -0
  36. wisent/cli/cli_logger.py +142 -0
  37. wisent/cli/data_loaders/__init__.py +0 -0
  38. wisent/cli/data_loaders/data_loader_rotator.py +96 -0
  39. wisent/cli/evaluators/__init__.py +0 -0
  40. wisent/cli/evaluators/evaluator_rotator.py +148 -0
  41. wisent/cli/steering_methods/__init__.py +0 -0
  42. wisent/cli/steering_methods/steering_rotator.py +110 -0
  43. wisent/cli/wisent_cli/__init__.py +0 -0
  44. wisent/cli/wisent_cli/commands/__init__.py +0 -0
  45. wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
  46. wisent/cli/wisent_cli/commands/listing.py +154 -0
  47. wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
  48. wisent/cli/wisent_cli/main.py +93 -0
  49. wisent/cli/wisent_cli/shell.py +80 -0
  50. wisent/cli/wisent_cli/ui.py +69 -0
  51. wisent/cli/wisent_cli/util/__init__.py +0 -0
  52. wisent/cli/wisent_cli/util/aggregations.py +43 -0
  53. wisent/cli/wisent_cli/util/parsing.py +126 -0
  54. wisent/cli/wisent_cli/version.py +4 -0
  55. wisent/core/__init__.py +27 -0
  56. wisent/core/activations/__init__.py +0 -0
  57. wisent/core/activations/activations_collector.py +338 -0
  58. wisent/core/activations/core/__init__.py +0 -0
  59. wisent/core/activations/core/atoms.py +216 -0
  60. wisent/core/agent/__init__.py +18 -0
  61. wisent/core/agent/budget.py +638 -0
  62. wisent/core/agent/device_benchmarks.py +685 -0
  63. wisent/core/agent/diagnose/__init__.py +55 -0
  64. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  65. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  66. wisent/core/agent/diagnose/create_classifier.py +1154 -0
  67. wisent/core/agent/diagnose/response_diagnostics.py +268 -0
  68. wisent/core/agent/diagnose/select_classifiers.py +506 -0
  69. wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
  70. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  71. wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
  72. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  73. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  74. wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
  75. wisent/core/agent/diagnose.py +242 -0
  76. wisent/core/agent/steer.py +212 -0
  77. wisent/core/agent/timeout.py +134 -0
  78. wisent/core/autonomous_agent.py +1234 -0
  79. wisent/core/bigcode_integration.py +583 -0
  80. wisent/core/contrastive_pairs/__init__.py +15 -0
  81. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  82. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  83. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  84. wisent/core/contrastive_pairs/core/pair.py +178 -0
  85. wisent/core/contrastive_pairs/core/response.py +152 -0
  86. wisent/core/contrastive_pairs/core/serialization.py +300 -0
  87. wisent/core/contrastive_pairs/core/set.py +133 -0
  88. wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
  89. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  90. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  91. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
  92. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  93. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  94. wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
  95. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  96. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
  102. wisent/core/data_loaders/__init__.py +0 -0
  103. wisent/core/data_loaders/core/__init__.py +0 -0
  104. wisent/core/data_loaders/core/atoms.py +98 -0
  105. wisent/core/data_loaders/loaders/__init__.py +0 -0
  106. wisent/core/data_loaders/loaders/custom.py +120 -0
  107. wisent/core/data_loaders/loaders/lm_loader.py +218 -0
  108. wisent/core/detection_handling.py +257 -0
  109. wisent/core/download_full_benchmarks.py +1386 -0
  110. wisent/core/evaluators/__init__.py +0 -0
  111. wisent/core/evaluators/oracles/__init__.py +0 -0
  112. wisent/core/evaluators/oracles/interactive.py +73 -0
  113. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  114. wisent/core/evaluators/oracles/user_specified.py +67 -0
  115. wisent/core/hyperparameter_optimizer.py +429 -0
  116. wisent/core/lm_eval_harness_ground_truth.py +1396 -0
  117. wisent/core/log_likelihoods_evaluator.py +321 -0
  118. wisent/core/managed_cached_benchmarks.py +595 -0
  119. wisent/core/mixed_benchmark_sampler.py +364 -0
  120. wisent/core/model_config_manager.py +330 -0
  121. wisent/core/model_persistence.py +317 -0
  122. wisent/core/models/__init__.py +0 -0
  123. wisent/core/models/core/__init__.py +0 -0
  124. wisent/core/models/core/atoms.py +460 -0
  125. wisent/core/models/wisent_model.py +727 -0
  126. wisent/core/multi_steering.py +316 -0
  127. wisent/core/optuna/__init__.py +57 -0
  128. wisent/core/optuna/classifier/__init__.py +25 -0
  129. wisent/core/optuna/classifier/activation_generator.py +349 -0
  130. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  131. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
  132. wisent/core/optuna/steering/__init__.py +0 -0
  133. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
  134. wisent/core/optuna/steering/data_utils.py +342 -0
  135. wisent/core/optuna/steering/metrics.py +474 -0
  136. wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
  137. wisent/core/optuna/steering/steering_optimization.py +1111 -0
  138. wisent/core/parser.py +1668 -0
  139. wisent/core/prompts/__init__.py +0 -0
  140. wisent/core/prompts/core/__init__.py +0 -0
  141. wisent/core/prompts/core/atom.py +57 -0
  142. wisent/core/prompts/core/prompt_formater.py +157 -0
  143. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  144. wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
  145. wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
  146. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
  147. wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
  148. wisent/core/representation.py +5 -0
  149. wisent/core/sample_size_optimizer.py +648 -0
  150. wisent/core/sample_size_optimizer_v2.py +355 -0
  151. wisent/core/save_results.py +277 -0
  152. wisent/core/steering.py +652 -0
  153. wisent/core/steering_method.py +26 -0
  154. wisent/core/steering_methods/__init__.py +0 -0
  155. wisent/core/steering_methods/core/__init__.py +0 -0
  156. wisent/core/steering_methods/core/atoms.py +153 -0
  157. wisent/core/steering_methods/methods/__init__.py +0 -0
  158. wisent/core/steering_methods/methods/caa.py +44 -0
  159. wisent/core/steering_optimizer.py +1297 -0
  160. wisent/core/task_interface.py +132 -0
  161. wisent/core/task_selector.py +189 -0
  162. wisent/core/tasks/__init__.py +175 -0
  163. wisent/core/tasks/aime_task.py +141 -0
  164. wisent/core/tasks/file_task.py +211 -0
  165. wisent/core/tasks/hle_task.py +180 -0
  166. wisent/core/tasks/hmmt_task.py +119 -0
  167. wisent/core/tasks/livecodebench_task.py +201 -0
  168. wisent/core/tasks/livemathbench_task.py +158 -0
  169. wisent/core/tasks/lm_eval_task.py +455 -0
  170. wisent/core/tasks/math500_task.py +84 -0
  171. wisent/core/tasks/polymath_task.py +146 -0
  172. wisent/core/tasks/supergpqa_task.py +220 -0
  173. wisent/core/time_estimator.py +149 -0
  174. wisent/core/timing_calibration.py +174 -0
  175. wisent/core/tracking/__init__.py +54 -0
  176. wisent/core/tracking/latency.py +618 -0
  177. wisent/core/tracking/memory.py +359 -0
  178. wisent/core/trainers/__init__.py +0 -0
  179. wisent/core/trainers/core/__init__.py +11 -0
  180. wisent/core/trainers/core/atoms.py +45 -0
  181. wisent/core/trainers/steering_trainer.py +271 -0
  182. wisent/core/user_model_config.py +158 -0
  183. wisent/opti/__init__.py +0 -0
  184. wisent/opti/core/__init__.py +0 -0
  185. wisent/opti/core/atoms.py +175 -0
  186. wisent/opti/methods/__init__.py +0 -0
  187. wisent/opti/methods/opti_classificator.py +172 -0
  188. wisent/opti/methods/opti_steering.py +138 -0
  189. wisent/synthetic/__init__.py +0 -0
  190. wisent/synthetic/cleaners/__init__.py +0 -0
  191. wisent/synthetic/cleaners/core/__init__.py +0 -0
  192. wisent/synthetic/cleaners/core/atoms.py +58 -0
  193. wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
  194. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  195. wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
  196. wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
  197. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  198. wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
  199. wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
  200. wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
  201. wisent/synthetic/db_instructions/__init__.py +0 -0
  202. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  203. wisent/synthetic/db_instructions/core/atoms.py +25 -0
  204. wisent/synthetic/db_instructions/mini_dp.py +37 -0
  205. wisent/synthetic/generators/__init__.py +0 -0
  206. wisent/synthetic/generators/core/__init__.py +0 -0
  207. wisent/synthetic/generators/core/atoms.py +73 -0
  208. wisent/synthetic/generators/diversities/__init__.py +0 -0
  209. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  210. wisent/synthetic/generators/diversities/core/core.py +68 -0
  211. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  212. wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  213. wisent/synthetic/generators/pairs_generator.py +179 -0
  214. wisent-0.5.2.dist-info/METADATA +67 -0
  215. wisent-0.5.2.dist-info/RECORD +218 -0
  216. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/WHEEL +1 -1
  217. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info/licenses}/LICENSE +2 -2
  218. wisent/activations/__init__.py +0 -9
  219. wisent/activations/client.py +0 -97
  220. wisent/activations/extractor.py +0 -251
  221. wisent/activations/models.py +0 -95
  222. wisent/client.py +0 -45
  223. wisent/control_vector/__init__.py +0 -9
  224. wisent/control_vector/client.py +0 -85
  225. wisent/control_vector/manager.py +0 -168
  226. wisent/control_vector/models.py +0 -70
  227. wisent/inference/__init__.py +0 -9
  228. wisent/inference/client.py +0 -103
  229. wisent/inference/inferencer.py +0 -250
  230. wisent/inference/models.py +0 -66
  231. wisent/utils/__init__.py +0 -3
  232. wisent/utils/auth.py +0 -30
  233. wisent/utils/http.py +0 -228
  234. wisent/version.py +0 -3
  235. wisent-0.1.1.dist-info/METADATA +0 -142
  236. wisent-0.1.1.dist-info/RECORD +0 -23
  237. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1297 @@
1
+ """
2
+ Steering Parameter Optimizer for Wisent-Guard.
3
+
4
+ Optimizes steering-specific parameters including:
5
+ 1. Optimal steering layer (may differ from classification layer)
6
+ 2. Optimal steering strength and dynamics
7
+ 3. Steering method selection and configuration
8
+ 4. Task-specific steering parameter tuning
9
+
10
+ This module builds on top of classification optimization to find optimal
11
+ steering configurations for each model and task.
12
+ """
13
+
14
+ import logging
15
+ import json
16
+ import time
17
+ import os
18
+ import numpy as np
19
+ from datetime import datetime
20
+ from typing import Dict, List, Tuple, Optional, Any, Union
21
+ from dataclasses import dataclass, asdict
22
+ from enum import Enum
23
+ from pathlib import Path
24
+
25
+ from .model_config_manager import ModelConfigManager
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
def get_default_steering_configs() -> List['SteeringMethodConfig']:
    """Build the stock steering configurations used when a caller supplies none.

    Every call assembles new ``SteeringMethodConfig`` objects with new ``params``
    dicts, so the returned list can be mutated without affecting later calls.

    Returns:
        Nine configurations in a fixed order: two CAA, two HPR, two BiPO,
        two KSteering and one DAC variant.
    """
    # (display name, steering method, method-specific params), in output order.
    presets = (
        # CAA variations
        ("CAA", SteeringMethod.CAA, {}),
        ("CAA_L2", SteeringMethod.CAA, {"normalization_method": "l2_unit"}),
        # HPR variations
        ("HPR", SteeringMethod.HPR, {"hpr_beta": 1.0}),
        ("HPR_Beta0.5", SteeringMethod.HPR, {"hpr_beta": 0.5}),
        # BiPO variations
        ("BiPO", SteeringMethod.BIPO, {"bipo_beta": 0.1, "bipo_epochs": 50}),
        ("BiPO_Beta0.05", SteeringMethod.BIPO, {"bipo_beta": 0.05, "bipo_epochs": 50}),
        # KSteering variations
        (
            "KSteering",
            SteeringMethod.KSTEERING,
            {
                "ksteering_alpha": 5.0,
                "ksteering_target_labels": "0",
                "ksteering_avoid_labels": "",
            },
        ),
        (
            "KSteering_Alpha3",
            SteeringMethod.KSTEERING,
            {
                "ksteering_alpha": 3.0,
                "ksteering_target_labels": "0",
                "ksteering_avoid_labels": "",
            },
        ),
        # DAC variations
        (
            "DAC",
            SteeringMethod.DAC,
            {
                "dac_dynamic_control": True,
                "dac_entropy_threshold": 1.0,
            },
        ),
    )

    return [
        SteeringMethodConfig(name=label, method=kind, params=settings)
        for label, kind, settings in presets
    ]
99
+
100
+
101
class SteeringMethod(Enum):
    """Closed set of steering methods the optimizer can be asked to test.

    Each member's value is the method's string identifier; ``SteeringMethod(value)``
    maps such a string back to its member.
    """

    CAA = "CAA"
    HPR = "HPR"
    DAC = "DAC"
    # The two values below are mixed-case, unlike their member names.
    BIPO = "BiPO"
    KSTEERING = "KSteering"
108
+
109
+
110
@dataclass
class SteeringMethodConfig:
    """A named steering setup: one steering method plus the parameters to run it with."""

    name: str  # Display label, e.g. "CAA_L2"
    method: SteeringMethod
    params: Dict[str, Any]  # Parameters specific to the chosen method

    def __post_init__(self):
        """Convert a string ``method`` into the matching ``SteeringMethod`` member."""
        if not isinstance(self.method, str):
            return
        # SteeringMethod(...) raises ValueError for a string that matches no member.
        self.method = SteeringMethod(self.method)
121
+
122
+
123
@dataclass
class SteeringOptimizationResult:
    """Outcome of a steering-parameter search for one task."""

    # Task the search was run for.
    task_name: str
    # Winning layer, method name and strength.
    best_steering_layer: int
    best_steering_method: str
    best_steering_strength: float
    # Method-specific parameters of the winning configuration.
    optimal_parameters: Dict[str, Any]
    # How well steering changes outputs.
    steering_effectiveness_score: float
    # Impact on classification performance.
    classification_accuracy_impact: float
    # Wall-clock duration of the search, in seconds.
    optimization_time_seconds: float
    # Number of configurations that were evaluated.
    total_configurations_tested: int
    # Optional error description; defaults to None.
    error_message: Optional[str] = None
136
+
137
+
138
@dataclass
class SteeringOptimizationSummary:
    """Aggregate report of a steering optimization run across tasks and/or methods."""

    # Model the run was performed for.
    model_name: str
    # One of "single_task", "multi_task", "method_comparison".
    optimization_type: str
    # Number of configurations that were evaluated.
    total_configurations_tested: int
    # Duration of the whole run, in minutes.
    optimization_time_minutes: float
    # Best method name, layer and strength found overall.
    best_overall_method: str
    best_overall_layer: int
    best_overall_strength: float
    # method -> effectiveness score
    method_performance_ranking: Dict[str, float]
    # layer -> average effectiveness
    layer_effectiveness_analysis: Dict[int, float]
    # Per-task results collected during the run.
    task_results: List[SteeringOptimizationResult]
    # Date of the run, stored as a string.
    optimization_date: str
152
+
153
+
154
+ class SteeringOptimizer:
155
+ """
156
+ Framework for optimizing steering parameters.
157
+
158
+ This class provides the structure for steering optimization but requires
159
+ implementation of the actual optimization algorithms for each steering method.
160
+ """
161
+
162
def __init__(self, model_name: str, device: str = None, verbose: bool = False):
    """
    Set up the optimizer and look up any saved classification configuration.

    Args:
        model_name: Name/path of the model to optimize steering for
        device: Device to run optimization on
        verbose: Enable verbose logging
    """
    self.model_name = model_name
    self.device = device
    self.verbose = verbose
    self.config_manager = ModelConfigManager()

    # Steering often builds on classification, so reuse its saved parameters when present.
    self.classification_config = self.config_manager.load_model_config(model_name)
    if not self.classification_config:
        self.base_classification_layer = None
        logger.warning("āš ļø No existing classification configuration found")
        return

    saved_parameters = self.classification_config.get("optimal_parameters", {})
    self.base_classification_layer = saved_parameters.get("classification_layer")
    logger.info(f"šŸ“Š Found existing classification layer: {self.base_classification_layer}")
184
+
185
def optimize_steering_method_comparison(
    self,
    task_name: str,
    methods_to_test: Optional[Union[List[SteeringMethod], List[SteeringMethodConfig]]] = None,
    layer_range: Optional[str] = None,
    strength_range: Optional[List[float]] = None,
    limit: int = 100,
    max_time_minutes: float = 30.0,
    split_ratio: float = 0.8
) -> SteeringOptimizationSummary:
    """
    Compare different steering methods to find the best one for a task.

    Every (method configuration, layer, strength) combination is scored with
    ``self._evaluate_steering_configuration``. A combination whose evaluation
    raises is logged and skipped. The finished summary is passed to
    ``self._save_steering_optimization_results`` before it is returned.

    Args:
        task_name: Task to optimize steering for
        methods_to_test: Steering methods to compare. Items may be
            ``SteeringMethodConfig`` objects, ``SteeringMethod`` members or
            method-name strings; unrecognised items are logged and ignored.
            ``None`` uses ``get_default_steering_configs()``.
        layer_range: Layer specification handed to ``self._parse_layer_range``.
            When omitted, layers within 2 of the known classification layer
            (clamped to 1..32) are used, or 10, 12, ..., 20 if none is known.
        strength_range: Steering strengths to test (default 0.5, 1.0, 1.5, 2.0)
        limit: Maximum samples for testing
        max_time_minutes: Maximum optimization time. The budget is checked
            before each evaluation; once exceeded, the whole search stops.
        split_ratio: Train/test split ratio

    Returns:
        SteeringOptimizationSummary with method comparison results. If no
        configuration scored above 0.0, the best method is reported as
        "none" with layer 0 and strength 0.0, and ``task_results`` is empty.
    """
    # Normalise the accepted input forms into a list of SteeringMethodConfig.
    if methods_to_test is None:
        method_configs = get_default_steering_configs()
    else:
        method_configs = []
        for item in methods_to_test:
            if isinstance(item, SteeringMethodConfig):
                method_configs.append(item)
            elif isinstance(item, SteeringMethod):
                # Bare method: wrap it in a config with default params.
                method_configs.append(SteeringMethodConfig(
                    name=item.value,
                    method=item,
                    params={}
                ))
            elif isinstance(item, str):
                try:
                    method = SteeringMethod(item)
                except ValueError:
                    logger.warning(f"Unknown steering method: {item}")
                else:
                    method_configs.append(SteeringMethodConfig(
                        name=method.value,
                        method=method,
                        params={}
                    ))
            else:
                logger.warning(f"Unknown method type: {type(item)}, value: {item}")

    if strength_range is None:
        strength_range = [0.5, 1.0, 1.5, 2.0]

    logger.info(f"šŸŽÆ Comparing {len(method_configs)} steering method configurations for task: {task_name}")

    start_time = time.time()
    task_results = []
    all_results = {}

    # Determine layer search range
    if layer_range:
        layers_to_test = self._parse_layer_range(layer_range)
    elif self.base_classification_layer:
        # Search around classification layer
        min_layer = max(1, self.base_classification_layer - 2)
        max_layer = min(32, self.base_classification_layer + 2)  # Assume max 32 layers
        layers_to_test = list(range(min_layer, max_layer + 1))
    else:
        # Default range for common models
        layers_to_test = [10, 12, 14, 16, 18, 20]

    configurations_tested = 0
    # NOTE: starting at 0.0 means a configuration must score strictly above
    # zero to be selected as the best one.
    best_overall_score = 0.0
    best_overall_config = None

    time_budget_seconds = max_time_minutes * 60
    # A plain `break` only leaves the innermost loop; this flag lets the
    # layer and method loops stop as well once the budget is spent.
    time_exhausted = False

    # Test each method configuration
    for method_config in method_configs:
        method_results = []

        for layer in layers_to_test:
            for strength in strength_range:
                if time.time() - start_time > time_budget_seconds:
                    logger.warning("ā° Time limit reached, stopping optimization")
                    time_exhausted = True
                    break

                try:
                    # Run evaluation for this configuration
                    score = self._evaluate_steering_configuration(
                        task_name=task_name,
                        method=method_config.method,
                        layer=layer,
                        strength=strength,
                        limit=limit,
                        split_ratio=split_ratio,
                        method_params=method_config.params
                    )

                    configurations_tested += 1
                    config_result = {
                        'method': method_config.name,  # Use display name
                        'method_type': method_config.method.value,
                        'layer': layer,
                        'strength': strength,
                        'score': score,
                        'params': method_config.params
                    }
                    method_results.append(config_result)

                    if score > best_overall_score:
                        best_overall_score = score
                        best_overall_config = config_result

                    if self.verbose:
                        logger.info(f" {method_config.name} L{layer} S{strength}: {score:.3f}")

                except Exception as e:
                    # Best effort: one failing configuration must not abort the search.
                    logger.error(f" Error testing {method_config.name} L{layer} S{strength}: {e}")

            if time_exhausted:
                break

        # Record partial results for the current method before stopping.
        all_results[method_config.name] = method_results

        if time_exhausted:
            break

    # Analyze results
    method_performance = {}
    scores_by_layer = {}

    for method, results in all_results.items():
        if not results:
            continue
        method_performance[method] = max(r['score'] for r in results)

        # Aggregate by layer
        for result in results:
            scores_by_layer.setdefault(result['layer'], []).append(result['score'])

    # Average layer effectiveness
    layer_effectiveness = {
        layer: sum(scores) / len(scores)
        for layer, scores in scores_by_layer.items()
    }

    # Create optimization result
    optimization_time = time.time() - start_time

    if best_overall_config:
        result = SteeringOptimizationResult(
            task_name=task_name,
            best_steering_layer=best_overall_config['layer'],
            best_steering_method=best_overall_config['method'],
            best_steering_strength=best_overall_config['strength'],
            optimal_parameters={
                'split_ratio': split_ratio,
                'limit': limit,
                **best_overall_config.get('params', {})
            },
            steering_effectiveness_score=best_overall_config['score'],
            classification_accuracy_impact=0.0,  # Not measured here
            optimization_time_seconds=optimization_time,
            total_configurations_tested=configurations_tested
        )
        task_results.append(result)

    summary = SteeringOptimizationSummary(
        model_name=self.model_name,
        optimization_type="method_comparison",
        total_configurations_tested=configurations_tested,
        optimization_time_minutes=optimization_time / 60,
        best_overall_method=best_overall_config['method'] if best_overall_config else "none",
        best_overall_layer=best_overall_config['layer'] if best_overall_config else 0,
        best_overall_strength=best_overall_config['strength'] if best_overall_config else 0.0,
        method_performance_ranking=method_performance,
        layer_effectiveness_analysis=layer_effectiveness,
        task_results=task_results,
        optimization_date=datetime.now().isoformat()
    )

    # Save the results
    self._save_steering_optimization_results(summary)

    return summary
369
+
370
def optimize_steering_layer(
    self,
    task_name: str,
    steering_method: SteeringMethod = SteeringMethod.CAA,
    layer_search_range: Optional[Tuple[int, int]] = None,
    strength: float = 1.0,
    limit: int = 100
) -> SteeringOptimizationResult:
    """
    Find optimal steering layer for a specific method and task.

    Currently a placeholder: it logs the request, resolves the default layer
    search range, and then always raises ``NotImplementedError``.

    Args:
        task_name: Task to optimize for
        steering_method: Steering method to use
        layer_search_range: (min_layer, max_layer) to search
        strength: Fixed steering strength to use during layer search
        limit: Maximum samples for testing

    Returns:
        SteeringOptimizationResult with optimal layer (once implemented)

    Raises:
        NotImplementedError: Always, until the layer search is implemented.
    """
    logger.info(f"šŸ” Optimizing steering layer for {task_name} using {steering_method.value}")

    if layer_search_range is None:
        anchor_layer = self.base_classification_layer
        if anchor_layer:
            # Default: search within 3 layers of the classification layer, never below layer 1.
            layer_search_range = (max(1, anchor_layer - 3), anchor_layer + 3)
        else:
            # TODO: Auto-detect model layer count and use reasonable range
            layer_search_range = (10, 20)  # Default fallback

    # TODO: Implement layer optimization logic
    raise NotImplementedError(
        "Steering layer optimization not yet implemented. "
        "This requires implementing steering vector training and "
        "effectiveness measurement across different layers."
    )
409
+
410
+ def optimize_steering_strength(
411
+ self,
412
+ task_name: str,
413
+ steering_method: SteeringMethod = SteeringMethod.CAA,
414
+ layer: Optional[int] = None,
415
+ strength_range: Optional[Tuple[float, float]] = None,
416
+ strength_steps: int = 10,
417
+ limit: int = 100,
418
+ method_params: Optional[Dict[str, Any]] = None
419
+ ) -> SteeringOptimizationResult:
420
+ """
421
+ Find optimal steering strength for a specific method, layer, and task.
422
+
423
+ Args:
424
+ task_name: Task to optimize for
425
+ steering_method: Steering method to use
426
+ layer: Steering layer to use (defaults to classification layer)
427
+ strength_range: (min_strength, max_strength) to search
428
+ strength_steps: Number of strength values to test
429
+ limit: Maximum samples for testing
430
+
431
+ Returns:
432
+ SteeringOptimizationResult with optimal strength
433
+ """
434
+ if layer is None:
435
+ layer = self.base_classification_layer or 15 # Default fallback
436
+
437
+ if strength_range is None:
438
+ strength_range = (0.1, 2.0) # Default strength range
439
+
440
+ logger.info(f"⚔ Optimizing steering strength for {task_name}")
441
+ logger.info(f" Method: {steering_method.value}, Layer: {layer}")
442
+ logger.info(f" Strength range: {strength_range}, Steps: {strength_steps}")
443
+
444
+ # Load steering parameters from config
445
+ import json
446
+ import os
447
+ config_path = os.path.join(os.path.dirname(__file__), '..', '..', 'steering_optimization_parameters.json')
448
+ if os.path.exists(config_path):
449
+ with open(config_path, 'r') as f:
450
+ steering_config = json.load(f)
451
+ else:
452
+ steering_config = {}
453
+
454
+ # Get default layer if not provided
455
+ if layer is None:
456
+ layer = self._get_classification_layer(task_name)
457
+ logger.info(f" Using classification layer: {layer}")
458
+
459
+ # Default strength range from config or fallback
460
+ if strength_range is None:
461
+ default_strengths = steering_config.get('steering_strengths', {}).get('default', [0.5, 1.0, 1.5, 2.0])
462
+ strength_range = (min(default_strengths), max(default_strengths))
463
+
464
+ # Generate strength values to test
465
+ import numpy as np
466
+ strengths = np.linspace(strength_range[0], strength_range[1], strength_steps)
467
+
468
+ # Get method-specific parameters from config if not provided
469
+ if method_params is None:
470
+ method_configs = steering_config.get('steering_methods', [])
471
+ for config in method_configs:
472
+ if config['method'] == steering_method.value and config['name'] == steering_method.value:
473
+ method_params = config.get('params', {})
474
+ break
475
+ if method_params is None:
476
+ method_params = {}
477
+
478
+ results = []
479
+ best_score = -float('inf')
480
+ best_strength = 0.0
481
+
482
+ logger.info(f" Testing {len(strengths)} strength values...")
483
+
484
+ # Test each strength value
485
+ for strength in strengths:
486
+ try:
487
+ # Run evaluation with this strength
488
+ from ..cli import run_task_pipeline
489
+
490
+ # Build kwargs for run_task_pipeline
491
+ pipeline_kwargs = {
492
+ 'task_name': task_name,
493
+ 'model_name': self.model_name,
494
+ 'limit': limit,
495
+ 'device': self.device,
496
+ 'verbose': False,
497
+ 'steering_mode': True,
498
+ 'steering_method': steering_method.value,
499
+ 'steering_strength': float(strength),
500
+ 'layer': str(layer),
501
+ 'output_mode': "likelihoods", # Get likelihoods for evaluation
502
+ 'allow_small_dataset': True
503
+ }
504
+
505
+ # Add method-specific parameters
506
+ for param, value in method_params.items():
507
+ pipeline_kwargs[param] = value
508
+
509
+ result = run_task_pipeline(**pipeline_kwargs)
510
+
511
+ # Extract score from evaluation results
512
+ score = 0.0
513
+ steering_effect = 0.0
514
+ accuracy = 0.0
515
+
516
+ if isinstance(result, dict):
517
+ # Get evaluation results - try both nested and direct access
518
+ eval_results = None
519
+
520
+ # First try: result[task_name]['evaluation_results']
521
+ task_result = result.get(task_name, {})
522
+ if 'evaluation_results' in task_result:
523
+ eval_results = task_result['evaluation_results']
524
+ # Second try: result['evaluation_results'] (direct from run_task_pipeline)
525
+ elif 'evaluation_results' in result:
526
+ eval_results = result['evaluation_results']
527
+ else:
528
+ eval_results = {}
529
+
530
+ # Get accuracy score (but don't use it as the primary metric)
531
+ accuracy = eval_results.get('accuracy', 0.0)
532
+ if isinstance(accuracy, str) or accuracy is None:
533
+ accuracy = 0.0
534
+
535
+ # Calculate steering effect from likelihood changes
536
+ baseline_likes = eval_results.get('baseline_likelihoods', [])
537
+ steered_likes = eval_results.get('steered_likelihoods', [])
538
+
539
+ if self.verbose:
540
+ logger.debug(f" Found {len(baseline_likes)} baseline and {len(steered_likes)} steered likelihoods")
541
+ if baseline_likes and len(baseline_likes) > 0:
542
+ logger.debug(f" First few baseline likes: {baseline_likes[:3]}")
543
+ logger.debug(f" First few steered likes: {steered_likes[:3]}")
544
+ logger.debug(f" Full eval_results keys: {list(eval_results.keys())}")
545
+ logger.debug(f" Accuracy value: {eval_results.get('accuracy')}")
546
+
547
+ # Check if we're getting the right data structure
548
+ if isinstance(result, dict):
549
+ logger.debug(f" Result keys: {list(result.keys())}")
550
+ if 'evaluation_results' in result:
551
+ logger.debug(f" Direct evaluation_results found")
552
+
553
+ if baseline_likes and steered_likes:
554
+ # Filter out inf and nan values
555
+ valid_pairs = []
556
+ for b, s in zip(baseline_likes, steered_likes):
557
+ if np.isfinite(b) and np.isfinite(s):
558
+ valid_pairs.append((b, s))
559
+
560
+ if valid_pairs:
561
+ changes = [abs(s - b) for b, s in valid_pairs]
562
+ steering_effect = sum(changes) / len(changes) if changes else 0.0
563
+
564
+ # Cap steering effect to prevent infinity
565
+ steering_effect = min(steering_effect, 100.0)
566
+
567
+ # Also calculate how many preferences changed
568
+ preference_changes = 0
569
+ for i in range(0, len(baseline_likes), 2): # Assuming binary choices
570
+ if i+1 < len(baseline_likes):
571
+ if np.isfinite(baseline_likes[i]) and np.isfinite(baseline_likes[i+1]) and \
572
+ np.isfinite(steered_likes[i]) and np.isfinite(steered_likes[i+1]):
573
+ baseline_pref = 0 if baseline_likes[i] > baseline_likes[i+1] else 1
574
+ steered_pref = 0 if steered_likes[i] > steered_likes[i+1] else 1
575
+ if baseline_pref != steered_pref:
576
+ preference_changes += 1
577
+
578
+ # Use steering effect as the primary score
579
+ score = steering_effect
580
+
581
+ # Add bonus if accuracy is valid and good
582
+ if np.isfinite(accuracy) and accuracy > 0.5:
583
+ score += accuracy * 0.5
584
+ else:
585
+ # No valid likelihood pairs
586
+ score = 0.0
587
+ steering_effect = 0.0
588
+ else:
589
+ # Fallback to accuracy if no likelihood data
590
+ score = accuracy if np.isfinite(accuracy) else 0.0
591
+
592
+ results.append({
593
+ 'strength': float(strength),
594
+ 'score': score,
595
+ 'steering_effect': steering_effect,
596
+ 'evaluation_results': eval_results if isinstance(result, dict) else {}
597
+ })
598
+
599
+ if score > best_score:
600
+ best_score = score
601
+ best_strength = float(strength)
602
+
603
+ logger.info(f" Strength {strength:.2f}: score={score:.3f}, effect={steering_effect:.3f}, accuracy={accuracy:.3f}")
604
+
605
+ except Exception as e:
606
+ logger.error(f" Error testing strength {strength}: {e}")
607
+ results.append({
608
+ 'strength': float(strength),
609
+ 'score': 0.0,
610
+ 'error': str(e)
611
+ })
612
+
613
+ return SteeringOptimizationResult(
614
+ task_name=task_name,
615
+ best_steering_layer=layer,
616
+ best_steering_method=steering_method.value,
617
+ best_steering_strength=best_strength,
618
+ optimal_parameters={'strength': best_strength},
619
+ steering_effectiveness_score=best_score,
620
+ classification_accuracy_impact=best_score, # Using same score for now
621
+ optimization_time_seconds=0.0, # TODO: Track actual time
622
+ total_configurations_tested=len(results),
623
+ error_message=None
624
+ )
625
+
626
def optimize_method_specific_parameters(
    self,
    task_name: str,
    steering_method: SteeringMethod,
    base_layer: Optional[int] = None,
    base_strength: float = 1.0,
    limit: int = 100
) -> SteeringOptimizationResult:
    """
    Route a parameter-optimization request to the helper for one steering method.

    Args:
        task_name: Task to optimize for
        steering_method: Steering method whose own parameters should be tuned
        base_layer: Steering layer handed to the helper unchanged
        base_strength: Steering strength handed to the helper unchanged
        limit: Maximum samples for testing

    Returns:
        Whatever the selected ``_optimize_*_parameters`` helper returns

    Raises:
        ValueError: If ``steering_method`` equals none of the handled members
    """
    logger.info(f"šŸ”§ Optimizing {steering_method.value}-specific parameters for {task_name}")

    # Checked in order; the first member equal to ``steering_method`` wins.
    handlers = (
        (SteeringMethod.CAA, self._optimize_caa_parameters),
        (SteeringMethod.HPR, self._optimize_hpr_parameters),
        (SteeringMethod.DAC, self._optimize_dac_parameters),
        (SteeringMethod.BIPO, self._optimize_bipo_parameters),
        (SteeringMethod.KSTEERING, self._optimize_ksteering_parameters),
    )
    for candidate, handler in handlers:
        if steering_method == candidate:
            return handler(task_name, base_layer, base_strength, limit)

    raise ValueError(f"Unknown steering method: {steering_method}")
661
+
662
+ def _optimize_caa_parameters(
663
+ self,
664
+ task_name: str,
665
+ layer: Optional[int],
666
+ strength: float,
667
+ limit: int
668
+ ) -> SteeringOptimizationResult:
669
+ """Optimize CAA (Concept Activation Analysis) specific parameters."""
670
+ # TODO: Implement CAA parameter optimization
671
+ # CAA typically doesn't have many hyperparameters beyond layer/strength
672
+ # but may include normalization options, vector aggregation methods, etc.
673
+ raise NotImplementedError("CAA parameter optimization not yet implemented")
674
+
675
+ def _optimize_hpr_parameters(
676
+ self,
677
+ task_name: str,
678
+ layer: Optional[int],
679
+ strength: float,
680
+ limit: int
681
+ ) -> SteeringOptimizationResult:
682
+ """Optimize HPR (Householder Pseudo-Rotation) specific parameters."""
683
+ # TODO: Implement HPR parameter optimization
684
+ # HPR has beta parameter and potentially rotation-specific settings
685
+ raise NotImplementedError("HPR parameter optimization not yet implemented")
686
+
687
+ def _optimize_dac_parameters(
688
+ self,
689
+ task_name: str,
690
+ layer: Optional[int],
691
+ strength: float,
692
+ limit: int
693
+ ) -> SteeringOptimizationResult:
694
+ """Optimize DAC (Dynamic Activation Composition) specific parameters."""
695
+ # TODO: Implement DAC parameter optimization
696
+ # DAC has dynamic control settings, entropy thresholds, etc.
697
+ raise NotImplementedError("DAC parameter optimization not yet implemented")
698
+
699
+ def _optimize_bipo_parameters(
700
+ self,
701
+ task_name: str,
702
+ layer: Optional[int],
703
+ strength: float,
704
+ limit: int
705
+ ) -> SteeringOptimizationResult:
706
+ """Optimize BiPO (Bi-directional Preference Optimization) specific parameters."""
707
+ # TODO: Implement BiPO parameter optimization
708
+ # BiPO has learning rate, beta, epochs, and other training-specific parameters
709
+ raise NotImplementedError("BiPO parameter optimization not yet implemented")
710
+
711
+ def _optimize_ksteering_parameters(
712
+ self,
713
+ task_name: str,
714
+ layer: Optional[int],
715
+ strength: float,
716
+ limit: int
717
+ ) -> SteeringOptimizationResult:
718
+ """Optimize K-Steering specific parameters."""
719
+ # TODO: Implement K-Steering parameter optimization
720
+ # K-Steering has many parameters: num_labels, hidden_dim, learning_rate,
721
+ # classifier_epochs, target/avoid labels, alpha, etc.
722
+ raise NotImplementedError("K-Steering parameter optimization not yet implemented")
723
+
724
def run_comprehensive_steering_optimization(
    self,
    tasks: Optional[List[str]] = None,
    methods: Optional[List[SteeringMethod]] = None,
    limit: int = 100,
    max_time_per_task_minutes: float = 20.0,
    save_results: bool = True
) -> SteeringOptimizationSummary:
    """
    Run comprehensive steering optimization across multiple tasks and methods.

    Only the argument defaulting and logging are implemented; the
    optimization loop itself is still a TODO, so this method always raises
    after logging the resolved tasks and methods.

    Args:
        tasks: List of tasks to optimize. If None, the keys of
            ``task_specific_overrides`` in ``self.classification_config`` are
            used, falling back to a built-in default task set.
        methods: List of steering methods to test (defaults to CAA and HPR)
        limit: Sample limit per task (currently unused)
        max_time_per_task_minutes: Time limit per task (currently unused)
        save_results: Whether to save results to config (currently unused)

    Returns:
        SteeringOptimizationSummary with comprehensive results (once implemented)

    Raises:
        NotImplementedError: Always, until the optimization loop exists
    """
    logger.info("šŸš€ Starting comprehensive steering optimization")

    default_tasks = ["truthfulqa_mc1", "gsm8k", "squad2"]

    if tasks is None:
        # Use tasks that were successfully optimized for classification
        if self.classification_config:
            task_overrides = self.classification_config.get("task_specific_overrides", {})
            tasks = list(task_overrides.keys())
            if not tasks:
                logger.warning("No classification-optimized tasks found, using default task set")
                tasks = default_tasks
        else:
            tasks = default_tasks

    if methods is None:
        methods = [SteeringMethod.CAA, SteeringMethod.HPR]  # Start with simpler methods

    logger.info(f"šŸ“Š Tasks: {tasks}")
    # The original message lacked braces and logged the comprehension source
    # text verbatim; interpolate the method values instead. getattr keeps the
    # log line from failing if a caller passes plain strings.
    method_labels = [getattr(method, "value", method) for method in methods]
    logger.info(f"šŸ”§ Methods: {method_labels}")

    # TODO: Implement comprehensive optimization loop
    # This should:
    # 1. For each task and method combination
    # 2. Find optimal layer, strength, and method-specific parameters
    # 3. Measure steering effectiveness vs classification accuracy tradeoff
    # 4. Aggregate results and find best overall parameters
    # 5. Save task-specific steering configurations

    raise NotImplementedError(
        "Comprehensive steering optimization not yet implemented. "
        "This requires implementing all the individual optimization methods "
        "and result aggregation logic."
    )
777
+
778
+ def _parse_layer_range(self, layer_range: str) -> List[int]:
779
+ """Parse layer range string like '10-20' or '10,12,14'."""
780
+ if '-' in layer_range:
781
+ start, end = map(int, layer_range.split('-'))
782
+ return list(range(start, end + 1))
783
+ elif ',' in layer_range:
784
+ return [int(x.strip()) for x in layer_range.split(',')]
785
+ else:
786
+ return [int(layer_range)]
787
+
788
def _evaluate_steering_configuration(
    self,
    task_name: str,
    method: SteeringMethod,
    layer: int,
    strength: float,
    limit: int,
    split_ratio: float,
    method_params: Optional[Dict[str, Any]] = None
) -> float:
    """
    Run one steering configuration through the task pipeline and score it.

    The score is the pipeline's top-level ``accuracy`` if present and not
    ``'N/A'``, otherwise the ``accuracy`` nested under ``evaluation_results``,
    otherwise 0.0. Any exception raised along the way is logged and also
    reported as 0.0.

    Args:
        task_name: Task to run
        method: Steering method; its ``.value`` is passed to the pipeline
        layer: Steering layer (passed to the pipeline as a string)
        strength: Steering strength
        limit: Sample limit
        split_ratio: Split ratio forwarded to the pipeline
        method_params: Additional method-specific parameters; only the
            recognised keys listed below are forwarded

    Returns:
        The accuracy reported by the pipeline as a float, or 0.0
    """
    # Method-specific options the pipeline is given under the same name.
    forwardable = (
        'normalization_method',
        'hpr_beta',
        'dac_dynamic_control',
        'dac_entropy_threshold',
        'bipo_beta',
        'bipo_epochs',
        'ksteering_alpha',
        'ksteering_target_labels',
        'ksteering_avoid_labels',
    )

    try:
        # Imported here rather than at module level; a failing import is
        # handled like any other evaluation failure.
        from wisent.cli import run_task_pipeline

        pipeline_args = dict(
            task_name=task_name,
            model_name=self.model_name,
            layer=str(layer),
            limit=limit,
            steering_mode=True,
            steering_method=method.value,
            steering_strength=strength,
            split_ratio=split_ratio,
            device=self.device,
            verbose=False,
            allow_small_dataset=True,
        )

        for option, setting in (method_params or {}).items():
            if option in forwardable:
                pipeline_args[option] = setting

        outcome = run_task_pipeline(**pipeline_args)

        # Priority: top-level accuracy > nested accuracy > 0.0
        if 'accuracy' in outcome and outcome['accuracy'] != 'N/A':
            return float(outcome['accuracy'])

        if 'evaluation_results' in outcome:
            nested = outcome['evaluation_results']
            if 'accuracy' in nested and nested['accuracy'] != 'N/A':
                return float(nested['accuracy'])
            # Likelihood changes could also serve as a metric here.

        return 0.0

    except Exception as e:
        logger.error(f"Configuration evaluation failed: {e}")
        return 0.0
863
+
864
def _save_steering_optimization_results(self, summary: SteeringOptimizationSummary):
    """
    Merge a steering optimization summary into the stored model configuration.

    Loads the existing configuration for ``self.model_name`` (or starts a new
    one), records the overall best method/layer/strength under
    ``steering_optimization``, records one entry per task under
    ``task_specific_steering``, and hands the result to the config manager.

    Args:
        summary: Summary whose overall and per-task results are persisted
    """
    stored = self.config_manager.load_model_config(self.model_name)
    if not stored:
        # No usable configuration yet: start from a minimal skeleton.
        stored = {
            'model_name': self.model_name,
            'created_date': datetime.now().isoformat(),
            'config_version': '2.0'
        }

    # Overall best configuration (existing keys in the section are kept).
    overall = stored.setdefault('steering_optimization', {})
    overall['best_method'] = summary.best_overall_method
    overall['best_layer'] = summary.best_overall_layer
    overall['best_strength'] = summary.best_overall_strength
    overall['optimization_date'] = summary.optimization_date
    overall['method_ranking'] = summary.method_performance_ranking

    # Task-specific results; an existing entry for the same task is replaced.
    per_task = stored.setdefault('task_specific_steering', {})
    for entry in summary.task_results:
        per_task[entry.task_name] = {
            'method': entry.best_steering_method,
            'layer': entry.best_steering_layer,
            'strength': entry.best_steering_strength,
            'score': entry.steering_effectiveness_score,
            'parameters': entry.optimal_parameters
        }

    self.config_manager.update_model_config(self.model_name, stored)
    logger.info(f"āœ… Steering optimization results saved for {self.model_name}")
899
+
900
def load_optimal_steering_config(self, task_name: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """
    Look up the stored steering configuration for this model, optionally per task.

    A non-empty task-specific entry takes precedence; otherwise the overall
    best method/layer/strength is returned.

    Args:
        task_name: Optional task name for task-specific configuration

    Returns:
        The stored task entry, or a dict with ``method``, ``layer`` and
        ``strength`` from the overall best configuration, or None when
        neither is available
    """
    stored = self.config_manager.load_model_config(self.model_name)
    if not stored:
        return None

    # Task-specific configuration wins when one is stored and non-empty.
    if task_name and 'task_specific_steering' in stored:
        per_task = stored['task_specific_steering'].get(task_name)
        if per_task:
            return per_task

    if 'steering_optimization' not in stored:
        return None

    # Fall back to the overall best; missing fields come back as None.
    overall = stored['steering_optimization']
    return {field: overall.get(f'best_{field}') for field in ('method', 'layer', 'strength')}
930
+
931
def evaluate_steering_effectiveness(
    self,
    task_name: str,
    steering_method: SteeringMethod,
    layer: int,
    strength: float,
    method_params: Dict[str, Any],
    test_samples: List[Dict[str, Any]]
) -> Dict[str, float]:
    """
    Evaluate how effectively steering changes model outputs.

    Delegates to ``_evaluate_steering_configuration`` with a fixed split
    ratio of 0.8 and derives every reported metric from the single score it
    returns.

    Args:
        task_name: Task being evaluated
        steering_method: Steering method being used
        layer: Steering layer
        strength: Steering strength
        method_params: Method-specific parameters, forwarded to the
            configuration evaluation
        test_samples: Test samples; only their count is used, as the sample
            limit of the evaluation run

    Returns:
        Dictionary with ``effectiveness_score``, ``accuracy``,
        ``consistency`` and ``direction_accuracy``
    """
    # method_params used to be dropped here, so every evaluation silently ran
    # with the method's default settings regardless of what the caller asked.
    score = self._evaluate_steering_configuration(
        task_name=task_name,
        method=steering_method,
        layer=layer,
        strength=strength,
        limit=len(test_samples),
        split_ratio=0.8,
        method_params=method_params
    )

    return {
        'effectiveness_score': score,
        'accuracy': score,  # For now, use the same score
        'consistency': 1.0 if score > 0.5 else 0.5,
        'direction_accuracy': score
    }
970
+
971
+
972
+ # Convenience functions for CLI integration
973
def _coerce_steering_method_kwarg(kwargs: Dict[str, Any]) -> None:
    """Replace a string ``steering_method`` entry with the SteeringMethod member of that name."""
    method = kwargs.get('steering_method')
    if isinstance(method, str):
        # Lookup is by member NAME (e.g. "CAA"); an unknown name raises KeyError.
        kwargs['steering_method'] = SteeringMethod[method]


def run_steering_optimization(
    model_name: str,
    optimization_type: str = "auto",
    task_name: str = None,
    limit: int = 100,
    device: str = None,
    verbose: bool = False,
    use_classification_config: bool = True,
    **kwargs
) -> Union[SteeringOptimizationResult, SteeringOptimizationSummary, Dict[str, Any]]:
    """
    Convenience function to run steering optimization.

    Args:
        model_name: Model to optimize steering for
        optimization_type: Type of optimization ("auto", "method_comparison", "layer", "strength", "comprehensive")
        task_name: Task to optimize for. Required for "method_comparison",
            "layer" and "strength"; optional for "auto"; for "comprehensive"
            it is forwarded as the single-entry ``tasks`` list unless the
            caller passes ``tasks`` explicitly.
        limit: Sample limit
        device: Device to use
        verbose: Enable verbose logging
        use_classification_config: Whether to use existing classification config as starting point
            (only consulted for "auto")
        **kwargs: Additional arguments for specific optimization types. For
            "layer" and "strength", a string ``steering_method`` is converted
            to the SteeringMethod member with that name.

    Returns:
        SteeringOptimizationResult, SteeringOptimizationSummary, or auto-optimization results

    Raises:
        ValueError: If ``optimization_type`` is unknown, or ``task_name`` is
            missing for a type that requires it
        KeyError: If a string ``steering_method`` names no SteeringMethod member
    """
    if optimization_type == "auto":
        # The auto path builds its own optimizer, so none is created here.
        return run_auto_steering_optimization(
            model_name=model_name,
            task_name=task_name,
            limit=limit,
            device=device,
            verbose=verbose,
            use_classification_config=use_classification_config,
            **kwargs
        )

    # Validate the request before paying for optimizer construction.
    missing_task_errors = {
        "method_comparison": "task_name required for method comparison",
        "layer": "task_name required for layer optimization",
        "strength": "task_name required for strength optimization",
    }
    if optimization_type != "comprehensive" and optimization_type not in missing_task_errors:
        raise ValueError(f"Unknown optimization type: {optimization_type}")
    if optimization_type in missing_task_errors and not task_name:
        raise ValueError(missing_task_errors[optimization_type])

    optimizer = SteeringOptimizer(
        model_name=model_name,
        device=device,
        verbose=verbose
    )

    if optimization_type == "method_comparison":
        return optimizer.optimize_steering_method_comparison(
            task_name=task_name,
            limit=limit,
            **kwargs
        )

    if optimization_type == "comprehensive":
        # task_name used to be dropped silently on this path; honour it
        # unless the caller supplied an explicit task list.
        if task_name and 'tasks' not in kwargs:
            kwargs['tasks'] = [task_name]
        return optimizer.run_comprehensive_steering_optimization(
            limit=limit,
            **kwargs
        )

    # "layer" and "strength" both accept the steering method as a string.
    _coerce_steering_method_kwarg(kwargs)

    if optimization_type == "layer":
        return optimizer.optimize_steering_layer(
            task_name=task_name,
            limit=limit,
            **kwargs
        )

    return optimizer.optimize_steering_strength(
        task_name=task_name,
        limit=limit,
        **kwargs
    )
1057
+
1058
+
1059
def run_auto_steering_optimization(
    model_name: str,
    task_name: Optional[str] = None,
    limit: int = 100,
    device: str = None,
    verbose: bool = False,
    use_classification_config: bool = True,
    max_time_minutes: float = 60.0,
    methods_to_test: Optional[List[str]] = None,
    strength_range: Optional[List[float]] = None,
    layer_range: Optional[str] = None
) -> Dict[str, Any]:
    """
    Automatically optimize steering configuration.

    This function can work either standalone or building on existing classification config.

    Args:
        model_name: Model to optimize
        task_name: Specific task to optimize (required if no classification config)
        limit: Sample limit per evaluation
        device: Device to use
        verbose: Enable verbose logging
        use_classification_config: Use classification layer as starting point
        max_time_minutes: Maximum time for optimization; divided evenly
            between the tasks
        methods_to_test: Names of steering methods to test ("CAA", "HPR",
            "DAC", "BiPO", "KSteering"); unknown names are logged and
            skipped. When None, the configurations returned by
            ``get_default_steering_configs()`` are used.
        strength_range: List of strengths to test (defaults to [0.5, 1.0, 1.5, 2.0])
        layer_range: Explicit layer range to search (e.g. "0-5" or "0,2,4")

    Returns:
        Dictionary with ``model_name``, ``optimization_date``,
        ``tasks_optimized`` (one entry per task, carrying either the best
        configuration or an ``error``), ``overall_best`` and
        ``config_saved`` (plus ``config_path`` when every task succeeded).
        When no task can be determined, a dictionary with only an ``error``
        key is returned instead.
    """
    optimizer = SteeringOptimizer(
        model_name=model_name,
        device=device,
        verbose=verbose
    )

    # Load classification config if requested
    config_manager = ModelConfigManager()
    classification_config = None
    if use_classification_config:
        classification_config = config_manager.load_model_config(model_name)
        if not classification_config:
            logger.info("ā„¹ļø No classification config found, proceeding with standalone steering optimization")

    # Determine tasks to optimize
    if task_name:
        tasks_to_optimize = [task_name]
    elif classification_config:
        # Start empty so the checks below are safe when the config has no
        # 'task_specific_overrides' section (previously an UnboundLocalError).
        tasks_to_optimize = []

        # First try task-specific overrides
        if 'task_specific_overrides' in classification_config:
            tasks_to_optimize = list(classification_config['task_specific_overrides'].keys())

        # If no task-specific overrides, check for tasks from optimization metrics
        if not tasks_to_optimize and 'optimization_metrics' in classification_config:
            # Try to get tasks from sample sizes
            if 'optimal_sample_sizes' in classification_config:
                tasks_to_optimize = list(classification_config['optimal_sample_sizes'].keys())

        if not tasks_to_optimize:
            return {"error": "No task specified and no classification tasks found in config"}
    else:
        # Require explicit task name if no classification config
        return {"error": "Task name required when not using classification config"}

    # Default methods and strengths
    if methods_to_test is None:
        # Use all default configurations
        method_configs = get_default_steering_configs()
    else:
        # Each requested method name expands to one or two named variants:
        # (config name, steering method, method-specific parameters).
        variants_by_method = {
            "CAA": [
                ("CAA", SteeringMethod.CAA, {}),
                ("CAA_L2", SteeringMethod.CAA, {"normalization_method": "l2_unit"}),
            ],
            "HPR": [
                ("HPR", SteeringMethod.HPR, {"hpr_beta": 1.0}),
                ("HPR_Beta0.5", SteeringMethod.HPR, {"hpr_beta": 0.5}),
            ],
            "DAC": [
                ("DAC", SteeringMethod.DAC, {"dac_dynamic_control": True, "dac_entropy_threshold": 1.0}),
            ],
            "BiPO": [
                ("BiPO", SteeringMethod.BIPO, {"bipo_beta": 0.1, "bipo_epochs": 50}),
                ("BiPO_Beta0.05", SteeringMethod.BIPO, {"bipo_beta": 0.05, "bipo_epochs": 50}),
            ],
            "KSteering": [
                ("KSteering", SteeringMethod.KSTEERING, {"ksteering_alpha": 5.0, "ksteering_target_labels": "0", "ksteering_avoid_labels": ""}),
                ("KSteering_Alpha3", SteeringMethod.KSTEERING, {"ksteering_alpha": 3.0, "ksteering_target_labels": "0", "ksteering_avoid_labels": ""}),
            ],
        }
        method_configs = []
        for method in methods_to_test:
            variants = variants_by_method.get(method)
            if variants is None:
                logger.warning(f"Unknown steering method: {method}")
                continue
            for config_name, steering_method, params in variants:
                method_configs.append(SteeringMethodConfig(config_name, steering_method, params))

    if strength_range is None:
        strength_range = [0.5, 1.0, 1.5, 2.0]

    if verbose:
        logger.info(f"šŸš€ Starting automatic steering optimization")
        logger.info(f" Model: {model_name}")
        logger.info(f" Tasks: {tasks_to_optimize}")
        logger.info(f" Method configurations: {[cfg.name for cfg in method_configs]}")
        logger.info(f" Time limit: {max_time_minutes} minutes")

    results = {
        'model_name': model_name,
        'optimization_date': datetime.now().isoformat(),
        'tasks_optimized': [],
        'overall_best': None,
        'config_saved': False
    }

    # Optimize each task; tasks_to_optimize is guaranteed non-empty here.
    time_per_task = max_time_minutes / len(tasks_to_optimize)

    for task in tasks_to_optimize:
        if verbose:
            logger.info(f"\nšŸ“Š Optimizing steering for task: {task}")

        # Determine layer range
        task_layer_range = layer_range  # Use provided layer range
        if not task_layer_range and classification_config and use_classification_config:
            # Only use classification config if no explicit layer range provided
            task_overrides = classification_config.get('task_specific_overrides', {}).get(task, {})
            class_layer = task_overrides.get('classification_layer')

            # Compare against None rather than truthiness so that layer 0 is
            # treated as a real layer instead of "not configured".
            if class_layer is None:
                # Use global classification layer
                class_layer = classification_config.get('optimal_parameters', {}).get('classification_layer')

            if class_layer is not None:
                # Search around classification layer
                task_layer_range = f"{max(0, class_layer-2)}-{class_layer+2}"
                if verbose:
                    logger.info(f" Using layer range around classification layer {class_layer}: {task_layer_range}")

        # If still no layer range, use default based on model type
        if not task_layer_range:
            # Default to searching early to middle layers
            task_layer_range = "0-5"
            if verbose:
                logger.info(f" Using default layer range: {task_layer_range}")

        # Run optimization for this task
        try:
            summary = optimizer.optimize_steering_method_comparison(
                task_name=task,
                methods_to_test=method_configs,
                layer_range=task_layer_range,
                strength_range=strength_range,
                limit=limit,
                max_time_minutes=time_per_task
            )

            # Store results
            task_result = {
                'task': task,
                'best_method': summary.best_overall_method,
                'best_layer': summary.best_overall_layer,
                'best_strength': summary.best_overall_strength,
                'score': summary.task_results[0].steering_effectiveness_score if summary.task_results else 0.0
            }
            results['tasks_optimized'].append(task_result)

            # Update overall best
            if not results['overall_best'] or task_result['score'] > results['overall_best']['score']:
                results['overall_best'] = task_result

        except Exception as e:
            # One failing task must not abort the remaining ones; record the
            # error and keep going.
            logger.error(f"āŒ Failed to optimize task {task}: {e}")
            results['tasks_optimized'].append({
                'task': task,
                'error': str(e)
            })

    # Report the configuration as saved only when every task succeeded.
    # NOTE(review): nothing is written here; presumably the per-task
    # optimization call persists its own results - confirm.
    if results['tasks_optimized'] and not any('error' in r for r in results['tasks_optimized']):
        results['config_saved'] = True
        results['config_path'] = config_manager._get_config_path(model_name)

        # Kept inside the success branch: 'config_path' and 'overall_best'
        # are only guaranteed to be populated here.
        if verbose:
            logger.info(f"\nāœ… Steering optimization complete!")
            logger.info(f" Configuration saved to: {results['config_path']}")
            logger.info(f" Overall best: {results['overall_best']['best_method']} "
                        f"L{results['overall_best']['best_layer']} "
                        f"S{results['overall_best']['best_strength']}")

    return results
1248
+
1249
+
1250
def get_optimal_steering_params(
    model_name: str,
    task_name: Optional[str] = None
) -> Optional[Dict[str, Any]]:
    """
    Fetch the stored optimal steering parameters for a model, optionally per task.

    Thin wrapper that builds a ``SteeringOptimizer`` for ``model_name`` and
    returns the result of its ``load_optimal_steering_config``.

    Args:
        model_name: Model name
        task_name: Optional task name for task-specific params

    Returns:
        Dictionary with steering parameters or None
    """
    return SteeringOptimizer(model_name).load_optimal_steering_config(task_name)
1266
+
1267
+
1268
+ # TODO: Integration with existing steering methods
1269
+ #
1270
+ # The following integration points need to be implemented:
1271
+ #
1272
+ # 1. CAA Integration:
1273
+ # - Load existing CAA implementation from wisent.core.steering_methods.caa
1274
+ # - Implement parameter optimization for CAA vectors
1275
+ # - Measure CAA steering effectiveness
1276
+ #
1277
+ # 2. HPR Integration:
1278
+ # - Load HPR implementation and optimize beta parameter
1279
+ # - Test rotation effectiveness across different layers
1280
+ #
1281
+ # 3. DAC Integration:
1282
+ # - Optimize dynamic control parameters and entropy thresholds
1283
+ # - Test adaptive steering strength adjustment
1284
+ #
1285
+ # 4. BiPO Integration:
1286
+ # - Optimize learning parameters for preference-based steering
1287
+ # - Implement bi-directional steering evaluation
1288
+ #
1289
+ # 5. K-Steering Integration:
1290
+ # - Optimize classifier parameters and label configurations
1291
+ # - Test multi-label steering effectiveness
1292
+ #
1293
+ # 6. Effectiveness Metrics:
1294
+ # - Implement steering strength measurement
1295
+ # - Develop steering direction accuracy metrics
1296
+ # - Create steering consistency evaluation
1297
+ # - Measure classification accuracy preservation