wisent 0.5.12__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wisent might be problematic. Click here for more details.

Files changed (225) hide show
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/__init__.py +26 -0
  3. wisent/core/activations/activations.py +96 -0
  4. wisent/core/activations/activations_collector.py +71 -20
  5. wisent/core/activations/prompt_construction_strategy.py +47 -0
  6. wisent/core/agent/budget.py +2 -2
  7. wisent/core/agent/device_benchmarks.py +1 -1
  8. wisent/core/agent/diagnose/classifier_marketplace.py +8 -8
  9. wisent/core/agent/diagnose/response_diagnostics.py +4 -4
  10. wisent/core/agent/diagnose/synthetic_classifier_option.py +1 -1
  11. wisent/core/agent/diagnose/tasks/task_manager.py +3 -3
  12. wisent/core/agent/diagnose.py +2 -1
  13. wisent/core/autonomous_agent.py +10 -2
  14. wisent/core/benchmark_extractors.py +293 -0
  15. wisent/core/bigcode_integration.py +20 -7
  16. wisent/core/branding.py +108 -0
  17. wisent/core/cli/__init__.py +15 -0
  18. wisent/core/cli/create_steering_vector.py +138 -0
  19. wisent/core/cli/evaluate_responses.py +715 -0
  20. wisent/core/cli/generate_pairs.py +128 -0
  21. wisent/core/cli/generate_pairs_from_task.py +119 -0
  22. wisent/core/cli/generate_responses.py +129 -0
  23. wisent/core/cli/generate_vector_from_synthetic.py +149 -0
  24. wisent/core/cli/generate_vector_from_task.py +147 -0
  25. wisent/core/cli/get_activations.py +191 -0
  26. wisent/core/cli/optimize_classification.py +339 -0
  27. wisent/core/cli/optimize_steering.py +364 -0
  28. wisent/core/cli/tasks.py +182 -0
  29. wisent/core/cli_logger.py +22 -0
  30. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +27 -1
  31. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +49 -1
  32. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +115 -0
  33. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +115 -0
  34. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +111 -0
  35. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +119 -0
  36. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +115 -0
  37. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +114 -0
  38. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +118 -0
  39. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +146 -0
  40. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +129 -0
  41. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +119 -0
  42. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +112 -0
  43. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +113 -0
  44. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livecodebench.py +367 -0
  45. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +115 -0
  46. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +114 -0
  47. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +113 -0
  48. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +112 -0
  49. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +111 -0
  50. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +114 -0
  51. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +113 -0
  52. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +115 -0
  53. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +111 -0
  54. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +113 -0
  55. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +113 -0
  56. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +112 -0
  57. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +116 -0
  58. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +115 -0
  59. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +111 -0
  60. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +111 -0
  61. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +121 -0
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +121 -0
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +111 -0
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +110 -0
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +114 -0
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad2.py +124 -0
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py +111 -0
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +112 -0
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +127 -0
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_gen.py +112 -0
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +117 -0
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +117 -0
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +127 -0
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +119 -0
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +1 -1
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +111 -0
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +114 -0
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +112 -0
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +114 -0
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +114 -0
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +1 -1
  82. wisent/core/data_loaders/__init__.py +235 -0
  83. wisent/core/data_loaders/loaders/lm_loader.py +2 -2
  84. wisent/core/data_loaders/loaders/task_interface_loader.py +300 -0
  85. wisent/{cli/data_loaders/data_loader_rotator.py → core/data_loaders/rotator.py} +1 -1
  86. wisent/core/download_full_benchmarks.py +79 -2
  87. wisent/core/evaluators/benchmark_specific/__init__.py +26 -0
  88. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/evaluator.py +17 -17
  89. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/cpp_sanitizer.py +2 -2
  90. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/java_sanitizer.py +2 -2
  91. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/python_sanitizer.py +2 -2
  92. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/__init__.py +3 -0
  93. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py +305 -0
  94. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/runtime.py +36 -4
  95. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/entrypoint.py +2 -4
  96. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/recipes.py +1 -1
  97. wisent/core/evaluators/benchmark_specific/coding/solution_generator.py +258 -0
  98. wisent/core/evaluators/benchmark_specific/exact_match_evaluator.py +79 -0
  99. wisent/core/evaluators/benchmark_specific/f1_evaluator.py +101 -0
  100. wisent/core/evaluators/benchmark_specific/generation_evaluator.py +197 -0
  101. wisent/core/{log_likelihoods_evaluator.py → evaluators/benchmark_specific/log_likelihoods_evaluator.py} +10 -2
  102. wisent/core/evaluators/benchmark_specific/perplexity_evaluator.py +140 -0
  103. wisent/core/evaluators/benchmark_specific/personalization_evaluator.py +250 -0
  104. wisent/{cli/evaluators/evaluator_rotator.py → core/evaluators/rotator.py} +4 -4
  105. wisent/core/lm_eval_harness_ground_truth.py +3 -2
  106. wisent/core/main.py +57 -0
  107. wisent/core/model_persistence.py +2 -2
  108. wisent/core/models/wisent_model.py +6 -6
  109. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  110. wisent/core/optuna/steering/steering_optimization.py +1 -1
  111. wisent/core/parser_arguments/__init__.py +10 -0
  112. wisent/core/parser_arguments/agent_parser.py +110 -0
  113. wisent/core/parser_arguments/configure_model_parser.py +7 -0
  114. wisent/core/parser_arguments/create_steering_vector_parser.py +59 -0
  115. wisent/core/parser_arguments/evaluate_parser.py +40 -0
  116. wisent/core/parser_arguments/evaluate_responses_parser.py +10 -0
  117. wisent/core/parser_arguments/full_optimize_parser.py +115 -0
  118. wisent/core/parser_arguments/generate_pairs_from_task_parser.py +33 -0
  119. wisent/core/parser_arguments/generate_pairs_parser.py +29 -0
  120. wisent/core/parser_arguments/generate_responses_parser.py +15 -0
  121. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +127 -0
  122. wisent/core/parser_arguments/generate_vector_from_task_parser.py +127 -0
  123. wisent/core/parser_arguments/generate_vector_parser.py +90 -0
  124. wisent/core/parser_arguments/get_activations_parser.py +90 -0
  125. wisent/core/parser_arguments/main_parser.py +152 -0
  126. wisent/core/parser_arguments/model_config_parser.py +59 -0
  127. wisent/core/parser_arguments/monitor_parser.py +17 -0
  128. wisent/core/parser_arguments/multi_steer_parser.py +47 -0
  129. wisent/core/parser_arguments/optimize_classification_parser.py +67 -0
  130. wisent/core/parser_arguments/optimize_sample_size_parser.py +58 -0
  131. wisent/core/parser_arguments/optimize_steering_parser.py +147 -0
  132. wisent/core/parser_arguments/synthetic_parser.py +93 -0
  133. wisent/core/parser_arguments/tasks_parser.py +584 -0
  134. wisent/core/parser_arguments/test_nonsense_parser.py +26 -0
  135. wisent/core/parser_arguments/utils.py +111 -0
  136. wisent/core/prompts/core/prompt_formater.py +3 -3
  137. wisent/core/prompts/prompt_stratiegies/direct_completion.py +2 -0
  138. wisent/core/prompts/prompt_stratiegies/instruction_following.py +2 -0
  139. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +2 -0
  140. wisent/core/prompts/prompt_stratiegies/role_playing.py +2 -0
  141. wisent/{cli/steering_methods/steering_rotator.py → core/steering_methods/rotator.py} +4 -4
  142. wisent/core/steering_optimizer.py +45 -21
  143. wisent/{synthetic → core/synthetic}/cleaners/deduper_cleaner.py +3 -3
  144. wisent/{synthetic → core/synthetic}/cleaners/methods/base_dedupers.py +2 -2
  145. wisent/{synthetic → core/synthetic}/cleaners/methods/base_refusalers.py +1 -1
  146. wisent/{synthetic → core/synthetic}/cleaners/pairs_cleaner.py +5 -5
  147. wisent/{synthetic → core/synthetic}/cleaners/refusaler_cleaner.py +4 -4
  148. wisent/{synthetic → core/synthetic}/db_instructions/mini_dp.py +1 -1
  149. wisent/{synthetic → core/synthetic}/generators/diversities/methods/fast_diversity.py +1 -1
  150. wisent/{synthetic → core/synthetic}/generators/pairs_generator.py +38 -12
  151. wisent/core/tasks/livecodebench_task.py +4 -103
  152. wisent/core/timing_calibration.py +1 -1
  153. {wisent-0.5.12.dist-info → wisent-0.5.13.dist-info}/METADATA +3 -3
  154. wisent-0.5.13.dist-info/RECORD +294 -0
  155. wisent-0.5.13.dist-info/entry_points.txt +2 -0
  156. wisent/benchmarks/coding/providers/livecodebench/provider.py +0 -53
  157. wisent/classifiers/core/atoms.py +0 -747
  158. wisent/classifiers/models/logistic.py +0 -29
  159. wisent/classifiers/models/mlp.py +0 -47
  160. wisent/cli/classifiers/classifier_rotator.py +0 -137
  161. wisent/cli/cli_logger.py +0 -142
  162. wisent/cli/wisent_cli/commands/help_cmd.py +0 -52
  163. wisent/cli/wisent_cli/commands/listing.py +0 -154
  164. wisent/cli/wisent_cli/commands/train_cmd.py +0 -322
  165. wisent/cli/wisent_cli/main.py +0 -93
  166. wisent/cli/wisent_cli/shell.py +0 -80
  167. wisent/cli/wisent_cli/ui.py +0 -69
  168. wisent/cli/wisent_cli/util/aggregations.py +0 -43
  169. wisent/cli/wisent_cli/util/parsing.py +0 -126
  170. wisent/cli/wisent_cli/version.py +0 -4
  171. wisent/opti/methods/__init__.py +0 -0
  172. wisent/synthetic/__init__.py +0 -0
  173. wisent/synthetic/cleaners/__init__.py +0 -0
  174. wisent/synthetic/cleaners/core/__init__.py +0 -0
  175. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  176. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  177. wisent/synthetic/db_instructions/__init__.py +0 -0
  178. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  179. wisent/synthetic/generators/__init__.py +0 -0
  180. wisent/synthetic/generators/core/__init__.py +0 -0
  181. wisent/synthetic/generators/diversities/__init__.py +0 -0
  182. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  183. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  184. wisent-0.5.12.dist-info/RECORD +0 -220
  185. /wisent/{benchmarks → core/evaluators/benchmark_specific/coding}/__init__.py +0 -0
  186. /wisent/{benchmarks/coding → core/evaluators/benchmark_specific/coding/metrics}/__init__.py +0 -0
  187. /wisent/{benchmarks/coding/metrics → core/evaluators/benchmark_specific/coding/metrics/core}/__init__.py +0 -0
  188. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/core/atoms.py +0 -0
  189. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/passk.py +0 -0
  190. /wisent/{benchmarks/coding/metrics/core → core/evaluators/benchmark_specific/coding/output_sanitizer}/__init__.py +0 -0
  191. /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/output_sanitizer/core}/__init__.py +0 -0
  192. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/core/atoms.py +0 -0
  193. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/utils.py +0 -0
  194. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/__init__.py +0 -0
  195. /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/providers}/core/__init__.py +0 -0
  196. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/core/atoms.py +0 -0
  197. /wisent/{benchmarks/coding/providers/core → core/evaluators/benchmark_specific/coding/safe_docker}/__init__.py +0 -0
  198. /wisent/{benchmarks/coding/providers/livecodebench → core/evaluators/benchmark_specific/coding/safe_docker/core}/__init__.py +0 -0
  199. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/atoms.py +0 -0
  200. /wisent/{benchmarks/coding/safe_docker → core/opti}/__init__.py +0 -0
  201. /wisent/{benchmarks/coding/safe_docker → core/opti}/core/__init__.py +0 -0
  202. /wisent/{opti → core/opti}/core/atoms.py +0 -0
  203. /wisent/{classifiers → core/opti/methods}/__init__.py +0 -0
  204. /wisent/{opti → core/opti}/methods/opti_classificator.py +0 -0
  205. /wisent/{opti → core/opti}/methods/opti_steering.py +0 -0
  206. /wisent/{classifiers/core → core/synthetic}/__init__.py +0 -0
  207. /wisent/{classifiers/models → core/synthetic/cleaners}/__init__.py +0 -0
  208. /wisent/{cli → core/synthetic/cleaners/core}/__init__.py +0 -0
  209. /wisent/{synthetic → core/synthetic}/cleaners/core/atoms.py +0 -0
  210. /wisent/{cli/classifiers → core/synthetic/cleaners/methods}/__init__.py +0 -0
  211. /wisent/{cli/data_loaders → core/synthetic/cleaners/methods/core}/__init__.py +0 -0
  212. /wisent/{synthetic → core/synthetic}/cleaners/methods/core/atoms.py +0 -0
  213. /wisent/{cli/evaluators → core/synthetic/db_instructions}/__init__.py +0 -0
  214. /wisent/{cli/steering_methods → core/synthetic/db_instructions/core}/__init__.py +0 -0
  215. /wisent/{synthetic → core/synthetic}/db_instructions/core/atoms.py +0 -0
  216. /wisent/{cli/wisent_cli → core/synthetic/generators}/__init__.py +0 -0
  217. /wisent/{cli/wisent_cli/commands → core/synthetic/generators/core}/__init__.py +0 -0
  218. /wisent/{synthetic → core/synthetic}/generators/core/atoms.py +0 -0
  219. /wisent/{cli/wisent_cli/util → core/synthetic/generators/diversities}/__init__.py +0 -0
  220. /wisent/{opti → core/synthetic/generators/diversities/core}/__init__.py +0 -0
  221. /wisent/{synthetic → core/synthetic}/generators/diversities/core/core.py +0 -0
  222. /wisent/{opti/core → core/synthetic/generators/diversities/methods}/__init__.py +0 -0
  223. {wisent-0.5.12.dist-info → wisent-0.5.13.dist-info}/WHEEL +0 -0
  224. {wisent-0.5.12.dist-info → wisent-0.5.13.dist-info}/licenses/LICENSE +0 -0
  225. {wisent-0.5.12.dist-info → wisent-0.5.13.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,147 @@
1
+ """Parser setup for the 'optimize-steering' command."""
2
+
3
+
4
# Steering methods accepted by every --method / --methods option below.
# Kept in one place so the five subcommands cannot drift apart.
_STEERING_METHODS = ("CAA", "HPR", "DAC", "BiPO", "KSteering")


def _add_model_argument(subparser):
    """Add the required positional ``model`` argument."""
    subparser.add_argument("model", type=str, help="Model name or path")


def _add_task_argument(subparser, help_text, default="truthfulqa_mc1"):
    """Add the single-task ``--task`` option with the given help text and default."""
    subparser.add_argument("--task", type=str, default=default, help=help_text)


def _add_methods_argument(subparser, help_text):
    """Add the multi-valued ``--methods`` option (defaults to CAA and HPR)."""
    subparser.add_argument(
        "--methods",
        type=str,
        nargs="+",
        # Fresh list objects per call so no two actions share mutable state.
        choices=list(_STEERING_METHODS),
        default=["CAA", "HPR"],
        help=help_text,
    )


def _add_method_argument(subparser):
    """Add the single-valued ``--method`` option (defaults to CAA)."""
    subparser.add_argument(
        "--method",
        type=str,
        default="CAA",
        choices=list(_STEERING_METHODS),
        help="Steering method to use (default: CAA)",
    )


def _add_limit_argument(subparser, help_text="Maximum samples for testing (default: 100)"):
    """Add the ``--limit`` sample-count option (default 100)."""
    subparser.add_argument("--limit", type=int, default=100, help=help_text)


def _add_runtime_arguments(subparser):
    """Add the ``--device`` and ``--verbose`` options shared by all subcommands."""
    subparser.add_argument("--device", type=str, default=None, help="Device to run on")
    subparser.add_argument("--verbose", action="store_true", help="Enable verbose output")


def setup_steering_optimizer_parser(parser):
    """Set up the steering-optimizer subcommand parser.

    Registers five subcommands on ``parser``; the chosen one is stored in the
    ``steering_action`` attribute of the parsed namespace:

    * ``comprehensive``     - optimize over several tasks and methods
    * ``compare-methods``   - compare steering methods on one task
    * ``optimize-layer``    - search for the steering layer of one method
    * ``optimize-strength`` - search for the steering strength of one method
    * ``auto``              - optimize based on the classification config

    Every subcommand takes a positional ``model`` plus ``--limit``,
    ``--device`` and ``--verbose``.

    Args:
        parser: The ``argparse`` parser (typically the ``optimize-steering``
            subparser) to attach the subcommands to. It is modified in place.

    Returns:
        None.
    """
    # The subcommand is not marked as required, so ``steering_action`` is
    # ``None`` when the user supplies no action.
    steering_subparsers = parser.add_subparsers(dest="steering_action", help="Steering optimization actions")

    # --- comprehensive -----------------------------------------------------
    comprehensive_parser = steering_subparsers.add_parser(
        "comprehensive", help="Run comprehensive steering optimization"
    )
    _add_model_argument(comprehensive_parser)
    comprehensive_parser.add_argument(
        "--tasks",
        type=str,
        nargs="+",
        default=None,
        help="Tasks to optimize (defaults to classification-optimized tasks)",
    )
    _add_methods_argument(comprehensive_parser, "Steering methods to test")
    _add_limit_argument(comprehensive_parser, "Sample limit per task (default: 100)")
    comprehensive_parser.add_argument(
        "--max-time-per-task", type=float, default=20.0, help="Time limit per task in minutes (default: 20.0)"
    )
    comprehensive_parser.add_argument("--no-save", action="store_true", help="Don't save results to model config")
    _add_runtime_arguments(comprehensive_parser)

    # --- compare-methods ---------------------------------------------------
    method_parser = steering_subparsers.add_parser(
        "compare-methods", help="Compare different steering methods for a task"
    )
    _add_model_argument(method_parser)
    _add_task_argument(method_parser, "Task to optimize steering for (default: truthfulqa_mc1)")
    _add_methods_argument(method_parser, "Steering methods to compare")
    _add_limit_argument(method_parser)
    method_parser.add_argument(
        "--max-time", type=float, default=30.0, help="Maximum optimization time in minutes (default: 30.0)"
    )
    _add_runtime_arguments(method_parser)

    # --- optimize-layer ----------------------------------------------------
    layer_parser = steering_subparsers.add_parser("optimize-layer", help="Find optimal steering layer for a method")
    _add_model_argument(layer_parser)
    _add_task_argument(layer_parser, "Task to optimize for (default: truthfulqa_mc1)")
    _add_method_argument(layer_parser)
    layer_parser.add_argument("--layer-range", type=str, default=None, help="Layer range to search (e.g., '10-20')")
    layer_parser.add_argument(
        "--strength", type=float, default=1.0, help="Fixed steering strength during layer search (default: 1.0)"
    )
    _add_limit_argument(layer_parser)
    _add_runtime_arguments(layer_parser)

    # --- optimize-strength -------------------------------------------------
    strength_parser = steering_subparsers.add_parser("optimize-strength", help="Find optimal steering strength")
    _add_model_argument(strength_parser)
    _add_task_argument(strength_parser, "Task to optimize for (default: truthfulqa_mc1)")
    _add_method_argument(strength_parser)
    strength_parser.add_argument(
        "--layer", type=int, default=None, help="Steering layer to use (defaults to classification layer)"
    )
    # Here --strength-range is exactly two values (min, max) ...
    strength_parser.add_argument(
        "--strength-range",
        type=float,
        nargs=2,
        default=[0.1, 2.0],
        help="Min and max strength to test (default: 0.1 2.0)",
    )
    strength_parser.add_argument(
        "--strength-steps", type=int, default=10, help="Number of strength values to test (default: 10)"
    )
    _add_limit_argument(strength_parser)
    _add_runtime_arguments(strength_parser)

    # --- auto --------------------------------------------------------------
    auto_parser = steering_subparsers.add_parser(
        "auto", help="Automatically optimize steering based on classification config"
    )
    _add_model_argument(auto_parser)
    _add_task_argument(
        auto_parser,
        "Specific task to optimize (defaults to all classification-optimized tasks)",
        default=None,
    )
    _add_methods_argument(auto_parser, "Steering methods to test (default: CAA, HPR)")
    _add_limit_argument(auto_parser)
    auto_parser.add_argument("--max-time", type=float, default=60.0, help="Maximum time in minutes (default: 60)")
    # ... whereas for `auto` it is an explicit list of one or more strengths.
    auto_parser.add_argument(
        "--strength-range",
        type=float,
        nargs="+",
        default=[0.5, 1.0, 1.5, 2.0],
        help="Steering strengths to test (default: 0.5 1.0 1.5 2.0)",
    )
    auto_parser.add_argument(
        "--layer-range",
        type=str,
        default=None,
        help="Explicit layer range to search (e.g., '0-5' or '0,2,4'). If not specified, uses classification layer or defaults to 0-5",
    )
    _add_runtime_arguments(auto_parser)
147
+
@@ -0,0 +1,93 @@
1
+ """Parser setup for the 'synthetic' command."""
2
+
3
+
4
def setup_synthetic_parser(parser):
    """Register the arguments of the 'synthetic' command on ``parser``.

    Exactly one of ``--trait`` (generate new contrastive pairs) or
    ``--pairs-file`` (load existing pairs) must be given. The remaining
    options cover pair generation, model/device selection, steering
    settings, output, K-Steering parameters and nonsense detection.

    Args:
        parser: The ``argparse`` parser to populate; modified in place.

    Returns:
        None.
    """
    # Pair source: the two options are mutually exclusive and one is required.
    pair_source = parser.add_mutually_exclusive_group(required=True)
    pair_source.add_argument(
        "--trait",
        type=str,
        help="Natural language description of the desired trait or behavior (generates new pairs)",
    )
    pair_source.add_argument(
        "--pairs-file",
        type=str,
        help="Path to existing JSON file with contrastive pairs",
    )

    # (flag, add_argument keyword arguments), listed in registration order.
    option_specs = (
        # Generation parameters (only used if --trait is specified)
        (
            "--num-pairs",
            {
                "type": int,
                "default": 30,
                "help": "Number of contrastive pairs to generate (default: 30, only used with --trait)",
            },
        ),
        (
            "--save-pairs",
            {
                "type": str,
                "default": None,
                "help": "Save generated pairs to this file (optional, only used with --trait)",
            },
        ),
        # Model and device
        (
            "--model",
            {"type": str, "default": "meta-llama/Llama-3.1-8B-Instruct", "help": "Model name or path"},
        ),
        ("--device", {"type": str, "default": None, "help": "Device to run on"}),
        # Training/evaluation parameters
        ("--layer", {"type": str, "default": "15", "help": "Layer(s) to extract activations from"}),
        (
            "--steering-method",
            {
                "type": str,
                "default": "CAA",
                "choices": ["CAA", "HPR", "DAC", "BiPO", "KSteering"],
                "help": "Steering method to use",
            },
        ),
        (
            "--steering-strength",
            {"type": float, "default": 1.0, "help": "Strength of steering vector application"},
        ),
        (
            "--test-questions",
            {"type": int, "default": 5, "help": "Number of test questions to generate for evaluation"},
        ),
        # Output
        ("--output", {"type": str, "default": "./results", "help": "Output directory for results"}),
        ("--verbose", {"action": "store_true", "help": "Enable verbose output"}),
        # K-Steering specific parameters
        (
            "--ksteering-target-labels",
            {"type": str, "default": "0", "help": "Comma-separated target label indices for K-steering"},
        ),
        (
            "--ksteering-avoid-labels",
            {"type": str, "default": "", "help": "Comma-separated avoid label indices for K-steering"},
        ),
        ("--ksteering-alpha", {"type": float, "default": 50.0, "help": "Alpha parameter for K-steering"}),
        # Nonsense detection options
        (
            "--enable-nonsense-detection",
            {"action": "store_true", "help": "Enable nonsense detection to stop lobotomized responses"},
        ),
        (
            "--max-word-length",
            {
                "type": int,
                "default": 20,
                "help": "Maximum reasonable word length for nonsense detection (default: 20)",
            },
        ),
        (
            "--repetition-threshold",
            {
                "type": float,
                "default": 0.7,
                "help": "Threshold for repetitive content detection (0-1, default: 0.7)",
            },
        ),
        (
            "--gibberish-threshold",
            {
                "type": float,
                "default": 0.3,
                "help": "Threshold for gibberish word detection (0-1, default: 0.3)",
            },
        ),
        (
            "--disable-dictionary-check",
            {
                "action": "store_true",
                "help": "Disable dictionary-based word validation (faster but less accurate)",
            },
        ),
        (
            "--nonsense-action",
            {
                "type": str,
                "default": "regenerate",
                "choices": ["regenerate", "stop", "flag"],
                "help": "Action when nonsense is detected: regenerate, stop generation, or flag for review",
            },
        ),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)