wisent 0.5.11__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wisent might be problematic. Click here for more details.

Files changed (225) hide show
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/__init__.py +26 -0
  3. wisent/core/activations/activations.py +96 -0
  4. wisent/core/activations/activations_collector.py +71 -20
  5. wisent/core/activations/prompt_construction_strategy.py +47 -0
  6. wisent/core/agent/budget.py +2 -2
  7. wisent/core/agent/device_benchmarks.py +1 -1
  8. wisent/core/agent/diagnose/classifier_marketplace.py +8 -8
  9. wisent/core/agent/diagnose/response_diagnostics.py +4 -4
  10. wisent/core/agent/diagnose/synthetic_classifier_option.py +1 -1
  11. wisent/core/agent/diagnose/tasks/task_manager.py +3 -3
  12. wisent/core/agent/diagnose.py +2 -1
  13. wisent/core/autonomous_agent.py +10 -2
  14. wisent/core/benchmark_extractors.py +293 -0
  15. wisent/core/bigcode_integration.py +20 -7
  16. wisent/core/branding.py +108 -0
  17. wisent/core/cli/__init__.py +15 -0
  18. wisent/core/cli/create_steering_vector.py +138 -0
  19. wisent/core/cli/evaluate_responses.py +715 -0
  20. wisent/core/cli/generate_pairs.py +128 -0
  21. wisent/core/cli/generate_pairs_from_task.py +119 -0
  22. wisent/core/cli/generate_responses.py +129 -0
  23. wisent/core/cli/generate_vector_from_synthetic.py +149 -0
  24. wisent/core/cli/generate_vector_from_task.py +147 -0
  25. wisent/core/cli/get_activations.py +191 -0
  26. wisent/core/cli/optimize_classification.py +339 -0
  27. wisent/core/cli/optimize_steering.py +364 -0
  28. wisent/core/cli/tasks.py +182 -0
  29. wisent/core/cli_logger.py +22 -0
  30. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +27 -1
  31. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +49 -1
  32. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +115 -0
  33. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +115 -0
  34. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +111 -0
  35. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +119 -0
  36. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +115 -0
  37. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +114 -0
  38. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +118 -0
  39. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +146 -0
  40. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +129 -0
  41. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +119 -0
  42. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +112 -0
  43. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +113 -0
  44. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livecodebench.py +367 -0
  45. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +115 -0
  46. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +114 -0
  47. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +113 -0
  48. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +112 -0
  49. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +111 -0
  50. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +114 -0
  51. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +113 -0
  52. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +115 -0
  53. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +111 -0
  54. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +113 -0
  55. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +113 -0
  56. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +112 -0
  57. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +116 -0
  58. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +115 -0
  59. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +111 -0
  60. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +111 -0
  61. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +121 -0
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +121 -0
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +111 -0
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +110 -0
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +114 -0
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad2.py +124 -0
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py +111 -0
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +112 -0
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +127 -0
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_gen.py +112 -0
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +117 -0
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +117 -0
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +127 -0
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +119 -0
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +1 -1
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +111 -0
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +114 -0
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +112 -0
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +114 -0
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +114 -0
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +1 -1
  82. wisent/core/data_loaders/__init__.py +235 -0
  83. wisent/core/data_loaders/loaders/lm_loader.py +2 -2
  84. wisent/core/data_loaders/loaders/task_interface_loader.py +300 -0
  85. wisent/{cli/data_loaders/data_loader_rotator.py → core/data_loaders/rotator.py} +1 -1
  86. wisent/core/download_full_benchmarks.py +79 -2
  87. wisent/core/evaluators/benchmark_specific/__init__.py +26 -0
  88. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/evaluator.py +17 -17
  89. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/cpp_sanitizer.py +2 -2
  90. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/java_sanitizer.py +2 -2
  91. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/python_sanitizer.py +2 -2
  92. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/__init__.py +3 -0
  93. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py +305 -0
  94. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/runtime.py +36 -4
  95. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/entrypoint.py +2 -4
  96. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/recipes.py +1 -1
  97. wisent/core/evaluators/benchmark_specific/coding/solution_generator.py +258 -0
  98. wisent/core/evaluators/benchmark_specific/exact_match_evaluator.py +79 -0
  99. wisent/core/evaluators/benchmark_specific/f1_evaluator.py +101 -0
  100. wisent/core/evaluators/benchmark_specific/generation_evaluator.py +197 -0
  101. wisent/core/{log_likelihoods_evaluator.py → evaluators/benchmark_specific/log_likelihoods_evaluator.py} +10 -2
  102. wisent/core/evaluators/benchmark_specific/perplexity_evaluator.py +140 -0
  103. wisent/core/evaluators/benchmark_specific/personalization_evaluator.py +250 -0
  104. wisent/{cli/evaluators/evaluator_rotator.py → core/evaluators/rotator.py} +4 -4
  105. wisent/core/lm_eval_harness_ground_truth.py +3 -2
  106. wisent/core/main.py +57 -0
  107. wisent/core/model_persistence.py +2 -2
  108. wisent/core/models/wisent_model.py +8 -6
  109. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  110. wisent/core/optuna/steering/steering_optimization.py +1 -1
  111. wisent/core/parser_arguments/__init__.py +10 -0
  112. wisent/core/parser_arguments/agent_parser.py +110 -0
  113. wisent/core/parser_arguments/configure_model_parser.py +7 -0
  114. wisent/core/parser_arguments/create_steering_vector_parser.py +59 -0
  115. wisent/core/parser_arguments/evaluate_parser.py +40 -0
  116. wisent/core/parser_arguments/evaluate_responses_parser.py +10 -0
  117. wisent/core/parser_arguments/full_optimize_parser.py +115 -0
  118. wisent/core/parser_arguments/generate_pairs_from_task_parser.py +33 -0
  119. wisent/core/parser_arguments/generate_pairs_parser.py +29 -0
  120. wisent/core/parser_arguments/generate_responses_parser.py +15 -0
  121. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +127 -0
  122. wisent/core/parser_arguments/generate_vector_from_task_parser.py +127 -0
  123. wisent/core/parser_arguments/generate_vector_parser.py +90 -0
  124. wisent/core/parser_arguments/get_activations_parser.py +90 -0
  125. wisent/core/parser_arguments/main_parser.py +152 -0
  126. wisent/core/parser_arguments/model_config_parser.py +59 -0
  127. wisent/core/parser_arguments/monitor_parser.py +17 -0
  128. wisent/core/parser_arguments/multi_steer_parser.py +47 -0
  129. wisent/core/parser_arguments/optimize_classification_parser.py +67 -0
  130. wisent/core/parser_arguments/optimize_sample_size_parser.py +58 -0
  131. wisent/core/parser_arguments/optimize_steering_parser.py +147 -0
  132. wisent/core/parser_arguments/synthetic_parser.py +93 -0
  133. wisent/core/parser_arguments/tasks_parser.py +584 -0
  134. wisent/core/parser_arguments/test_nonsense_parser.py +26 -0
  135. wisent/core/parser_arguments/utils.py +111 -0
  136. wisent/core/prompts/core/prompt_formater.py +3 -3
  137. wisent/core/prompts/prompt_stratiegies/direct_completion.py +2 -0
  138. wisent/core/prompts/prompt_stratiegies/instruction_following.py +2 -0
  139. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +2 -0
  140. wisent/core/prompts/prompt_stratiegies/role_playing.py +2 -0
  141. wisent/{cli/steering_methods/steering_rotator.py → core/steering_methods/rotator.py} +4 -4
  142. wisent/core/steering_optimizer.py +45 -21
  143. wisent/{synthetic → core/synthetic}/cleaners/deduper_cleaner.py +3 -3
  144. wisent/{synthetic → core/synthetic}/cleaners/methods/base_dedupers.py +2 -2
  145. wisent/{synthetic → core/synthetic}/cleaners/methods/base_refusalers.py +1 -1
  146. wisent/{synthetic → core/synthetic}/cleaners/pairs_cleaner.py +5 -5
  147. wisent/{synthetic → core/synthetic}/cleaners/refusaler_cleaner.py +4 -4
  148. wisent/{synthetic → core/synthetic}/db_instructions/mini_dp.py +1 -1
  149. wisent/{synthetic → core/synthetic}/generators/diversities/methods/fast_diversity.py +1 -1
  150. wisent/{synthetic → core/synthetic}/generators/pairs_generator.py +38 -12
  151. wisent/core/tasks/livecodebench_task.py +4 -103
  152. wisent/core/timing_calibration.py +1 -1
  153. {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/METADATA +3 -3
  154. wisent-0.5.13.dist-info/RECORD +294 -0
  155. wisent-0.5.13.dist-info/entry_points.txt +2 -0
  156. wisent/benchmarks/coding/providers/livecodebench/provider.py +0 -53
  157. wisent/classifiers/core/atoms.py +0 -747
  158. wisent/classifiers/models/logistic.py +0 -29
  159. wisent/classifiers/models/mlp.py +0 -47
  160. wisent/cli/classifiers/classifier_rotator.py +0 -137
  161. wisent/cli/cli_logger.py +0 -142
  162. wisent/cli/wisent_cli/commands/help_cmd.py +0 -52
  163. wisent/cli/wisent_cli/commands/listing.py +0 -154
  164. wisent/cli/wisent_cli/commands/train_cmd.py +0 -322
  165. wisent/cli/wisent_cli/main.py +0 -93
  166. wisent/cli/wisent_cli/shell.py +0 -80
  167. wisent/cli/wisent_cli/ui.py +0 -69
  168. wisent/cli/wisent_cli/util/aggregations.py +0 -43
  169. wisent/cli/wisent_cli/util/parsing.py +0 -126
  170. wisent/cli/wisent_cli/version.py +0 -4
  171. wisent/opti/methods/__init__.py +0 -0
  172. wisent/synthetic/__init__.py +0 -0
  173. wisent/synthetic/cleaners/__init__.py +0 -0
  174. wisent/synthetic/cleaners/core/__init__.py +0 -0
  175. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  176. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  177. wisent/synthetic/db_instructions/__init__.py +0 -0
  178. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  179. wisent/synthetic/generators/__init__.py +0 -0
  180. wisent/synthetic/generators/core/__init__.py +0 -0
  181. wisent/synthetic/generators/diversities/__init__.py +0 -0
  182. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  183. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  184. wisent-0.5.11.dist-info/RECORD +0 -220
  185. /wisent/{benchmarks → core/evaluators/benchmark_specific/coding}/__init__.py +0 -0
  186. /wisent/{benchmarks/coding → core/evaluators/benchmark_specific/coding/metrics}/__init__.py +0 -0
  187. /wisent/{benchmarks/coding/metrics → core/evaluators/benchmark_specific/coding/metrics/core}/__init__.py +0 -0
  188. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/core/atoms.py +0 -0
  189. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/passk.py +0 -0
  190. /wisent/{benchmarks/coding/metrics/core → core/evaluators/benchmark_specific/coding/output_sanitizer}/__init__.py +0 -0
  191. /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/output_sanitizer/core}/__init__.py +0 -0
  192. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/core/atoms.py +0 -0
  193. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/utils.py +0 -0
  194. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/__init__.py +0 -0
  195. /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/providers}/core/__init__.py +0 -0
  196. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/core/atoms.py +0 -0
  197. /wisent/{benchmarks/coding/providers/core → core/evaluators/benchmark_specific/coding/safe_docker}/__init__.py +0 -0
  198. /wisent/{benchmarks/coding/providers/livecodebench → core/evaluators/benchmark_specific/coding/safe_docker/core}/__init__.py +0 -0
  199. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/atoms.py +0 -0
  200. /wisent/{benchmarks/coding/safe_docker → core/opti}/__init__.py +0 -0
  201. /wisent/{benchmarks/coding/safe_docker → core/opti}/core/__init__.py +0 -0
  202. /wisent/{opti → core/opti}/core/atoms.py +0 -0
  203. /wisent/{classifiers → core/opti/methods}/__init__.py +0 -0
  204. /wisent/{opti → core/opti}/methods/opti_classificator.py +0 -0
  205. /wisent/{opti → core/opti}/methods/opti_steering.py +0 -0
  206. /wisent/{classifiers/core → core/synthetic}/__init__.py +0 -0
  207. /wisent/{classifiers/models → core/synthetic/cleaners}/__init__.py +0 -0
  208. /wisent/{cli → core/synthetic/cleaners/core}/__init__.py +0 -0
  209. /wisent/{synthetic → core/synthetic}/cleaners/core/atoms.py +0 -0
  210. /wisent/{cli/classifiers → core/synthetic/cleaners/methods}/__init__.py +0 -0
  211. /wisent/{cli/data_loaders → core/synthetic/cleaners/methods/core}/__init__.py +0 -0
  212. /wisent/{synthetic → core/synthetic}/cleaners/methods/core/atoms.py +0 -0
  213. /wisent/{cli/evaluators → core/synthetic/db_instructions}/__init__.py +0 -0
  214. /wisent/{cli/steering_methods → core/synthetic/db_instructions/core}/__init__.py +0 -0
  215. /wisent/{synthetic → core/synthetic}/db_instructions/core/atoms.py +0 -0
  216. /wisent/{cli/wisent_cli → core/synthetic/generators}/__init__.py +0 -0
  217. /wisent/{cli/wisent_cli/commands → core/synthetic/generators/core}/__init__.py +0 -0
  218. /wisent/{synthetic → core/synthetic}/generators/core/atoms.py +0 -0
  219. /wisent/{cli/wisent_cli/util → core/synthetic/generators/diversities}/__init__.py +0 -0
  220. /wisent/{opti → core/synthetic/generators/diversities/core}/__init__.py +0 -0
  221. /wisent/{synthetic → core/synthetic}/generators/diversities/core/core.py +0 -0
  222. /wisent/{opti/core → core/synthetic/generators/diversities/methods}/__init__.py +0 -0
  223. {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/WHEEL +0 -0
  224. {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/licenses/LICENSE +0 -0
  225. {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,584 @@
1
+ """
2
+ Parser setup for the 'tasks' command.
3
+
4
+ This command runs evaluation tasks on language models.
5
+ """
6
+
7
+
8
+ def setup_tasks_parser(parser):
9
+ """Set up the tasks subcommand parser."""
10
+
11
+ # Task listing options (mutually exclusive with task execution)
12
+ list_group = parser.add_mutually_exclusive_group()
13
+ list_group.add_argument(
14
+ "--list-tasks",
15
+ action="store_true",
16
+ help="List all 37 available benchmark tasks organized by priority (excludes 28 known problematic benchmarks)",
17
+ )
18
+ list_group.add_argument(
19
+ "--task-info", type=str, metavar="TASK_NAME", help="Show detailed information about a specific task"
20
+ )
21
+ list_group.add_argument("--all", action="store_true", help="Run all 37 available benchmarks automatically")
22
+
23
+ # Task execution argument (optional when using listing commands or --all)
24
+ parser.add_argument(
25
+ "task_names",
26
+ nargs="?",
27
+ help="Comma-separated list of available task names (37 working benchmarks), or path to CSV/JSON file with --from-csv/--from-json (not needed with --all)",
28
+ )
29
+
30
+ # Skills/risks based task selection
31
+ parser.add_argument(
32
+ "--skills", type=str, nargs="+", help="Select tasks by skill categories (e.g., coding, mathematics, reasoning)"
33
+ )
34
+ parser.add_argument(
35
+ "--risks",
36
+ type=str,
37
+ nargs="+",
38
+ help="Select tasks by risk categories (e.g., harmfulness, toxicity, hallucination)",
39
+ )
40
+ parser.add_argument(
41
+ "--num-tasks",
42
+ type=int,
43
+ default=None,
44
+ help="Number of tasks to randomly select from matched tasks (default: all)",
45
+ )
46
+ parser.add_argument(
47
+ "--min-quality-score",
48
+ type=int,
49
+ default=2,
50
+ choices=[1, 2, 3, 4, 5],
51
+ help="Minimum quality score for tasks when using --skills/--risks (default: 2)",
52
+ )
53
+ parser.add_argument(
54
+ "--task-seed", type=int, default=None, help="Random seed for task selection (for reproducibility)"
55
+ )
56
+
57
+ # Mixed sampling from multiple benchmarks
58
+ parser.add_argument(
59
+ "--tag",
60
+ type=str,
61
+ nargs="+",
62
+ help="Sample randomly from all benchmarks with these tags (e.g., --tag coding). Creates a mixed dataset from multiple benchmarks.",
63
+ )
64
+ parser.add_argument(
65
+ "--mixed-samples",
66
+ type=int,
67
+ default=1000,
68
+ help="Total number of samples to collect when using --tag (default: 1000)",
69
+ )
70
+ parser.add_argument(
71
+ "--tag-mode",
72
+ type=str,
73
+ choices=["any", "all"],
74
+ default="any",
75
+ help="Whether benchmarks must have ANY or ALL specified tags (default: any)",
76
+ )
77
+
78
+ # Cross-benchmark evaluation
79
+ parser.add_argument(
80
+ "--train-task", type=str, help="Task/benchmark to train on (can be a task name or --tag for mixed)"
81
+ )
82
+ parser.add_argument(
83
+ "--eval-task", type=str, help="Task/benchmark to evaluate on (can be a task name or --tag for mixed)"
84
+ )
85
+ parser.add_argument(
86
+ "--train-tag", type=str, nargs="+", help="Tags for training data when using cross-benchmark evaluation"
87
+ )
88
+ parser.add_argument(
89
+ "--eval-tag", type=str, nargs="+", help="Tags for evaluation data when using cross-benchmark evaluation"
90
+ )
91
+ parser.add_argument(
92
+ "--cross-benchmark",
93
+ action="store_true",
94
+ help="Enable cross-benchmark evaluation mode (train on one, eval on another)",
95
+ )
96
+
97
+ # Synthetic pair generation
98
+ parser.add_argument(
99
+ "--synthetic", action="store_true", help="Generate synthetic contrastive pairs from a trait description"
100
+ )
101
+ parser.add_argument(
102
+ "--trait",
103
+ type=str,
104
+ help="Natural language description of desired model behavior (e.g., 'hallucinates less', 'more factual', 'less verbose')",
105
+ )
106
+ parser.add_argument(
107
+ "--num-synthetic-pairs", type=int, default=30, help="Number of synthetic pairs to generate (default: 30)"
108
+ )
109
+ parser.add_argument("--save-synthetic", type=str, help="Path to save generated synthetic pairs as JSON")
110
+ parser.add_argument(
111
+ "--load-synthetic", type=str, help="Path to load previously generated synthetic pairs from JSON"
112
+ )
113
+
114
+ parser.add_argument("--model", type=str, default="meta-llama/Llama-3.1-8B-Instruct", help="Model name or path")
115
+ parser.add_argument(
116
+ "--layer",
117
+ type=str,
118
+ default="15",
119
+ help="Layer(s) to extract activations from. Can be a single layer (15), range (14-16), or comma-separated list (14,15,16)",
120
+ )
121
+ parser.add_argument("--shots", type=int, default=0, help="Number of few-shot examples")
122
+ parser.add_argument("--split-ratio", type=float, default=0.8, help="Train/test split ratio")
123
+ parser.add_argument("--limit", type=int, default=None, help="Limit number of documents per task")
124
+ parser.add_argument(
125
+ "--training-limit",
126
+ type=int,
127
+ default=None,
128
+ help="Limit number of training documents (overrides limit for training)",
129
+ )
130
+ parser.add_argument(
131
+ "--testing-limit",
132
+ type=int,
133
+ default=None,
134
+ help="Limit number of testing documents (overrides limit for testing)",
135
+ )
136
+ parser.add_argument("--output", type=str, default="./results", help="Output directory for results")
137
+ parser.add_argument(
138
+ "--classifier-type", type=str, choices=["logistic", "mlp"], default="logistic", help="Type of classifier"
139
+ )
140
+ parser.add_argument("--max-new-tokens", type=int, default=300, help="Maximum new tokens for generation")
141
+ parser.add_argument("--device", type=str, default=None, help="Device to run on")
142
+ parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility")
143
+ parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
144
+ parser.add_argument(
145
+ "--token-aggregation",
146
+ type=str,
147
+ choices=["average", "final", "first", "max", "min"],
148
+ default="average",
149
+ help="How to aggregate token scores for classification",
150
+ )
151
+ parser.add_argument(
152
+ "--ground-truth-method",
153
+ type=str,
154
+ choices=[
155
+ "none",
156
+ "exact_match",
157
+ "substring_match",
158
+ "user_specified",
159
+ "interactive",
160
+ "manual_review",
161
+ "good",
162
+ "lm-eval-harness",
163
+ ],
164
+ default="lm-eval-harness",
165
+ help="Method for ground truth evaluation. 'lm-eval-harness' uses lm-eval-harness tasks for evaluation (default for most tasks), 'none' skips evaluation, 'exact_match' and 'substring_match' are problematic for free-form generation, 'user_specified' allows manual labeling, 'interactive' prompts for y/n labeling, 'manual_review' marks for review, 'good' marks everything as truthful (for debugging)",
166
+ )
167
+ parser.add_argument(
168
+ "--user-labels",
169
+ type=str,
170
+ nargs="*",
171
+ default=None,
172
+ help="User-specified ground truth labels for responses ('truthful' or 'hallucination'). Used with --ground-truth-method user_specified",
173
+ )
174
+
175
+ # File input arguments
176
+ parser.add_argument(
177
+ "--from-csv",
178
+ action="store_true",
179
+ help="Load task data from CSV file. Requires columns: question, correct_answer, incorrect_answer",
180
+ )
181
+ parser.add_argument(
182
+ "--from-json",
183
+ action="store_true",
184
+ help="Load task data from JSON file. Expected format: list of objects with question, correct_answer, incorrect_answer",
185
+ )
186
+ parser.add_argument(
187
+ "--question-col", type=str, default="question", help="Column name for questions in CSV file (default: question)"
188
+ )
189
+ parser.add_argument(
190
+ "--correct-col",
191
+ type=str,
192
+ default="correct_answer",
193
+ help="Column name for correct answers in CSV file (default: correct_answer)",
194
+ )
195
+ parser.add_argument(
196
+ "--incorrect-col",
197
+ type=str,
198
+ default="incorrect_answer",
199
+ help="Column name for incorrect answers in CSV file (default: incorrect_answer)",
200
+ )
201
+
202
+ # Optimization arguments
203
+ parser.add_argument(
204
+ "--optimize",
205
+ action="store_true",
206
+ help="Enable hyperparameter optimization. When enabled, will find optimal layer, threshold, and aggregation method",
207
+ )
208
+ parser.add_argument(
209
+ "--optimize-layers",
210
+ type=str,
211
+ default="all",
212
+ help="Layer range for optimization (e.g., '8-24' or '10,15,20' or 'all'). Default: all (uses all model layers)",
213
+ )
214
+ parser.add_argument(
215
+ "--optimize-metric",
216
+ type=str,
217
+ choices=["accuracy", "f1", "precision", "recall", "auc"],
218
+ default="f1",
219
+ help="Metric to optimize for. Default: f1",
220
+ )
221
+ parser.add_argument(
222
+ "--optimize-max-combinations",
223
+ type=int,
224
+ default=100,
225
+ help="Maximum number of hyperparameter combinations to test. Default: 100",
226
+ )
227
+ parser.add_argument(
228
+ "--auto-optimize",
229
+ action="store_true",
230
+ help="Automatically enable optimization when layer is not specified or is -1",
231
+ )
232
+
233
+ # Dataset validation arguments
234
+ parser.add_argument(
235
+ "--allow-small-dataset",
236
+ action="store_true",
237
+ help="Allow training with datasets smaller than 4 samples (may cause training issues)",
238
+ )
239
+
240
+ # Detection handling arguments
241
+ parser.add_argument(
242
+ "--detection-action",
243
+ type=str,
244
+ choices=["pass_through", "replace_with_placeholder", "regenerate_until_safe"],
245
+ default="pass_through",
246
+ help="Action to take when problematic content is detected (default: pass_through)",
247
+ )
248
+ parser.add_argument(
249
+ "--placeholder-message",
250
+ type=str,
251
+ default=None,
252
+ help="Custom placeholder message for detected content (if not specified, uses default)",
253
+ )
254
+ parser.add_argument(
255
+ "--max-regeneration-attempts",
256
+ type=int,
257
+ default=3,
258
+ help="Maximum attempts to regenerate safe content (default: 3)",
259
+ )
260
+ parser.add_argument(
261
+ "--detection-threshold",
262
+ type=float,
263
+ default=0.6,
264
+ help="Threshold for classification (higher = more strict detection) (default: 0.6)",
265
+ )
266
+ parser.add_argument("--log-detections", action="store_true", help="Enable logging of detection events")
267
+
268
+ # Code execution security arguments
269
+ parser.add_argument(
270
+ "--trust-code-execution",
271
+ action="store_true",
272
+ help="⚠️ UNSAFE: Allow code execution without Docker in trusted sandbox environments (e.g., RunPod containers). Use only in secure, isolated environments!",
273
+ )
274
+
275
+ # Steering mode arguments
276
+ parser.add_argument(
277
+ "--steering-mode", action="store_true", help="Enable steering mode (uses CAA vectors instead of classification)"
278
+ )
279
+ parser.add_argument(
280
+ "--steering-strength", type=float, default=1.0, help="Strength of steering vector application (default: 1.0)"
281
+ )
282
+
283
+ # Steering method selection
284
+ parser.add_argument(
285
+ "--steering-method",
286
+ type=str,
287
+ default="CAA",
288
+ choices=["CAA", "HPR", "DAC", "BiPO", "KSteering"],
289
+ help="Steering method to use",
290
+ )
291
+
292
+ # Steering output mode selection
293
+ parser.add_argument(
294
+ "--output-mode",
295
+ type=str,
296
+ default="both",
297
+ choices=["likelihoods", "responses", "both"],
298
+ help="Type of comparison to show: 'likelihoods' for log-likelihood comparison only, 'responses' for response generation only, 'both' for both (default: both)",
299
+ )
300
+
301
+ # HPR-specific parameters
302
+ parser.add_argument("--hpr-beta", type=float, default=1.0, help="Beta parameter for HPR method")
303
+
304
+ # DAC-specific parameters
305
+ parser.add_argument("--dac-dynamic-control", action="store_true", help="Enable dynamic control for DAC method")
306
+ parser.add_argument(
307
+ "--dac-entropy-threshold", type=float, default=1.0, help="Entropy threshold for DAC dynamic control"
308
+ )
309
+
310
+ # BiPO-specific parameters
311
+ parser.add_argument("--bipo-beta", type=float, default=0.1, help="Beta parameter for BiPO method")
312
+ parser.add_argument("--bipo-learning-rate", type=float, default=5e-4, help="Learning rate for BiPO method")
313
+ parser.add_argument("--bipo-epochs", type=int, default=100, help="Number of epochs for BiPO training")
314
+
315
+ # K-Steering-specific parameters
316
+ parser.add_argument(
317
+ "--ksteering-num-labels", type=int, default=6, help="Number of labels for K-steering classifier"
318
+ )
319
+ parser.add_argument(
320
+ "--ksteering-hidden-dim", type=int, default=512, help="Hidden dimension for K-steering classifier"
321
+ )
322
+ parser.add_argument(
323
+ "--ksteering-learning-rate", type=float, default=1e-3, help="Learning rate for K-steering classifier training"
324
+ )
325
+ parser.add_argument(
326
+ "--ksteering-classifier-epochs",
327
+ type=int,
328
+ default=100,
329
+ help="Number of epochs for K-steering classifier training",
330
+ )
331
+ parser.add_argument(
332
+ "--ksteering-target-labels",
333
+ type=str,
334
+ default="0",
335
+ help="Comma-separated target label indices for K-steering (e.g., '0,1,2')",
336
+ )
337
+ parser.add_argument(
338
+ "--ksteering-avoid-labels",
339
+ type=str,
340
+ default="",
341
+ help="Comma-separated avoid label indices for K-steering (e.g., '3,4,5')",
342
+ )
343
+ parser.add_argument(
344
+ "--ksteering-alpha", type=float, default=50.0, help="Alpha parameter (step size) for K-steering"
345
+ )
346
+
347
+ # Token steering arguments
348
+ parser.add_argument("--enable-token-steering", action="store_true", help="Enable token-level steering control")
349
+ parser.add_argument(
350
+ "--token-steering-strategy",
351
+ type=str,
352
+ default="last_only",
353
+ choices=[
354
+ "last_only",
355
+ "first_only",
356
+ "all_equal",
357
+ "exponential_decay",
358
+ "exponential_growth",
359
+ "linear_decay",
360
+ "linear_growth",
361
+ "custom",
362
+ ],
363
+ help="Token steering strategy (default: last_only)",
364
+ )
365
+ parser.add_argument(
366
+ "--token-decay-rate",
367
+ type=float,
368
+ default=0.5,
369
+ help="Decay rate for exponential token steering strategies (0-1, default: 0.5)",
370
+ )
371
+ parser.add_argument(
372
+ "--token-min-strength",
373
+ type=float,
374
+ default=0.1,
375
+ help="Minimum steering strength for token strategies (default: 0.1)",
376
+ )
377
+ parser.add_argument(
378
+ "--token-max-strength",
379
+ type=float,
380
+ default=1.0,
381
+ help="Maximum steering strength for token strategies (default: 1.0)",
382
+ )
383
+ parser.add_argument(
384
+ "--token-apply-to-prompt",
385
+ action="store_true",
386
+ help="Apply steering to prompt tokens as well as generated tokens",
387
+ )
388
+ parser.add_argument(
389
+ "--token-prompt-strength-multiplier",
390
+ type=float,
391
+ default=0.1,
392
+ help="Strength multiplier for prompt tokens (default: 0.1)",
393
+ )
394
+
395
+ # Training/Inference mode arguments
396
+ parser.add_argument(
397
+ "--train-only",
398
+ action="store_true",
399
+ help="Training-only mode: train classifiers/vectors and save them, skip inference",
400
+ )
401
+ parser.add_argument(
402
+ "--inference-only",
403
+ action="store_true",
404
+ help="Inference-only mode: load pre-trained classifiers/vectors and use for monitoring/steering",
405
+ )
406
+ parser.add_argument(
407
+ "--save-classifier",
408
+ type=str,
409
+ default=None,
410
+ help="Path to save trained classifier(s). In multi-layer mode, saves one file per layer with layer suffix",
411
+ )
412
+ parser.add_argument(
413
+ "--load-classifier",
414
+ type=str,
415
+ default=None,
416
+ help="Path to load pre-trained classifier(s). In multi-layer mode, expects files with layer suffix",
417
+ )
418
+ parser.add_argument(
419
+ "--classifier-dir",
420
+ type=str,
421
+ default="./models",
422
+ help="Directory for saving/loading classifiers and vectors (default: ./models)",
423
+ )
424
+
425
+ # Prompt construction and token targeting strategy arguments
426
+ parser.add_argument(
427
+ "--prompt-construction-strategy",
428
+ type=str,
429
+ choices=["multiple_choice", "role_playing", "direct_completion", "instruction_following"],
430
+ default="multiple_choice",
431
+ help="Strategy for constructing prompts from question-answer pairs (default: multiple_choice)",
432
+ )
433
+ parser.add_argument(
434
+ "--token-targeting-strategy",
435
+ type=str,
436
+ choices=["choice_token", "continuation_token", "last_token", "first_token", "mean_pooling", "max_pooling"],
437
+ default="choice_token",
438
+ help="Strategy for targeting tokens during activation extraction (default: choice_token)",
439
+ )
440
+
441
+ # Normalization options
442
+ parser.add_argument("--normalize-mode", action="store_true", help="Enable normalization mode (legacy flag)")
443
+ parser.add_argument(
444
+ "--normalization-method",
445
+ type=str,
446
+ default="none",
447
+ choices=["none", "l2_unit", "cross_behavior", "layer_wise_mean"],
448
+ help="Vector normalization method to apply",
449
+ )
450
+ parser.add_argument("--target-norm", type=float, default=None, help="Target norm for certain normalization methods")
451
+
452
+ # Nonsense detection options
453
+ parser.add_argument(
454
+ "--enable-nonsense-detection",
455
+ action="store_true",
456
+ help="Enable nonsense detection to stop lobotomized responses",
457
+ )
458
+ parser.add_argument(
459
+ "--max-word-length",
460
+ type=int,
461
+ default=20,
462
+ help="Maximum reasonable word length for nonsense detection (default: 20)",
463
+ )
464
+ parser.add_argument(
465
+ "--repetition-threshold",
466
+ type=float,
467
+ default=0.7,
468
+ help="Threshold for repetitive content detection (0-1, default: 0.7)",
469
+ )
470
+ parser.add_argument(
471
+ "--gibberish-threshold",
472
+ type=float,
473
+ default=0.3,
474
+ help="Threshold for gibberish word detection (0-1, default: 0.3)",
475
+ )
476
+ parser.add_argument(
477
+ "--disable-dictionary-check",
478
+ action="store_true",
479
+ help="Disable dictionary-based word validation (faster but less accurate)",
480
+ )
481
+ parser.add_argument(
482
+ "--nonsense-action",
483
+ type=str,
484
+ default="regenerate",
485
+ choices=["regenerate", "stop", "flag"],
486
+ help="Action when nonsense is detected: regenerate, stop generation, or flag for review",
487
+ )
488
+
489
+ # Performance monitoring options
490
+ parser.add_argument(
491
+ "--enable-memory-tracking", action="store_true", help="Enable memory usage tracking and reporting"
492
+ )
493
+ parser.add_argument(
494
+ "--enable-latency-tracking", action="store_true", help="Enable latency/timing tracking and reporting"
495
+ )
496
+ parser.add_argument(
497
+ "--memory-sampling-interval", type=float, default=0.1, help="Memory sampling interval in seconds (default: 0.1)"
498
+ )
499
+ parser.add_argument("--track-gpu-memory", action="store_true", help="Track GPU memory usage (requires CUDA)")
500
+ parser.add_argument(
501
+ "--detailed-performance-report",
502
+ action="store_true",
503
+ help="Generate detailed performance report with all metrics",
504
+ )
505
+ parser.add_argument("--export-performance-csv", type=str, default=None, help="Export performance data to CSV file")
506
+ parser.add_argument(
507
+ "--show-memory-usage", action="store_true", help="Show current memory usage without full tracking"
508
+ )
509
+ parser.add_argument("--show-timing-summary", action="store_true", help="Show timing summary after evaluation")
510
+
511
+ # Test-time activation saving/loading options
512
+ parser.add_argument(
513
+ "--save-test-activations", type=str, default=None, help="Save test activations to file for future use"
514
+ )
515
+ parser.add_argument(
516
+ "--load-test-activations", type=str, default=None, help="Load test activations from file instead of computing"
517
+ )
518
+
519
+ # Priority-aware benchmark selection options
520
+ parser.add_argument(
521
+ "--priority",
522
+ type=str,
523
+ default="all",
524
+ choices=["all", "high", "medium", "low"],
525
+ help="Priority level for benchmark selection (default: all)",
526
+ )
527
+ parser.add_argument(
528
+ "--fast-only", action="store_true", help="Only use fast benchmarks (high priority, < 13.5s loading time)"
529
+ )
530
+ parser.add_argument(
531
+ "--time-budget",
532
+ type=float,
533
+ default=None,
534
+ help="Time budget in minutes for benchmark selection (auto-selects fast benchmarks)",
535
+ )
536
+ parser.add_argument(
537
+ "--max-benchmarks",
538
+ type=int,
539
+ default=None,
540
+ help="Maximum number of benchmarks to select (combines with priority filtering)",
541
+ )
542
+ parser.add_argument(
543
+ "--smart-selection", action="store_true", help="Use smart benchmark selection based on relevance and priority"
544
+ )
545
+ parser.add_argument(
546
+ "--prefer-fast",
547
+ action="store_true",
548
+ help="Prefer fast benchmarks in selection when multiple options are available",
549
+ )
550
+
551
+ parser.add_argument(
552
+ "--save-steering-vector", type=str, default=None, help="Path to save the computed steering vector"
553
+ )
554
+ parser.add_argument(
555
+ "--load-steering-vector", type=str, default=None, help="Path to load a pre-computed steering vector"
556
+ )
557
+
558
+ # Additional output options
559
+ parser.add_argument("--csv-output", type=str, default=None, help="Path to save results in CSV format")
560
+ parser.add_argument("--evaluation-report", type=str, default=None, help="Path to save evaluation report")
561
+ parser.add_argument("--continue-on-error", action="store_true", help="Continue processing other tasks if one fails")
562
+
563
+ # Benchmark caching arguments
564
+ parser.add_argument(
565
+ "--cache-benchmark",
566
+ action="store_true",
567
+ default=True,
568
+ help="Cache the benchmark data locally for faster future access (default: True)",
569
+ )
570
+ parser.add_argument("--no-cache", dest="cache_benchmark", action="store_false", help="Disable benchmark caching")
571
+ parser.add_argument(
572
+ "--use-cached", action="store_true", default=True, help="Use cached benchmark data if available (default: True)"
573
+ )
574
+ parser.add_argument(
575
+ "--force-download", action="store_true", help="Force fresh download even if cached version exists"
576
+ )
577
+ parser.add_argument(
578
+ "--cache-dir",
579
+ type=str,
580
+ default="./benchmark_cache",
581
+ help="Directory to store cached benchmark data (default: ./benchmark_cache)",
582
+ )
583
+ parser.add_argument("--cache-status", action="store_true", help="Show cache status and exit")
584
+ parser.add_argument("--cleanup-cache", type=int, metavar="DAYS", help="Clean up cache entries older than DAYS days")
@@ -0,0 +1,26 @@
1
+ """Parser setup for the 'test-nonsense' command."""
2
+
3
+
4
def setup_test_nonsense_parser(parser):
    """Register the command-line arguments of the 'test-nonsense' subcommand.

    Args:
        parser: Object exposing ``add_argument`` (e.g. an
            ``argparse.ArgumentParser`` or a subparser); it is populated
            in place.

    Returns:
        None.
    """
    # Each entry is (name_or_flags, keyword options). Entries are registered
    # in the order listed, which is also the order they were declared in the
    # original call sequence.
    argument_specs = (
        (
            ("text",),
            {
                "type": str,
                # Optional positional: absent on the command line -> None.
                "nargs": "?",
                "help": "Text to analyze (if not provided, will use interactive mode)",
            },
        ),
        (
            ("--max-word-length",),
            {
                "type": int,
                "default": 20,
                "help": "Maximum reasonable word length (default: 20)",
            },
        ),
        (
            ("--repetition-threshold",),
            {
                "type": float,
                "default": 0.7,
                "help": "Threshold for repetitive content detection (0-1, default: 0.7)",
            },
        ),
        (
            ("--gibberish-threshold",),
            {
                "type": float,
                "default": 0.3,
                "help": "Threshold for gibberish word detection (0-1, default: 0.3)",
            },
        ),
        (
            ("--disable-dictionary-check",),
            {
                "action": "store_true",
                "help": "Disable dictionary-based word validation",
            },
        ),
        (
            ("--verbose",),
            {
                "action": "store_true",
                "help": "Show detailed analysis",
            },
        ),
        (
            ("--examples",),
            {
                "action": "store_true",
                "help": "Test with built-in example texts",
            },
        ),
    )

    for name_or_flags, options in argument_specs:
        parser.add_argument(*name_or_flags, **options)