wisent 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wisent/__init__.py +1 -1
- wisent/core/activations/__init__.py +22 -6
- wisent/core/activations/activations.py +21 -39
- wisent/core/activations/activations_collector.py +141 -373
- wisent/core/activations/classifier_inference_strategy.py +194 -0
- wisent/core/activations/core/atoms.py +8 -92
- wisent/core/activations/extraction_strategy.py +308 -0
- wisent/core/agent/diagnose/response_diagnostics.py +3 -3
- wisent/core/agent/diagnose.py +3 -3
- wisent/core/autonomous_agent.py +2 -2
- wisent/core/cli/agent/apply_steering.py +23 -27
- wisent/core/cli/agent/evaluate_response.py +18 -20
- wisent/core/cli/agent/train_classifier.py +18 -20
- wisent/core/cli/cluster_benchmarks.py +472 -0
- wisent/core/cli/create_steering_vector.py +13 -5
- wisent/core/cli/generate_vector_from_task.py +4 -0
- wisent/core/cli/get_activations.py +12 -36
- wisent/core/cli/method_optimizer.py +859 -0
- wisent/core/cli/optimize.py +44 -5
- wisent/core/cli/optimize_classification.py +5 -6
- wisent/core/cli/optimize_sample_size.py +8 -22
- wisent/core/cli/optimize_steering.py +429 -153
- wisent/core/cli/optimize_weights.py +65 -6
- wisent/core/cli/steering_method_trainer.py +5 -4
- wisent/core/cli/steering_search_space.py +20 -15
- wisent/core/cli/tasks.py +14 -43
- wisent/core/cli/train_unified_goodness.py +17 -18
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1578 -173
- wisent/core/contrastive_pairs/diagnostics/linearity.py +63 -80
- wisent/core/contrastive_pairs/diagnostics/vector_quality.py +6 -5
- wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +5 -19
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +11 -5
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +146 -32
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +2 -2
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +98 -57
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +8 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval_aqua_rat.py +129 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +11 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +47 -6
- wisent/core/evaluators/benchmark_specific/apps_evaluator.py +133 -0
- wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +6 -1
- wisent/core/evaluators/benchmark_specific/conala_evaluator.py +31 -168
- wisent/core/evaluators/custom/examples/humanization_coherent.py +89 -35
- wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +2 -20
- wisent/core/evaluators/personalization/coherence.py +46 -0
- wisent/core/hyperparameter_optimizer.py +13 -13
- wisent/core/lm_eval_harness_ground_truth.py +7 -11
- wisent/core/main.py +3 -0
- wisent/core/models/wisent_model.py +8 -7
- wisent/core/opti/methods/opti_weights.py +29 -2
- wisent/core/optuna/classifier/activation_generator.py +14 -12
- wisent/core/optuna/steering/steering_optimization.py +14 -9
- wisent/core/parser_arguments/cluster_benchmarks_parser.py +31 -0
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +20 -0
- wisent/core/parser_arguments/main_parser.py +8 -0
- wisent/core/parser_arguments/optimize_steering_parser.py +117 -10
- wisent/core/parser_arguments/optimize_weights_parser.py +6 -0
- wisent/core/parser_arguments/tasks_parser.py +7 -19
- wisent/core/steering_methods/core/atoms.py +1 -2
- wisent/core/steering_methods/methods/caa.py +1 -1
- wisent/core/steering_methods/methods/hyperplane.py +74 -0
- wisent/core/steering_methods/methods/prism.py +1 -2
- wisent/core/steering_methods/methods/pulse.py +39 -8
- wisent/core/steering_methods/methods/titan.py +59 -14
- wisent/core/steering_methods/registry.py +52 -12
- wisent/core/steering_optimizer.py +15 -15
- wisent/core/trainers/steering_trainer.py +9 -18
- wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +19 -70
- wisent/scripts/run_quality_metrics_sweep.sh +22 -27
- wisent/tests/test_aggregation_geometry.py +236 -0
- wisent/tests/test_detector_accuracy.py +163 -0
- wisent/tests/test_geometry_exhaustive.py +1202 -0
- wisent/tests/visualize_geometry.py +255 -61
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/METADATA +1 -1
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/RECORD +82 -714
- wisent/core/activations/prompt_construction_strategy.py +0 -47
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +0 -15
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +0 -64
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +0 -99
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +0 -180
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +0 -129
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +0 -142
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +0 -155
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +0 -161
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +0 -107
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +0 -155
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +0 -155
- wisent/examples/scripts/results/benchmark_descriptions.json +0 -1244
- wisent/examples/scripts/results/benchmark_evaluation_methods.json +0 -66
- wisent/examples/scripts/results/benchmark_evaluator_mapping.json +0 -2781
- wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +0 -30536
- wisent/examples/scripts/results/benchmark_evaluators_clean.json +0 -469
- wisent/examples/scripts/results/benchmark_methods_summary.json +0 -260
- wisent/examples/scripts/results/benchmark_pair_creation_methods.json +0 -66
- wisent/examples/scripts/results/benchmark_pair_totals.json +0 -269
- wisent/examples/scripts/results/benchmark_tags.json +0 -917
- wisent/examples/scripts/results/benchmark_test_summary_nov4.json +0 -71
- wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +0 -150
- wisent/examples/scripts/results/failing_benchmarks.json +0 -946
- wisent/examples/scripts/results/failing_benchmarks_list.json +0 -41
- wisent/examples/scripts/results/failing_benchmarks_test_results.json +0 -945
- wisent/examples/scripts/results/missing_benchmark_tags.json +0 -341
- wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +0 -30
- wisent/examples/scripts/results/test_20_newsgroups_pairs.json +0 -8
- wisent/examples/scripts/results/test_AraDICE_evaluation.json +0 -51
- wisent/examples/scripts/results/test_AraDICE_pairs.json +0 -14
- wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +0 -8
- wisent/examples/scripts/results/test_ArabCulture_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ArabCulture_pairs.json +0 -14
- wisent/examples/scripts/results/test_Tag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_Tag_pairs.json +0 -8
- wisent/examples/scripts/results/test_aclue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aclue_pairs.json +0 -14
- wisent/examples/scripts/results/test_acp_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +0 -14
- wisent/examples/scripts/results/test_acp_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +0 -51
- wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +0 -14
- wisent/examples/scripts/results/test_aexams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aexams_pairs.json +0 -14
- wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_ag_news_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ag_news_pairs.json +0 -8
- wisent/examples/scripts/results/test_agieval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_agieval_pairs.json +0 -14
- wisent/examples/scripts/results/test_aime2024_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime2024_pairs.json +0 -8
- wisent/examples/scripts/results/test_aime2025_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime2025_pairs.json +0 -8
- wisent/examples/scripts/results/test_aime_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime_pairs.json +0 -8
- wisent/examples/scripts/results/test_anagrams1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anagrams1_pairs.json +0 -8
- wisent/examples/scripts/results/test_anagrams2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anagrams2_pairs.json +0 -8
- wisent/examples/scripts/results/test_anli_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anli_pairs.json +0 -8
- wisent/examples/scripts/results/test_apps_evaluation.json +0 -30
- wisent/examples/scripts/results/test_apps_pairs.json +0 -8
- wisent/examples/scripts/results/test_arabic_exams_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arabic_exams_pairs.json +0 -8
- wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +0 -14
- wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +0 -14
- wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabicmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +0 -14
- wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +0 -14
- wisent/examples/scripts/results/test_arc_ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_arc_challenge_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_challenge_pairs.json +0 -8
- wisent/examples/scripts/results/test_arc_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_argument_topic_evaluation.json +0 -30
- wisent/examples/scripts/results/test_argument_topic_pairs.json +0 -8
- wisent/examples/scripts/results/test_arithmetic_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arithmetic_pairs.json +0 -14
- wisent/examples/scripts/results/test_asdiv_evaluation.json +0 -30
- wisent/examples/scripts/results/test_asdiv_pairs.json +0 -8
- wisent/examples/scripts/results/test_assin_entailment_evaluation.json +0 -30
- wisent/examples/scripts/results/test_assin_entailment_pairs.json +0 -8
- wisent/examples/scripts/results/test_atis_evaluation.json +0 -30
- wisent/examples/scripts/results/test_atis_pairs.json +0 -8
- wisent/examples/scripts/results/test_babi_evaluation.json +0 -30
- wisent/examples/scripts/results/test_babi_pairs.json +0 -8
- wisent/examples/scripts/results/test_babilong_evaluation.json +0 -30
- wisent/examples/scripts/results/test_babilong_pairs.json +0 -8
- wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +0 -8
- wisent/examples/scripts/results/test_banking77_evaluation.json +0 -30
- wisent/examples/scripts/results/test_banking77_pairs.json +0 -8
- wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basqueglue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basqueglue_pairs.json +0 -14
- wisent/examples/scripts/results/test_bbh_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bbh_pairs.json +0 -14
- wisent/examples/scripts/results/test_bbq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bbq_pairs.json +0 -8
- wisent/examples/scripts/results/test_bec2016eu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bec2016eu_pairs.json +0 -14
- wisent/examples/scripts/results/test_belebele_evaluation.json +0 -51
- wisent/examples/scripts/results/test_belebele_pairs.json +0 -14
- wisent/examples/scripts/results/test_benchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_benchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_bertaqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bertaqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bhtc_v2_pairs.json +0 -8
- wisent/examples/scripts/results/test_bigbench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bigbench_pairs.json +0 -14
- wisent/examples/scripts/results/test_blimp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_blimp_pairs.json +0 -14
- wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +0 -8
- wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +0 -8
- wisent/examples/scripts/results/test_boolq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq_pairs.json +0 -8
- wisent/examples/scripts/results/test_c4_evaluation.json +0 -30
- wisent/examples/scripts/results/test_c4_pairs.json +0 -8
- wisent/examples/scripts/results/test_cabreu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cabreu_pairs.json +0 -8
- wisent/examples/scripts/results/test_careqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_careqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_catalan_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_catalan_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_catalanqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_catalanqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_catcola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_catcola_pairs.json +0 -8
- wisent/examples/scripts/results/test_cb_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cb_pairs.json +0 -8
- wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +0 -14
- wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +0 -8
- wisent/examples/scripts/results/test_ceval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval_pairs.json +0 -14
- wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +0 -14
- wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +0 -51
- wisent/examples/scripts/results/test_chain_of_thought_pairs.json +0 -14
- wisent/examples/scripts/results/test_chartqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_chartqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +0 -30
- wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +0 -8
- wisent/examples/scripts/results/test_cmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_cmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +0 -8
- wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cocoteros_es_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +0 -8
- wisent/examples/scripts/results/test_coedit_gec_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coedit_gec_pairs.json +0 -8
- wisent/examples/scripts/results/test_cola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cola_pairs.json +0 -8
- wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_commonsense_qa_pairs.json +0 -8
- wisent/examples/scripts/results/test_conala_evaluation.json +0 -30
- wisent/examples/scripts/results/test_conala_pairs.json +0 -8
- wisent/examples/scripts/results/test_concode_evaluation.json +0 -30
- wisent/examples/scripts/results/test_concode_pairs.json +0 -8
- wisent/examples/scripts/results/test_copa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_copa_pairs.json +0 -8
- wisent/examples/scripts/results/test_copal_id_evaluation.json +0 -30
- wisent/examples/scripts/results/test_copal_id_pairs.json +0 -8
- wisent/examples/scripts/results/test_coqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_coqcat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coqcat_pairs.json +0 -8
- wisent/examples/scripts/results/test_crows_pairs_evaluation.json +0 -51
- wisent/examples/scripts/results/test_crows_pairs_pairs.json +0 -14
- wisent/examples/scripts/results/test_csatqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_csatqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_cycle_letters_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cycle_letters_pairs.json +0 -8
- wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_darija_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darija_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_darijahellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_darijammlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darijammlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +0 -30
- wisent/examples/scripts/results/test_dbpedia_14_pairs.json +0 -8
- wisent/examples/scripts/results/test_drop_evaluation.json +0 -30
- wisent/examples/scripts/results/test_drop_pairs.json +0 -8
- wisent/examples/scripts/results/test_ds1000_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ds1000_pairs.json +0 -8
- wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_egyhellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_egymmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_egymmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +0 -30
- wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +0 -8
- wisent/examples/scripts/results/test_eq_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eq_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_escola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_escola_pairs.json +0 -8
- wisent/examples/scripts/results/test_ethics_cm_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ethics_cm_pairs.json +0 -8
- wisent/examples/scripts/results/test_ethos_binary_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ethos_binary_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams_es_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_exams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_proficiency_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_reading_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_reading_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_trivia_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_trivia_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita-mp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita-mp_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_LLM_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
- wisent/examples/scripts/results/test_fda_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fda_pairs.json +0 -8
- wisent/examples/scripts/results/test_financial_tweets_evaluation.json +0 -30
- wisent/examples/scripts/results/test_financial_tweets_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_flores_evaluation.json +0 -51
- wisent/examples/scripts/results/test_flores_pairs.json +0 -14
- wisent/examples/scripts/results/test_freebase_evaluation.json +0 -30
- wisent/examples/scripts/results/test_freebase_pairs.json +0 -8
- wisent/examples/scripts/results/test_french_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_french_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_galcola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_galcola_pairs.json +0 -8
- wisent/examples/scripts/results/test_galician_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_galician_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_glianorex_evaluation.json +0 -30
- wisent/examples/scripts/results/test_glianorex_pairs.json +0 -8
- wisent/examples/scripts/results/test_global_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_global_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_gpqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_gpqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_groundcocoa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_groundcocoa_pairs.json +0 -8
- wisent/examples/scripts/results/test_gsm8k_evaluation.json +0 -30
- wisent/examples/scripts/results/test_gsm8k_pairs.json +0 -8
- wisent/examples/scripts/results/test_haerae_evaluation.json +0 -51
- wisent/examples/scripts/results/test_haerae_pairs.json +0 -14
- wisent/examples/scripts/results/test_headqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_headqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_hellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +0 -14
- wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hendrycks_math_pairs.json +0 -14
- wisent/examples/scripts/results/test_histoires_morales_evaluation.json +0 -30
- wisent/examples/scripts/results/test_histoires_morales_pairs.json +0 -8
- wisent/examples/scripts/results/test_hmmt_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +0 -8
- wisent/examples/scripts/results/test_hmmt_pairs.json +0 -8
- wisent/examples/scripts/results/test_hrm8k_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hrm8k_pairs.json +0 -14
- wisent/examples/scripts/results/test_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +0 -30
- wisent/examples/scripts/results/test_humaneval_plus_pairs.json +0 -8
- wisent/examples/scripts/results/test_ifeval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ifeval_pairs.json +0 -8
- wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +0 -51
- wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +0 -30
- wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +0 -8
- wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +0 -51
- wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +0 -14
- wisent/examples/scripts/results/test_inverse_scaling_pairs.json +0 -14
- wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +0 -8
- wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +0 -14
- wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_kbl_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +0 -14
- wisent/examples/scripts/results/test_kbl_pairs.json +0 -14
- wisent/examples/scripts/results/test_kmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_kobest_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kobest_pairs.json +0 -14
- wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +0 -8
- wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_cloze_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_openai_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_openai_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_standard_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_standard_pairs.json +0 -8
- wisent/examples/scripts/results/test_leaderboard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_leaderboard_pairs.json +0 -14
- wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +0 -51
- wisent/examples/scripts/results/test_libra/test_libra_pairs.json +0 -14
- wisent/examples/scripts/results/test_libra_evaluation.json +0 -51
- wisent/examples/scripts/results/test_libra_pairs.json +0 -14
- wisent/examples/scripts/results/test_lingoly_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lingoly_pairs.json +0 -8
- wisent/examples/scripts/results/test_livecodebench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livecodebench_pairs.json +0 -8
- wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +0 -8
- wisent/examples/scripts/results/test_llama_evaluation.json +0 -30
- wisent/examples/scripts/results/test_llama_pairs.json +0 -8
- wisent/examples/scripts/results/test_logiqa2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_logiqa2_pairs.json +0 -8
- wisent/examples/scripts/results/test_logiqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_logiqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_m_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_m_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +0 -14
- wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_mastermind_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mastermind_pairs.json +0 -14
- wisent/examples/scripts/results/test_math500_evaluation.json +0 -30
- wisent/examples/scripts/results/test_math500_pairs.json +0 -8
- wisent/examples/scripts/results/test_math_evaluation.json +0 -30
- wisent/examples/scripts/results/test_math_pairs.json +0 -8
- wisent/examples/scripts/results/test_mathqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mathqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_mbpp_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mbpp_pairs.json +0 -8
- wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mbpp_plus_pairs.json +0 -8
- wisent/examples/scripts/results/test_mc_taco_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mc_taco_pairs.json +0 -8
- wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +0 -14
- wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +0 -14
- wisent/examples/scripts/results/test_meddialog_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meddialog_pairs.json +0 -8
- wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +0 -8
- wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +0 -8
- wisent/examples/scripts/results/test_medmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_medqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_medtext_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medtext_pairs.json +0 -8
- wisent/examples/scripts/results/test_mela_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mela_pairs.json +0 -14
- wisent/examples/scripts/results/test_meqsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meqsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_mercury_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mercury_pairs.json +0 -8
- wisent/examples/scripts/results/test_metabench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_metabench_pairs.json +0 -14
- wisent/examples/scripts/results/test_mgsm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mgsm_pairs.json +0 -14
- wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mimic_repsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_minerva_math_evaluation.json +0 -51
- wisent/examples/scripts/results/test_minerva_math_pairs.json +0 -14
- wisent/examples/scripts/results/test_mlqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mlqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_pro_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_prox_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlusr_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mmlusr_pairs.json +0 -8
- wisent/examples/scripts/results/test_mmmu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmmu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mnli_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mnli_pairs.json +0 -8
- wisent/examples/scripts/results/test_model_written_evals_evaluation.json +0 -51
- wisent/examples/scripts/results/test_model_written_evals_pairs.json +0 -14
- wisent/examples/scripts/results/test_moral_stories_evaluation.json +0 -30
- wisent/examples/scripts/results/test_moral_stories_pairs.json +0 -8
- wisent/examples/scripts/results/test_mts_dialog_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mts_dialog_pairs.json +0 -8
- wisent/examples/scripts/results/test_multiblimp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_multiblimp_pairs.json +0 -14
- wisent/examples/scripts/results/test_multimedqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_multimedqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_multipl_e_evaluation.json +0 -30
- wisent/examples/scripts/results/test_multipl_e_pairs.json +0 -8
- wisent/examples/scripts/results/test_mutual_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mutual_pairs.json +0 -8
- wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_noreval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_noreval_pairs.json +0 -14
- wisent/examples/scripts/results/test_noticia_evaluation.json +0 -30
- wisent/examples/scripts/results/test_noticia_pairs.json +0 -8
- wisent/examples/scripts/results/test_nq_open_evaluation.json +0 -30
- wisent/examples/scripts/results/test_nq_open_pairs.json +0 -8
- wisent/examples/scripts/results/test_olaph_evaluation.json +0 -30
- wisent/examples/scripts/results/test_olaph_pairs.json +0 -8
- wisent/examples/scripts/results/test_openbookqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_openbookqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_openllm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_openllm_pairs.json +0 -14
- wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_paloma_evaluation.json +0 -51
- wisent/examples/scripts/results/test_paloma_pairs.json +0 -14
- wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +0 -30
- wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +0 -8
- wisent/examples/scripts/results/test_paws-x_evaluation.json +0 -51
- wisent/examples/scripts/results/test_paws-x_pairs.json +0 -14
- wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_penn_treebank_evaluation.json +0 -30
- wisent/examples/scripts/results/test_penn_treebank_pairs.json +0 -8
- wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +0 -30
- wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +0 -8
- wisent/examples/scripts/results/test_piqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_piqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_polemo2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polemo2_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_en_high_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +0 -8
- wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_portuguese_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_prost_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prost_pairs.json +0 -8
- wisent/examples/scripts/results/test_ptb_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ptb_pairs.json +0 -8
- wisent/examples/scripts/results/test_pubmedqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_pubmedqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_pythia_evaluation.json +0 -51
- wisent/examples/scripts/results/test_pythia_pairs.json +0 -14
- wisent/examples/scripts/results/test_qa4mre_evaluation.json +0 -30
- wisent/examples/scripts/results/test_qa4mre_pairs.json +0 -8
- wisent/examples/scripts/results/test_qasper_evaluation.json +0 -30
- wisent/examples/scripts/results/test_qasper_pairs.json +0 -8
- wisent/examples/scripts/results/test_race_evaluation.json +0 -30
- wisent/examples/scripts/results/test_race_pairs.json +0 -8
- wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +0 -30
- wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +0 -8
- wisent/examples/scripts/results/test_recode_evaluation.json +0 -30
- wisent/examples/scripts/results/test_recode_pairs.json +0 -8
- wisent/examples/scripts/results/test_record_evaluation.json +0 -30
- wisent/examples/scripts/results/test_record_pairs.json +0 -8
- wisent/examples/scripts/results/test_ruler_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ruler_pairs.json +0 -14
- wisent/examples/scripts/results/test_sciq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_sciq_pairs.json +0 -8
- wisent/examples/scripts/results/test_score_evaluation.json +0 -51
- wisent/examples/scripts/results/test_score_pairs.json +0 -14
- wisent/examples/scripts/results/test_self_consistency_evaluation.json +0 -30
- wisent/examples/scripts/results/test_self_consistency_pairs.json +0 -8
- wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_siqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_siqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_spanish_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_spanish_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_squad2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_squad2_pairs.json +0 -8
- wisent/examples/scripts/results/test_squadv2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_squadv2_pairs.json +0 -8
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +0 -8
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +0 -51
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +0 -14
- wisent/examples/scripts/results/test_swag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_swag_pairs.json +0 -8
- wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_tmmluplus_evaluation.json +0 -51
- wisent/examples/scripts/results/test_tmmluplus_pairs.json +0 -14
- wisent/examples/scripts/results/test_translation_evaluation.json +0 -51
- wisent/examples/scripts/results/test_translation_pairs.json +0 -14
- wisent/examples/scripts/results/test_triviaqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_triviaqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +0 -51
- wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +0 -14
- wisent/examples/scripts/results/test_truthfulqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_turkishmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_unfair_tos_evaluation.json +0 -30
- wisent/examples/scripts/results/test_unfair_tos_pairs.json +0 -8
- wisent/examples/scripts/results/test_unscramble_evaluation.json +0 -51
- wisent/examples/scripts/results/test_unscramble_pairs.json +0 -14
- wisent/examples/scripts/results/test_webqs_evaluation.json +0 -30
- wisent/examples/scripts/results/test_webqs_pairs.json +0 -8
- wisent/examples/scripts/results/test_wikitext103_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wikitext103_pairs.json +0 -8
- wisent/examples/scripts/results/test_wikitext_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wikitext_pairs.json +0 -8
- wisent/examples/scripts/results/test_winogender_evaluation.json +0 -51
- wisent/examples/scripts/results/test_winogender_pairs.json +0 -14
- wisent/examples/scripts/results/test_winogrande_evaluation.json +0 -30
- wisent/examples/scripts/results/test_winogrande_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmdp_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmdp_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_wsc273_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wsc273_pairs.json +0 -8
- wisent/examples/scripts/results/test_xcopa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xcopa_pairs.json +0 -14
- wisent/examples/scripts/results/test_xnli_eu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_xnli_eu_pairs.json +0 -8
- wisent/examples/scripts/results/test_xnli_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xnli_pairs.json +0 -14
- wisent/examples/scripts/results/test_xquad_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xquad_pairs.json +0 -14
- wisent/examples/scripts/results/test_xstorycloze_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xstorycloze_pairs.json +0 -14
- wisent/examples/scripts/results/test_xsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_xsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_xwinograd_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xwinograd_pairs.json +0 -14
- wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +0 -30
- wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +0 -8
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/WHEEL +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/entry_points.txt +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/top_level.txt +0 -0
|
@@ -19,6 +19,47 @@ __all__ = ["evaluate_quality"]
|
|
|
19
19
|
# Global tokenizer cache
|
|
20
20
|
_tokenizer_cache = {}
|
|
21
21
|
|
|
22
|
+
# Function words - the glue words of English that appear in natural text
|
|
23
|
+
# Real sentences need these; gibberish often lacks them
|
|
24
|
+
FUNCTION_WORDS = {
|
|
25
|
+
"the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
|
|
26
|
+
"have", "has", "had", "do", "does", "did", "will", "would", "could",
|
|
27
|
+
"should", "may", "might", "must", "shall", "can", "need", "dare",
|
|
28
|
+
"ought", "used", "to", "of", "in", "for", "on", "with", "at", "by",
|
|
29
|
+
"from", "as", "into", "through", "during", "before", "after", "above",
|
|
30
|
+
"below", "between", "under", "again", "further", "then", "once",
|
|
31
|
+
"and", "but", "or", "nor", "so", "yet", "both", "either", "neither",
|
|
32
|
+
"not", "only", "own", "same", "than", "too", "very", "just", "also",
|
|
33
|
+
"now", "here", "there", "when", "where", "why", "how", "all", "each",
|
|
34
|
+
"every", "few", "more", "most", "other", "some", "such", "no",
|
|
35
|
+
"any", "i", "you", "he", "she", "it", "we", "they", "me", "him", "her",
|
|
36
|
+
"us", "them", "my", "your", "his", "its", "our", "their", "this", "that",
|
|
37
|
+
"these", "those", "what", "which", "who", "whom", "whose",
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _has_low_function_word_ratio(text: str, threshold: float = 0.15) -> bool:
|
|
42
|
+
"""Check if text has suspiciously low ratio of function words.
|
|
43
|
+
|
|
44
|
+
Natural English text typically has 30-50% function words.
|
|
45
|
+
Gibberish made of strung-together nouns/jargon has very few.
|
|
46
|
+
|
|
47
|
+
Args:
|
|
48
|
+
text: Text to check
|
|
49
|
+
threshold: Minimum ratio of function words (default 0.15)
|
|
50
|
+
|
|
51
|
+
Returns:
|
|
52
|
+
True if text has too few function words (likely gibberish)
|
|
53
|
+
"""
|
|
54
|
+
tokens = re.findall(r'\b\w+\b', text.lower())
|
|
55
|
+
if len(tokens) < 6:
|
|
56
|
+
return False # Too short to judge
|
|
57
|
+
|
|
58
|
+
function_count = sum(1 for t in tokens if t in FUNCTION_WORDS)
|
|
59
|
+
ratio = function_count / len(tokens)
|
|
60
|
+
|
|
61
|
+
return ratio < threshold
|
|
62
|
+
|
|
22
63
|
|
|
23
64
|
def _get_tokenizer():
|
|
24
65
|
"""Get a cached tokenizer for nonsense word detection."""
|
|
@@ -137,6 +178,11 @@ def _is_gibberish(text: str) -> bool:
|
|
|
137
178
|
if validity_ratio < 0.3:
|
|
138
179
|
return True
|
|
139
180
|
|
|
181
|
+
# Check 6: Function word ratio - real English has ~30-50% function words
|
|
182
|
+
# Gibberish made of strung-together nouns/jargon has very few
|
|
183
|
+
if _has_low_function_word_ratio(text, threshold=0.15):
|
|
184
|
+
return True
|
|
185
|
+
|
|
140
186
|
return False
|
|
141
187
|
|
|
142
188
|
|
|
@@ -9,8 +9,8 @@ from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_sc
|
|
|
9
9
|
from .contrastive_pairs import ContrastivePairSet
|
|
10
10
|
from .steering import SteeringMethod, SteeringType
|
|
11
11
|
from .activations.activations_collector import ActivationCollector
|
|
12
|
-
from .activations.
|
|
13
|
-
|
|
12
|
+
from .activations.extraction_strategy import ExtractionStrategy
|
|
13
|
+
|
|
14
14
|
from wisent.core.errors import OptimizationError, NoActivationDataError, InsufficientDataError
|
|
15
15
|
|
|
16
16
|
logger = logging.getLogger(__name__)
|
|
@@ -352,22 +352,22 @@ class HyperparameterOptimizer:
|
|
|
352
352
|
|
|
353
353
|
# Map aggregation string to enum
|
|
354
354
|
aggregation_map = {
|
|
355
|
-
'average':
|
|
356
|
-
'first':
|
|
357
|
-
'last':
|
|
358
|
-
'max':
|
|
355
|
+
'average': ExtractionStrategy.CHAT_MEAN,
|
|
356
|
+
'first': ExtractionStrategy.CHAT_FIRST,
|
|
357
|
+
'last': ExtractionStrategy.CHAT_LAST,
|
|
358
|
+
'max': ExtractionStrategy.CHAT_MAX_NORM,
|
|
359
359
|
}
|
|
360
|
-
agg_strategy = aggregation_map.get(aggregation,
|
|
360
|
+
agg_strategy = aggregation_map.get(aggregation, ExtractionStrategy.CHAT_MEAN)
|
|
361
361
|
|
|
362
362
|
# Map prompt strategy string to enum
|
|
363
363
|
prompt_strategy_map = {
|
|
364
|
-
'multiple_choice':
|
|
365
|
-
'role_playing':
|
|
366
|
-
'direct_completion':
|
|
367
|
-
'instruction_following':
|
|
368
|
-
'chat_template':
|
|
364
|
+
'multiple_choice': ExtractionStrategy.MC_BALANCED,
|
|
365
|
+
'role_playing': ExtractionStrategy.ROLE_PLAY,
|
|
366
|
+
'direct_completion': ExtractionStrategy.CHAT_LAST,
|
|
367
|
+
'instruction_following': ExtractionStrategy.CHAT_LAST,
|
|
368
|
+
'chat_template': ExtractionStrategy.CHAT_LAST,
|
|
369
369
|
}
|
|
370
|
-
prompt_strategy = prompt_strategy_map.get(prompt_construction_strategy,
|
|
370
|
+
prompt_strategy = prompt_strategy_map.get(prompt_construction_strategy, ExtractionStrategy.CHAT_LAST)
|
|
371
371
|
|
|
372
372
|
# Create activation collector
|
|
373
373
|
collector = ActivationCollector(model=model, store_device="cpu")
|
|
@@ -10,7 +10,7 @@ Evaluation uses the TEST portion (20%) to ensure no data leakage with training.
|
|
|
10
10
|
import logging
|
|
11
11
|
from typing import Any, Dict
|
|
12
12
|
|
|
13
|
-
from wisent.core.activations.
|
|
13
|
+
from wisent.core.activations.extraction_strategy import ExtractionStrategy
|
|
14
14
|
from wisent.core.activations.activations import Activations
|
|
15
15
|
from wisent.core.layer import Layer
|
|
16
16
|
from wisent.core.utils.dataset_splits import get_all_docs_from_task, create_deterministic_split
|
|
@@ -674,16 +674,12 @@ class LMEvalHarnessGroundTruth:
|
|
|
674
674
|
}
|
|
675
675
|
|
|
676
676
|
def _map_token_aggregation_to_activation_method(self, token_aggregation: str):
|
|
677
|
-
"""Map token aggregation string to
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
"max": ActivationAggregationStrategy.MAX_POOLING,
|
|
684
|
-
}
|
|
685
|
-
|
|
686
|
-
return mapping.get(token_aggregation.lower(), ActivationAggregationStrategy.MEAN_POOLING)
|
|
677
|
+
"""Map token aggregation string to ExtractionStrategy."""
|
|
678
|
+
# Use ExtractionStrategy directly - token_aggregation should already be a valid enum value
|
|
679
|
+
try:
|
|
680
|
+
return ExtractionStrategy(token_aggregation)
|
|
681
|
+
except ValueError:
|
|
682
|
+
return ExtractionStrategy.CHAT_LAST
|
|
687
683
|
|
|
688
684
|
def _is_task_interface_task(self, task_name: str) -> bool:
|
|
689
685
|
"""Check if this is a TaskInterface task (not an lm-eval task)."""
|
wisent/core/main.py
CHANGED
|
@@ -12,6 +12,7 @@ from wisent.core.branding import print_banner
|
|
|
12
12
|
from wisent.core.cli import execute_tasks, execute_generate_pairs_from_task, execute_generate_pairs, execute_diagnose_pairs, execute_get_activations, execute_diagnose_vectors, execute_create_steering_vector, execute_generate_vector_from_task, execute_generate_vector_from_synthetic, execute_optimize_classification, execute_optimize_steering, execute_optimize_sample_size, execute_generate_responses, execute_evaluate_responses, execute_multi_steer, execute_agent, execute_modify_weights, execute_evaluate_refusal, execute_inference_config, execute_optimization_cache, execute_optimize_weights, execute_optimize
|
|
13
13
|
from wisent.core.cli.train_unified_goodness import execute_train_unified_goodness
|
|
14
14
|
from wisent.core.cli.check_linearity import execute_check_linearity
|
|
15
|
+
from wisent.core.cli.cluster_benchmarks import execute_cluster_benchmarks
|
|
15
16
|
|
|
16
17
|
|
|
17
18
|
def _should_show_banner() -> bool:
|
|
@@ -92,6 +93,8 @@ def main():
|
|
|
92
93
|
execute_train_unified_goodness(args)
|
|
93
94
|
elif args.command == 'check-linearity':
|
|
94
95
|
execute_check_linearity(args)
|
|
96
|
+
elif args.command == 'cluster-benchmarks':
|
|
97
|
+
execute_cluster_benchmarks(args)
|
|
95
98
|
else:
|
|
96
99
|
print(f"\n✗ Command '{args.command}' is not yet implemented")
|
|
97
100
|
sys.exit(1)
|
|
@@ -154,12 +154,13 @@ class WisentModel:
|
|
|
154
154
|
layers: list[nn.Module] = []
|
|
155
155
|
|
|
156
156
|
candidates = [
|
|
157
|
-
"layers",
|
|
158
|
-
"model.layers",
|
|
159
|
-
"model.decoder.layers",
|
|
160
|
-
"transformer.h",
|
|
157
|
+
"layers",
|
|
158
|
+
"model.layers",
|
|
159
|
+
"model.decoder.layers",
|
|
160
|
+
"transformer.h",
|
|
161
161
|
"base_model.model.layers",
|
|
162
|
-
"blocks", "model.blocks",
|
|
162
|
+
"blocks", "model.blocks",
|
|
163
|
+
"gpt_neox.layers", # Pythia models
|
|
163
164
|
]
|
|
164
165
|
for path in candidates:
|
|
165
166
|
obj = m
|
|
@@ -516,7 +517,7 @@ class WisentModel:
|
|
|
516
517
|
return_tensors="pt",
|
|
517
518
|
padding=False, # Single prompt, no padding needed
|
|
518
519
|
truncation=True, # Avoid errors on long inputs
|
|
519
|
-
max_length=self.tokenizer.model_max_length # Use model's actual limit
|
|
520
|
+
max_length=self.tokenizer.model_max_length, # Use model's actual limit
|
|
520
521
|
)
|
|
521
522
|
# Move tensors to the correct device (same as _batch_encode does)
|
|
522
523
|
batch = {
|
|
@@ -792,7 +793,7 @@ class WisentModel:
|
|
|
792
793
|
return_tensors="pt",
|
|
793
794
|
padding=False, # Single prompt, no padding needed
|
|
794
795
|
truncation=True, # Avoid errors on long inputs
|
|
795
|
-
max_length=self.tokenizer.model_max_length # Use model's actual limit
|
|
796
|
+
max_length=self.tokenizer.model_max_length, # Use model's actual limit
|
|
796
797
|
)
|
|
797
798
|
# Move tensors to the correct device (same as _batch_encode does)
|
|
798
799
|
batch = {
|
|
@@ -240,11 +240,14 @@ class WeightsOptimizer(BaseOptimizer):
|
|
|
240
240
|
norm_preserve=self.config.norm_preserve,
|
|
241
241
|
verbose=False,
|
|
242
242
|
)
|
|
243
|
-
elif self.config.method
|
|
243
|
+
elif self.config.method in ("additive", "titan", "prism", "pulse"):
|
|
244
244
|
# Direct additive: add steering vector directly to weight matrices
|
|
245
|
-
# This
|
|
245
|
+
# This modifies weights directly, not biases, so it persists when saved
|
|
246
|
+
# Used for additive and multi-direction methods (titan/prism/pulse)
|
|
246
247
|
self._apply_direct_additive(params)
|
|
247
248
|
else:
|
|
249
|
+
# Default fallback - use bake_steering_with_kernel
|
|
250
|
+
# Note: This adds biases which may not load correctly for some architectures
|
|
248
251
|
bake_steering_with_kernel(
|
|
249
252
|
self.model,
|
|
250
253
|
self.steering_vectors,
|
|
@@ -376,6 +379,8 @@ class WeightsOptimizer(BaseOptimizer):
|
|
|
376
379
|
checkpoint_interval: int = 5,
|
|
377
380
|
output_dir: str | None = None,
|
|
378
381
|
tokenizer: Any = None,
|
|
382
|
+
s3_bucket: str | None = None,
|
|
383
|
+
s3_key_prefix: str | None = None,
|
|
379
384
|
) -> HPORun:
|
|
380
385
|
"""
|
|
381
386
|
Run optimization with checkpointing support.
|
|
@@ -436,11 +441,20 @@ class WeightsOptimizer(BaseOptimizer):
|
|
|
436
441
|
if checkpoint_path and trial_num % checkpoint_interval == 0:
|
|
437
442
|
self._save_checkpoint(study, checkpoint_path)
|
|
438
443
|
print(f" [Checkpoint saved at trial {trial_num}]")
|
|
444
|
+
|
|
445
|
+
# Upload checkpoint to S3
|
|
446
|
+
if s3_bucket and s3_key_prefix:
|
|
447
|
+
self._upload_to_s3(checkpoint_path, s3_bucket, f"{s3_key_prefix}/checkpoint.json")
|
|
439
448
|
|
|
440
449
|
# Save best model at intervals
|
|
441
450
|
if output_dir and trial_num % checkpoint_interval == 0:
|
|
442
451
|
if study.best_trial is not None:
|
|
443
452
|
self._save_best_model_checkpoint(study, output_dir, tokenizer)
|
|
453
|
+
|
|
454
|
+
# Upload best model checkpoint to S3
|
|
455
|
+
if s3_bucket and s3_key_prefix:
|
|
456
|
+
checkpoint_dir = os.path.join(output_dir, "checkpoint_best")
|
|
457
|
+
self._upload_to_s3(checkpoint_dir, s3_bucket, f"{s3_key_prefix}/checkpoint_best/")
|
|
444
458
|
|
|
445
459
|
# Run optimization with callback
|
|
446
460
|
study.optimize(
|
|
@@ -521,3 +535,16 @@ class WeightsOptimizer(BaseOptimizer):
|
|
|
521
535
|
}
|
|
522
536
|
with open(os.path.join(checkpoint_dir, "checkpoint_metadata.json"), "w") as f:
|
|
523
537
|
json.dump(metadata, f, indent=2)
|
|
538
|
+
|
|
539
|
+
def _upload_to_s3(self, local_path: str, s3_bucket: str, s3_key: str) -> bool:
|
|
540
|
+
"""Upload a file or directory to S3."""
|
|
541
|
+
import subprocess
|
|
542
|
+
try:
|
|
543
|
+
if os.path.isdir(local_path):
|
|
544
|
+
cmd = ["aws", "s3", "sync", local_path, f"s3://{s3_bucket}/{s3_key}", "--quiet"]
|
|
545
|
+
else:
|
|
546
|
+
cmd = ["aws", "s3", "cp", local_path, f"s3://{s3_bucket}/{s3_key}", "--quiet"]
|
|
547
|
+
subprocess.run(cmd, check=True, capture_output=True)
|
|
548
|
+
return True
|
|
549
|
+
except Exception:
|
|
550
|
+
return False
|
|
@@ -16,7 +16,7 @@ import numpy as np
|
|
|
16
16
|
import torch
|
|
17
17
|
|
|
18
18
|
from wisent.core.activations.activations_collector import ActivationCollector
|
|
19
|
-
from wisent.core.activations.
|
|
19
|
+
from wisent.core.activations.extraction_strategy import ExtractionStrategy
|
|
20
20
|
from wisent.core.activations.activations import Activations
|
|
21
21
|
|
|
22
22
|
logger = logging.getLogger(__name__)
|
|
@@ -29,7 +29,7 @@ class ActivationData:
|
|
|
29
29
|
activations: torch.Tensor
|
|
30
30
|
labels: torch.Tensor
|
|
31
31
|
layer: int
|
|
32
|
-
aggregation:
|
|
32
|
+
aggregation: ExtractionStrategy
|
|
33
33
|
metadata: dict[str, Any]
|
|
34
34
|
|
|
35
35
|
def to_numpy(self) -> tuple[np.ndarray, np.ndarray]:
|
|
@@ -102,7 +102,7 @@ class GenerationConfig:
|
|
|
102
102
|
"""Configuration for activation generation."""
|
|
103
103
|
|
|
104
104
|
layer_search_range: tuple[int, int]
|
|
105
|
-
aggregation_methods: Optional[list[
|
|
105
|
+
aggregation_methods: Optional[list[ExtractionStrategy]] = None
|
|
106
106
|
cache_dir: Optional[str] = None
|
|
107
107
|
device: Optional[str] = None
|
|
108
108
|
dtype: Optional[torch.dtype] = None # Auto-detect if None
|
|
@@ -113,10 +113,10 @@ class GenerationConfig:
|
|
|
113
113
|
self.cache_dir = "./activation_cache"
|
|
114
114
|
if not self.aggregation_methods:
|
|
115
115
|
self.aggregation_methods = [
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
116
|
+
ExtractionStrategy.CHAT_MEAN,
|
|
117
|
+
ExtractionStrategy.CHAT_LAST,
|
|
118
|
+
ExtractionStrategy.CHAT_FIRST,
|
|
119
|
+
ExtractionStrategy.CHAT_MAX_NORM,
|
|
120
120
|
]
|
|
121
121
|
|
|
122
122
|
|
|
@@ -239,7 +239,7 @@ class ActivationGenerator:
|
|
|
239
239
|
return activation_data
|
|
240
240
|
|
|
241
241
|
def _apply_batch_aggregation(
|
|
242
|
-
self, activations: torch.Tensor, strategy:
|
|
242
|
+
self, activations: torch.Tensor, strategy: ExtractionStrategy
|
|
243
243
|
) -> torch.Tensor:
|
|
244
244
|
"""
|
|
245
245
|
Apply aggregation strategy to a batch of activations efficiently.
|
|
@@ -258,14 +258,16 @@ class ActivationGenerator:
|
|
|
258
258
|
return activations
|
|
259
259
|
if len(activations.shape) == 3:
|
|
260
260
|
# [n_samples, n_tokens, hidden_dim] -> [n_samples, hidden_dim]
|
|
261
|
-
if strategy ==
|
|
261
|
+
if strategy == ExtractionStrategy.CHAT_MEAN:
|
|
262
262
|
return torch.mean(activations, dim=1)
|
|
263
|
-
if strategy ==
|
|
263
|
+
if strategy == ExtractionStrategy.CHAT_LAST:
|
|
264
264
|
return activations[:, -1, :]
|
|
265
|
-
if strategy ==
|
|
265
|
+
if strategy == ExtractionStrategy.CHAT_FIRST:
|
|
266
266
|
return activations[:, 0, :]
|
|
267
|
-
if strategy ==
|
|
267
|
+
if strategy == ExtractionStrategy.CHAT_MAX_NORM:
|
|
268
268
|
return torch.max(activations, dim=1)[0]
|
|
269
|
+
if strategy == ExtractionStrategy.CHAT_MEAN:
|
|
270
|
+
return torch.min(activations, dim=1)[0]
|
|
269
271
|
# Default to mean pooling
|
|
270
272
|
self.logger.warning(f"Unknown aggregation strategy {strategy}, using mean pooling")
|
|
271
273
|
return torch.mean(activations, dim=1)
|
|
@@ -14,7 +14,7 @@ from typing import Any, Dict, List, Optional, Tuple
|
|
|
14
14
|
import torch
|
|
15
15
|
from tqdm import tqdm
|
|
16
16
|
|
|
17
|
-
from wisent.core.activations
|
|
17
|
+
from wisent.core.activations import ExtractionStrategy
|
|
18
18
|
from wisent.core.classifier.classifier import Classifier
|
|
19
19
|
from wisent.core.contrastive_pairs.contrastive_pair import ContrastivePair
|
|
20
20
|
from wisent.core.contrastive_pairs.contrastive_pair_set import ContrastivePairSet
|
|
@@ -743,24 +743,29 @@ class SteeringOptimizer:
|
|
|
743
743
|
# Apply aggregation strategy
|
|
744
744
|
if (
|
|
745
745
|
aggregation_strategy == "mean_pooling"
|
|
746
|
-
or aggregation_strategy ==
|
|
746
|
+
or aggregation_strategy == ExtractionStrategy.CHAT_MEAN.value
|
|
747
747
|
):
|
|
748
748
|
aggregated = torch.mean(activation_tensor, dim=1) # [1, hidden_dim]
|
|
749
749
|
elif (
|
|
750
750
|
aggregation_strategy == "last_token"
|
|
751
|
-
or aggregation_strategy ==
|
|
751
|
+
or aggregation_strategy == ExtractionStrategy.CHAT_LAST.value
|
|
752
752
|
):
|
|
753
753
|
aggregated = activation_tensor[:, -1, :] # [1, hidden_dim]
|
|
754
754
|
elif (
|
|
755
755
|
aggregation_strategy == "first_token"
|
|
756
|
-
or aggregation_strategy ==
|
|
756
|
+
or aggregation_strategy == ExtractionStrategy.CHAT_FIRST.value
|
|
757
757
|
):
|
|
758
758
|
aggregated = activation_tensor[:, 0, :] # [1, hidden_dim]
|
|
759
759
|
elif (
|
|
760
760
|
aggregation_strategy == "max_pooling"
|
|
761
|
-
or aggregation_strategy ==
|
|
761
|
+
or aggregation_strategy == ExtractionStrategy.CHAT_MAX_NORM.value
|
|
762
762
|
):
|
|
763
763
|
aggregated = torch.max(activation_tensor, dim=1)[0] # [1, hidden_dim]
|
|
764
|
+
elif (
|
|
765
|
+
aggregation_strategy == "min_pooling"
|
|
766
|
+
or aggregation_strategy == ExtractionStrategy.CHAT_MEAN.value
|
|
767
|
+
):
|
|
768
|
+
aggregated = torch.min(activation_tensor, dim=1)[0] # [1, hidden_dim]
|
|
764
769
|
else:
|
|
765
770
|
# Default to mean pooling if unknown
|
|
766
771
|
self.logger.warning(f"Unknown aggregation strategy {aggregation_strategy}, using mean pooling")
|
|
@@ -1029,10 +1034,10 @@ class SteeringOptimizer:
|
|
|
1029
1034
|
generation_config = GenerationConfig(
|
|
1030
1035
|
layer_search_range=(0, 23), # Will be auto-detected from model
|
|
1031
1036
|
aggregation_methods=[
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1037
|
+
ExtractionStrategy.CHAT_MEAN,
|
|
1038
|
+
ExtractionStrategy.CHAT_LAST,
|
|
1039
|
+
ExtractionStrategy.CHAT_FIRST,
|
|
1040
|
+
ExtractionStrategy.CHAT_MAX_NORM,
|
|
1036
1041
|
],
|
|
1037
1042
|
cache_dir="./cache/steering_activations",
|
|
1038
1043
|
device=optimization_config.device,
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Parser for cluster-benchmarks command."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def setup_cluster_benchmarks_parser(parser: argparse.ArgumentParser) -> None:
|
|
7
|
+
"""Set up arguments for the cluster-benchmarks command."""
|
|
8
|
+
parser.add_argument(
|
|
9
|
+
"--model",
|
|
10
|
+
type=str,
|
|
11
|
+
required=True,
|
|
12
|
+
help="Model name or path (e.g., meta-llama/Llama-3.2-1B-Instruct)"
|
|
13
|
+
)
|
|
14
|
+
parser.add_argument(
|
|
15
|
+
"--output",
|
|
16
|
+
type=str,
|
|
17
|
+
default="./cluster_output",
|
|
18
|
+
help="Output directory for results (default: ./cluster_output)"
|
|
19
|
+
)
|
|
20
|
+
parser.add_argument(
|
|
21
|
+
"--pairs-per-benchmark",
|
|
22
|
+
type=int,
|
|
23
|
+
default=50,
|
|
24
|
+
help="Number of contrastive pairs per benchmark (default: 50)"
|
|
25
|
+
)
|
|
26
|
+
parser.add_argument(
|
|
27
|
+
"--device",
|
|
28
|
+
type=str,
|
|
29
|
+
default=None,
|
|
30
|
+
help="Device to use (cuda/mps/cpu). Auto-detected if not specified."
|
|
31
|
+
)
|
|
@@ -101,6 +101,26 @@ def setup_generate_vector_from_task_parser(parser: argparse.ArgumentParser) -> N
|
|
|
101
101
|
help="Do not L2-normalize steering vectors"
|
|
102
102
|
)
|
|
103
103
|
|
|
104
|
+
# Universal Subspace options (PRISM/TITAN)
|
|
105
|
+
parser.add_argument(
|
|
106
|
+
"--auto-num-directions",
|
|
107
|
+
action="store_true",
|
|
108
|
+
default=False,
|
|
109
|
+
help="Automatically determine num_directions based on explained variance (PRISM/TITAN)"
|
|
110
|
+
)
|
|
111
|
+
parser.add_argument(
|
|
112
|
+
"--use-universal-basis-init",
|
|
113
|
+
action="store_true",
|
|
114
|
+
default=False,
|
|
115
|
+
help="Initialize directions from universal basis (PRISM/TITAN)"
|
|
116
|
+
)
|
|
117
|
+
parser.add_argument(
|
|
118
|
+
"--num-directions",
|
|
119
|
+
type=int,
|
|
120
|
+
default=3,
|
|
121
|
+
help="Number of steering directions for PRISM/TITAN (default: 3)"
|
|
122
|
+
)
|
|
123
|
+
|
|
104
124
|
# Intermediate file handling
|
|
105
125
|
parser.add_argument(
|
|
106
126
|
"--keep-intermediate",
|
|
@@ -39,6 +39,7 @@ from wisent.core.parser_arguments.optimize_weights_parser import setup_optimize_
|
|
|
39
39
|
from wisent.core.parser_arguments.train_unified_goodness_parser import setup_train_unified_goodness_parser
|
|
40
40
|
from wisent.core.parser_arguments.optimize_parser import setup_optimize_parser
|
|
41
41
|
from wisent.core.parser_arguments.check_linearity_parser import setup_check_linearity_parser
|
|
42
|
+
from wisent.core.parser_arguments.cluster_benchmarks_parser import setup_cluster_benchmarks_parser
|
|
42
43
|
|
|
43
44
|
|
|
44
45
|
def setup_parser() -> argparse.ArgumentParser:
|
|
@@ -217,4 +218,11 @@ def setup_parser() -> argparse.ArgumentParser:
|
|
|
217
218
|
)
|
|
218
219
|
setup_check_linearity_parser(check_linearity_parser)
|
|
219
220
|
|
|
221
|
+
# Cluster benchmarks command - cluster benchmarks by direction similarity
|
|
222
|
+
cluster_benchmarks_parser = subparsers.add_parser(
|
|
223
|
+
"cluster-benchmarks",
|
|
224
|
+
help="Cluster benchmarks by direction similarity with geometry analysis"
|
|
225
|
+
)
|
|
226
|
+
setup_cluster_benchmarks_parser(cluster_benchmarks_parser)
|
|
227
|
+
|
|
220
228
|
return parser
|
|
@@ -68,6 +68,12 @@ def setup_steering_optimizer_parser(parser):
|
|
|
68
68
|
default="./baseline_comparison",
|
|
69
69
|
help="Directory to save baseline comparison results (default: ./baseline_comparison)",
|
|
70
70
|
)
|
|
71
|
+
comprehensive_parser.add_argument(
|
|
72
|
+
"--output-dir",
|
|
73
|
+
type=str,
|
|
74
|
+
default="./optimization_results",
|
|
75
|
+
help="Directory to save optimization results (default: ./optimization_results)",
|
|
76
|
+
)
|
|
71
77
|
comprehensive_parser.add_argument(
|
|
72
78
|
"--show-comparisons",
|
|
73
79
|
type=int,
|
|
@@ -170,32 +176,39 @@ def setup_steering_optimizer_parser(parser):
|
|
|
170
176
|
|
|
171
177
|
# Base search space overrides
|
|
172
178
|
comprehensive_parser.add_argument(
|
|
173
|
-
"--search-layers",
|
|
179
|
+
"--search-layers", "--layers",
|
|
174
180
|
type=str,
|
|
175
181
|
default=None,
|
|
182
|
+
dest="search_layers",
|
|
176
183
|
help="Comma-separated layer indices to search (e.g., '8,10,12,14')"
|
|
177
184
|
)
|
|
178
185
|
comprehensive_parser.add_argument(
|
|
179
|
-
"--search-strengths",
|
|
186
|
+
"--search-strengths", "--strengths",
|
|
180
187
|
type=str,
|
|
181
188
|
default=None,
|
|
189
|
+
dest="search_strengths",
|
|
182
190
|
help="Comma-separated strength values to search (e.g., '0.5,1.0,1.5,2.0')"
|
|
183
191
|
)
|
|
184
192
|
comprehensive_parser.add_argument(
|
|
185
|
-
"--search-strategies",
|
|
193
|
+
"--search-strategies", "--strategies",
|
|
186
194
|
type=str,
|
|
187
|
-
nargs="+",
|
|
188
195
|
default=None,
|
|
189
|
-
|
|
190
|
-
help="
|
|
196
|
+
dest="search_strategies",
|
|
197
|
+
help="Comma-separated steering strategies to search (e.g., 'constant,initial_only,diminishing,increasing,gaussian')"
|
|
191
198
|
)
|
|
192
199
|
comprehensive_parser.add_argument(
|
|
193
|
-
"--search-token-aggregations",
|
|
200
|
+
"--search-token-aggregations", "--token-aggregations",
|
|
194
201
|
type=str,
|
|
195
|
-
nargs="+",
|
|
196
202
|
default=None,
|
|
197
|
-
|
|
198
|
-
help="
|
|
203
|
+
dest="search_token_aggregations",
|
|
204
|
+
help="Comma-separated token aggregation strategies (e.g., 'last_token,mean_pooling,first_token,max_pooling,continuation_token,choice_token')"
|
|
205
|
+
)
|
|
206
|
+
comprehensive_parser.add_argument(
|
|
207
|
+
"--search-prompt-constructions", "--prompt-constructions",
|
|
208
|
+
type=str,
|
|
209
|
+
default=None,
|
|
210
|
+
dest="search_prompt_constructions",
|
|
211
|
+
help="Comma-separated prompt construction strategies (e.g., 'chat_template,direct_completion,instruction_following,multiple_choice,role_playing')"
|
|
199
212
|
)
|
|
200
213
|
|
|
201
214
|
# PRISM-specific search space
|
|
@@ -615,3 +628,97 @@ def setup_steering_optimizer_parser(parser):
|
|
|
615
628
|
action="store_true",
|
|
616
629
|
help="Save optimal parameters as default for this model/task combination",
|
|
617
630
|
)
|
|
631
|
+
|
|
632
|
+
# ==========================================================================
|
|
633
|
+
# UNIVERSAL METHOD OPTIMIZER (NEW)
|
|
634
|
+
# ==========================================================================
|
|
635
|
+
# This optimizer uses the universal train(pair_set) interface that ALL
|
|
636
|
+
# steering methods implement, ensuring it works with any method including
|
|
637
|
+
# future ones.
|
|
638
|
+
|
|
639
|
+
universal_parser = steering_subparsers.add_parser(
|
|
640
|
+
"universal",
|
|
641
|
+
help="Universal optimizer that works with ANY steering method (recommended)"
|
|
642
|
+
)
|
|
643
|
+
universal_parser.add_argument("model", type=str, help="Model name or path")
|
|
644
|
+
universal_parser.add_argument(
|
|
645
|
+
"--task",
|
|
646
|
+
type=str,
|
|
647
|
+
required=True,
|
|
648
|
+
help="Task/benchmark to optimize for (e.g., truthfulqa_generation, arc_easy)"
|
|
649
|
+
)
|
|
650
|
+
universal_parser.add_argument(
|
|
651
|
+
"--method",
|
|
652
|
+
type=str,
|
|
653
|
+
default="CAA",
|
|
654
|
+
choices=AVAILABLE_METHODS + [m.lower() for m in AVAILABLE_METHODS],
|
|
655
|
+
help=f"Steering method to optimize. Available: {', '.join(AVAILABLE_METHODS)} (default: CAA)"
|
|
656
|
+
)
|
|
657
|
+
universal_parser.add_argument(
|
|
658
|
+
"--limit",
|
|
659
|
+
type=int,
|
|
660
|
+
default=100,
|
|
661
|
+
help="Maximum samples to use (default: 100)"
|
|
662
|
+
)
|
|
663
|
+
universal_parser.add_argument(
|
|
664
|
+
"--quick",
|
|
665
|
+
action="store_true",
|
|
666
|
+
help="Use reduced search space for faster testing"
|
|
667
|
+
)
|
|
668
|
+
universal_parser.add_argument(
|
|
669
|
+
"--max-configs",
|
|
670
|
+
type=int,
|
|
671
|
+
default=None,
|
|
672
|
+
help="Maximum number of configurations to test (default: all)"
|
|
673
|
+
)
|
|
674
|
+
universal_parser.add_argument(
|
|
675
|
+
"--output-dir",
|
|
676
|
+
type=str,
|
|
677
|
+
default="./optimization_results",
|
|
678
|
+
help="Directory to save results (default: ./optimization_results)"
|
|
679
|
+
)
|
|
680
|
+
universal_parser.add_argument(
|
|
681
|
+
"--save-best-vector",
|
|
682
|
+
action="store_true",
|
|
683
|
+
help="Save the best steering vector to output directory"
|
|
684
|
+
)
|
|
685
|
+
universal_parser.add_argument("--device", type=str, default=None, help="Device to run on")
|
|
686
|
+
universal_parser.add_argument("--verbose", action="store_true", help="Enable verbose output")
|
|
687
|
+
|
|
688
|
+
# Search space customization
|
|
689
|
+
universal_parser.add_argument(
|
|
690
|
+
"--layers",
|
|
691
|
+
type=str,
|
|
692
|
+
default=None,
|
|
693
|
+
help="Comma-separated layer indices to search (e.g., '8,10,12,14')"
|
|
694
|
+
)
|
|
695
|
+
universal_parser.add_argument(
|
|
696
|
+
"--strengths",
|
|
697
|
+
type=str,
|
|
698
|
+
default=None,
|
|
699
|
+
help="Comma-separated strength values to search (e.g., '0.5,1.0,1.5,2.0')"
|
|
700
|
+
)
|
|
701
|
+
universal_parser.add_argument(
|
|
702
|
+
"--token-aggregations",
|
|
703
|
+
type=str,
|
|
704
|
+
nargs="+",
|
|
705
|
+
default=None,
|
|
706
|
+
choices=["last_token", "mean_pooling", "first_token", "max_pooling", "continuation_token"],
|
|
707
|
+
help="Token aggregation strategies to search"
|
|
708
|
+
)
|
|
709
|
+
universal_parser.add_argument(
|
|
710
|
+
"--prompt-strategies",
|
|
711
|
+
type=str,
|
|
712
|
+
nargs="+",
|
|
713
|
+
default=None,
|
|
714
|
+
choices=["chat_template", "direct_completion", "multiple_choice", "role_playing", "instruction_following"],
|
|
715
|
+
help="Prompt construction strategies to search"
|
|
716
|
+
)
|
|
717
|
+
|
|
718
|
+
# Method-specific parameter overrides (JSON format)
|
|
719
|
+
universal_parser.add_argument(
|
|
720
|
+
"--method-params",
|
|
721
|
+
type=str,
|
|
722
|
+
default=None,
|
|
723
|
+
help='JSON dict of method-specific parameter ranges, e.g., \'{"num_directions": [2, 3, 5]}\''
|
|
724
|
+
)
|
|
@@ -150,6 +150,12 @@ def setup_optimize_weights_parser(parser: argparse.ArgumentParser) -> None:
|
|
|
150
150
|
default=5,
|
|
151
151
|
help="Save checkpoint and best model every N trials. Default: 5"
|
|
152
152
|
)
|
|
153
|
+
parser.add_argument(
|
|
154
|
+
"--s3-bucket",
|
|
155
|
+
type=str,
|
|
156
|
+
default=None,
|
|
157
|
+
help="S3 bucket to upload results to (e.g., 'wisent-optimization-results'). Results will be uploaded on completion."
|
|
158
|
+
)
|
|
153
159
|
|
|
154
160
|
# ==========================================================================
|
|
155
161
|
# EVALUATION CONFIGURATION
|