wisent 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wisent/__init__.py +1 -1
- wisent/core/activations/__init__.py +22 -6
- wisent/core/activations/activations.py +21 -39
- wisent/core/activations/activations_collector.py +141 -373
- wisent/core/activations/classifier_inference_strategy.py +194 -0
- wisent/core/activations/core/atoms.py +8 -92
- wisent/core/activations/extraction_strategy.py +308 -0
- wisent/core/agent/diagnose/response_diagnostics.py +3 -3
- wisent/core/agent/diagnose.py +3 -3
- wisent/core/autonomous_agent.py +2 -2
- wisent/core/cli/agent/apply_steering.py +23 -27
- wisent/core/cli/agent/evaluate_response.py +18 -20
- wisent/core/cli/agent/train_classifier.py +18 -20
- wisent/core/cli/cluster_benchmarks.py +472 -0
- wisent/core/cli/create_steering_vector.py +13 -5
- wisent/core/cli/generate_vector_from_task.py +4 -0
- wisent/core/cli/get_activations.py +12 -36
- wisent/core/cli/method_optimizer.py +859 -0
- wisent/core/cli/optimize.py +44 -5
- wisent/core/cli/optimize_classification.py +5 -6
- wisent/core/cli/optimize_sample_size.py +8 -22
- wisent/core/cli/optimize_steering.py +429 -153
- wisent/core/cli/optimize_weights.py +65 -6
- wisent/core/cli/steering_method_trainer.py +5 -4
- wisent/core/cli/steering_search_space.py +20 -15
- wisent/core/cli/tasks.py +14 -43
- wisent/core/cli/train_unified_goodness.py +17 -18
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1578 -173
- wisent/core/contrastive_pairs/diagnostics/linearity.py +63 -80
- wisent/core/contrastive_pairs/diagnostics/vector_quality.py +6 -5
- wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +5 -19
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +11 -5
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +146 -32
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +2 -2
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +98 -57
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +8 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval_aqua_rat.py +129 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +11 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +47 -6
- wisent/core/evaluators/benchmark_specific/apps_evaluator.py +133 -0
- wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +6 -1
- wisent/core/evaluators/benchmark_specific/conala_evaluator.py +31 -168
- wisent/core/evaluators/custom/examples/humanization_coherent.py +89 -35
- wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +2 -20
- wisent/core/evaluators/personalization/coherence.py +46 -0
- wisent/core/hyperparameter_optimizer.py +13 -13
- wisent/core/lm_eval_harness_ground_truth.py +7 -11
- wisent/core/main.py +3 -0
- wisent/core/models/wisent_model.py +8 -7
- wisent/core/opti/methods/opti_weights.py +29 -2
- wisent/core/optuna/classifier/activation_generator.py +14 -12
- wisent/core/optuna/steering/steering_optimization.py +14 -9
- wisent/core/parser_arguments/cluster_benchmarks_parser.py +31 -0
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +20 -0
- wisent/core/parser_arguments/main_parser.py +8 -0
- wisent/core/parser_arguments/optimize_steering_parser.py +117 -10
- wisent/core/parser_arguments/optimize_weights_parser.py +6 -0
- wisent/core/parser_arguments/tasks_parser.py +7 -19
- wisent/core/steering_methods/core/atoms.py +1 -2
- wisent/core/steering_methods/methods/caa.py +1 -1
- wisent/core/steering_methods/methods/hyperplane.py +74 -0
- wisent/core/steering_methods/methods/prism.py +1 -2
- wisent/core/steering_methods/methods/pulse.py +39 -8
- wisent/core/steering_methods/methods/titan.py +59 -14
- wisent/core/steering_methods/registry.py +52 -12
- wisent/core/steering_optimizer.py +15 -15
- wisent/core/trainers/steering_trainer.py +9 -18
- wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +19 -70
- wisent/scripts/run_quality_metrics_sweep.sh +22 -27
- wisent/tests/test_aggregation_geometry.py +236 -0
- wisent/tests/test_detector_accuracy.py +163 -0
- wisent/tests/test_geometry_exhaustive.py +1202 -0
- wisent/tests/visualize_geometry.py +255 -61
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/METADATA +1 -1
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/RECORD +82 -714
- wisent/core/activations/prompt_construction_strategy.py +0 -47
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +0 -15
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +0 -64
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +0 -99
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +0 -180
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +0 -129
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +0 -142
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +0 -155
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +0 -161
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +0 -107
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +0 -155
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +0 -155
- wisent/examples/scripts/results/benchmark_descriptions.json +0 -1244
- wisent/examples/scripts/results/benchmark_evaluation_methods.json +0 -66
- wisent/examples/scripts/results/benchmark_evaluator_mapping.json +0 -2781
- wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +0 -30536
- wisent/examples/scripts/results/benchmark_evaluators_clean.json +0 -469
- wisent/examples/scripts/results/benchmark_methods_summary.json +0 -260
- wisent/examples/scripts/results/benchmark_pair_creation_methods.json +0 -66
- wisent/examples/scripts/results/benchmark_pair_totals.json +0 -269
- wisent/examples/scripts/results/benchmark_tags.json +0 -917
- wisent/examples/scripts/results/benchmark_test_summary_nov4.json +0 -71
- wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +0 -150
- wisent/examples/scripts/results/failing_benchmarks.json +0 -946
- wisent/examples/scripts/results/failing_benchmarks_list.json +0 -41
- wisent/examples/scripts/results/failing_benchmarks_test_results.json +0 -945
- wisent/examples/scripts/results/missing_benchmark_tags.json +0 -341
- wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +0 -30
- wisent/examples/scripts/results/test_20_newsgroups_pairs.json +0 -8
- wisent/examples/scripts/results/test_AraDICE_evaluation.json +0 -51
- wisent/examples/scripts/results/test_AraDICE_pairs.json +0 -14
- wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +0 -8
- wisent/examples/scripts/results/test_ArabCulture_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ArabCulture_pairs.json +0 -14
- wisent/examples/scripts/results/test_Tag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_Tag_pairs.json +0 -8
- wisent/examples/scripts/results/test_aclue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aclue_pairs.json +0 -14
- wisent/examples/scripts/results/test_acp_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +0 -14
- wisent/examples/scripts/results/test_acp_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +0 -51
- wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +0 -14
- wisent/examples/scripts/results/test_aexams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aexams_pairs.json +0 -14
- wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_ag_news_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ag_news_pairs.json +0 -8
- wisent/examples/scripts/results/test_agieval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_agieval_pairs.json +0 -14
- wisent/examples/scripts/results/test_aime2024_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime2024_pairs.json +0 -8
- wisent/examples/scripts/results/test_aime2025_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime2025_pairs.json +0 -8
- wisent/examples/scripts/results/test_aime_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime_pairs.json +0 -8
- wisent/examples/scripts/results/test_anagrams1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anagrams1_pairs.json +0 -8
- wisent/examples/scripts/results/test_anagrams2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anagrams2_pairs.json +0 -8
- wisent/examples/scripts/results/test_anli_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anli_pairs.json +0 -8
- wisent/examples/scripts/results/test_apps_evaluation.json +0 -30
- wisent/examples/scripts/results/test_apps_pairs.json +0 -8
- wisent/examples/scripts/results/test_arabic_exams_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arabic_exams_pairs.json +0 -8
- wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +0 -14
- wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +0 -14
- wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabicmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +0 -14
- wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +0 -14
- wisent/examples/scripts/results/test_arc_ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_arc_challenge_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_challenge_pairs.json +0 -8
- wisent/examples/scripts/results/test_arc_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_argument_topic_evaluation.json +0 -30
- wisent/examples/scripts/results/test_argument_topic_pairs.json +0 -8
- wisent/examples/scripts/results/test_arithmetic_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arithmetic_pairs.json +0 -14
- wisent/examples/scripts/results/test_asdiv_evaluation.json +0 -30
- wisent/examples/scripts/results/test_asdiv_pairs.json +0 -8
- wisent/examples/scripts/results/test_assin_entailment_evaluation.json +0 -30
- wisent/examples/scripts/results/test_assin_entailment_pairs.json +0 -8
- wisent/examples/scripts/results/test_atis_evaluation.json +0 -30
- wisent/examples/scripts/results/test_atis_pairs.json +0 -8
- wisent/examples/scripts/results/test_babi_evaluation.json +0 -30
- wisent/examples/scripts/results/test_babi_pairs.json +0 -8
- wisent/examples/scripts/results/test_babilong_evaluation.json +0 -30
- wisent/examples/scripts/results/test_babilong_pairs.json +0 -8
- wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +0 -8
- wisent/examples/scripts/results/test_banking77_evaluation.json +0 -30
- wisent/examples/scripts/results/test_banking77_pairs.json +0 -8
- wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basqueglue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basqueglue_pairs.json +0 -14
- wisent/examples/scripts/results/test_bbh_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bbh_pairs.json +0 -14
- wisent/examples/scripts/results/test_bbq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bbq_pairs.json +0 -8
- wisent/examples/scripts/results/test_bec2016eu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bec2016eu_pairs.json +0 -14
- wisent/examples/scripts/results/test_belebele_evaluation.json +0 -51
- wisent/examples/scripts/results/test_belebele_pairs.json +0 -14
- wisent/examples/scripts/results/test_benchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_benchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_bertaqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bertaqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bhtc_v2_pairs.json +0 -8
- wisent/examples/scripts/results/test_bigbench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bigbench_pairs.json +0 -14
- wisent/examples/scripts/results/test_blimp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_blimp_pairs.json +0 -14
- wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +0 -8
- wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +0 -8
- wisent/examples/scripts/results/test_boolq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq_pairs.json +0 -8
- wisent/examples/scripts/results/test_c4_evaluation.json +0 -30
- wisent/examples/scripts/results/test_c4_pairs.json +0 -8
- wisent/examples/scripts/results/test_cabreu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cabreu_pairs.json +0 -8
- wisent/examples/scripts/results/test_careqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_careqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_catalan_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_catalan_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_catalanqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_catalanqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_catcola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_catcola_pairs.json +0 -8
- wisent/examples/scripts/results/test_cb_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cb_pairs.json +0 -8
- wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +0 -14
- wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +0 -8
- wisent/examples/scripts/results/test_ceval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval_pairs.json +0 -14
- wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +0 -14
- wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +0 -51
- wisent/examples/scripts/results/test_chain_of_thought_pairs.json +0 -14
- wisent/examples/scripts/results/test_chartqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_chartqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +0 -30
- wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +0 -8
- wisent/examples/scripts/results/test_cmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_cmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +0 -8
- wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cocoteros_es_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +0 -8
- wisent/examples/scripts/results/test_coedit_gec_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coedit_gec_pairs.json +0 -8
- wisent/examples/scripts/results/test_cola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cola_pairs.json +0 -8
- wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_commonsense_qa_pairs.json +0 -8
- wisent/examples/scripts/results/test_conala_evaluation.json +0 -30
- wisent/examples/scripts/results/test_conala_pairs.json +0 -8
- wisent/examples/scripts/results/test_concode_evaluation.json +0 -30
- wisent/examples/scripts/results/test_concode_pairs.json +0 -8
- wisent/examples/scripts/results/test_copa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_copa_pairs.json +0 -8
- wisent/examples/scripts/results/test_copal_id_evaluation.json +0 -30
- wisent/examples/scripts/results/test_copal_id_pairs.json +0 -8
- wisent/examples/scripts/results/test_coqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_coqcat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coqcat_pairs.json +0 -8
- wisent/examples/scripts/results/test_crows_pairs_evaluation.json +0 -51
- wisent/examples/scripts/results/test_crows_pairs_pairs.json +0 -14
- wisent/examples/scripts/results/test_csatqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_csatqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_cycle_letters_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cycle_letters_pairs.json +0 -8
- wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_darija_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darija_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_darijahellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_darijammlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darijammlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +0 -30
- wisent/examples/scripts/results/test_dbpedia_14_pairs.json +0 -8
- wisent/examples/scripts/results/test_drop_evaluation.json +0 -30
- wisent/examples/scripts/results/test_drop_pairs.json +0 -8
- wisent/examples/scripts/results/test_ds1000_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ds1000_pairs.json +0 -8
- wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_egyhellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_egymmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_egymmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +0 -30
- wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +0 -8
- wisent/examples/scripts/results/test_eq_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eq_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_escola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_escola_pairs.json +0 -8
- wisent/examples/scripts/results/test_ethics_cm_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ethics_cm_pairs.json +0 -8
- wisent/examples/scripts/results/test_ethos_binary_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ethos_binary_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams_es_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_exams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_proficiency_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_reading_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_reading_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_trivia_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_trivia_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita-mp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita-mp_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_LLM_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
- wisent/examples/scripts/results/test_fda_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fda_pairs.json +0 -8
- wisent/examples/scripts/results/test_financial_tweets_evaluation.json +0 -30
- wisent/examples/scripts/results/test_financial_tweets_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_flores_evaluation.json +0 -51
- wisent/examples/scripts/results/test_flores_pairs.json +0 -14
- wisent/examples/scripts/results/test_freebase_evaluation.json +0 -30
- wisent/examples/scripts/results/test_freebase_pairs.json +0 -8
- wisent/examples/scripts/results/test_french_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_french_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_galcola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_galcola_pairs.json +0 -8
- wisent/examples/scripts/results/test_galician_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_galician_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_glianorex_evaluation.json +0 -30
- wisent/examples/scripts/results/test_glianorex_pairs.json +0 -8
- wisent/examples/scripts/results/test_global_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_global_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_gpqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_gpqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_groundcocoa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_groundcocoa_pairs.json +0 -8
- wisent/examples/scripts/results/test_gsm8k_evaluation.json +0 -30
- wisent/examples/scripts/results/test_gsm8k_pairs.json +0 -8
- wisent/examples/scripts/results/test_haerae_evaluation.json +0 -51
- wisent/examples/scripts/results/test_haerae_pairs.json +0 -14
- wisent/examples/scripts/results/test_headqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_headqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_hellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +0 -14
- wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hendrycks_math_pairs.json +0 -14
- wisent/examples/scripts/results/test_histoires_morales_evaluation.json +0 -30
- wisent/examples/scripts/results/test_histoires_morales_pairs.json +0 -8
- wisent/examples/scripts/results/test_hmmt_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +0 -8
- wisent/examples/scripts/results/test_hmmt_pairs.json +0 -8
- wisent/examples/scripts/results/test_hrm8k_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hrm8k_pairs.json +0 -14
- wisent/examples/scripts/results/test_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +0 -30
- wisent/examples/scripts/results/test_humaneval_plus_pairs.json +0 -8
- wisent/examples/scripts/results/test_ifeval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ifeval_pairs.json +0 -8
- wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +0 -51
- wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +0 -30
- wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +0 -8
- wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +0 -51
- wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +0 -14
- wisent/examples/scripts/results/test_inverse_scaling_pairs.json +0 -14
- wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +0 -8
- wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +0 -14
- wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_kbl_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +0 -14
- wisent/examples/scripts/results/test_kbl_pairs.json +0 -14
- wisent/examples/scripts/results/test_kmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_kobest_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kobest_pairs.json +0 -14
- wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +0 -8
- wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_cloze_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_openai_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_openai_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_standard_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_standard_pairs.json +0 -8
- wisent/examples/scripts/results/test_leaderboard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_leaderboard_pairs.json +0 -14
- wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +0 -51
- wisent/examples/scripts/results/test_libra/test_libra_pairs.json +0 -14
- wisent/examples/scripts/results/test_libra_evaluation.json +0 -51
- wisent/examples/scripts/results/test_libra_pairs.json +0 -14
- wisent/examples/scripts/results/test_lingoly_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lingoly_pairs.json +0 -8
- wisent/examples/scripts/results/test_livecodebench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livecodebench_pairs.json +0 -8
- wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +0 -8
- wisent/examples/scripts/results/test_llama_evaluation.json +0 -30
- wisent/examples/scripts/results/test_llama_pairs.json +0 -8
- wisent/examples/scripts/results/test_logiqa2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_logiqa2_pairs.json +0 -8
- wisent/examples/scripts/results/test_logiqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_logiqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_m_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_m_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +0 -14
- wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_mastermind_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mastermind_pairs.json +0 -14
- wisent/examples/scripts/results/test_math500_evaluation.json +0 -30
- wisent/examples/scripts/results/test_math500_pairs.json +0 -8
- wisent/examples/scripts/results/test_math_evaluation.json +0 -30
- wisent/examples/scripts/results/test_math_pairs.json +0 -8
- wisent/examples/scripts/results/test_mathqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mathqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_mbpp_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mbpp_pairs.json +0 -8
- wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mbpp_plus_pairs.json +0 -8
- wisent/examples/scripts/results/test_mc_taco_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mc_taco_pairs.json +0 -8
- wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +0 -14
- wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +0 -14
- wisent/examples/scripts/results/test_meddialog_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meddialog_pairs.json +0 -8
- wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +0 -8
- wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +0 -8
- wisent/examples/scripts/results/test_medmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_medqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_medtext_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medtext_pairs.json +0 -8
- wisent/examples/scripts/results/test_mela_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mela_pairs.json +0 -14
- wisent/examples/scripts/results/test_meqsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meqsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_mercury_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mercury_pairs.json +0 -8
- wisent/examples/scripts/results/test_metabench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_metabench_pairs.json +0 -14
- wisent/examples/scripts/results/test_mgsm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mgsm_pairs.json +0 -14
- wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mimic_repsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_minerva_math_evaluation.json +0 -51
- wisent/examples/scripts/results/test_minerva_math_pairs.json +0 -14
- wisent/examples/scripts/results/test_mlqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mlqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_pro_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_prox_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlusr_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mmlusr_pairs.json +0 -8
- wisent/examples/scripts/results/test_mmmu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmmu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mnli_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mnli_pairs.json +0 -8
- wisent/examples/scripts/results/test_model_written_evals_evaluation.json +0 -51
- wisent/examples/scripts/results/test_model_written_evals_pairs.json +0 -14
- wisent/examples/scripts/results/test_moral_stories_evaluation.json +0 -30
- wisent/examples/scripts/results/test_moral_stories_pairs.json +0 -8
- wisent/examples/scripts/results/test_mts_dialog_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mts_dialog_pairs.json +0 -8
- wisent/examples/scripts/results/test_multiblimp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_multiblimp_pairs.json +0 -14
- wisent/examples/scripts/results/test_multimedqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_multimedqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_multipl_e_evaluation.json +0 -30
- wisent/examples/scripts/results/test_multipl_e_pairs.json +0 -8
- wisent/examples/scripts/results/test_mutual_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mutual_pairs.json +0 -8
- wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_noreval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_noreval_pairs.json +0 -14
- wisent/examples/scripts/results/test_noticia_evaluation.json +0 -30
- wisent/examples/scripts/results/test_noticia_pairs.json +0 -8
- wisent/examples/scripts/results/test_nq_open_evaluation.json +0 -30
- wisent/examples/scripts/results/test_nq_open_pairs.json +0 -8
- wisent/examples/scripts/results/test_olaph_evaluation.json +0 -30
- wisent/examples/scripts/results/test_olaph_pairs.json +0 -8
- wisent/examples/scripts/results/test_openbookqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_openbookqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_openllm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_openllm_pairs.json +0 -14
- wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_paloma_evaluation.json +0 -51
- wisent/examples/scripts/results/test_paloma_pairs.json +0 -14
- wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +0 -30
- wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +0 -8
- wisent/examples/scripts/results/test_paws-x_evaluation.json +0 -51
- wisent/examples/scripts/results/test_paws-x_pairs.json +0 -14
- wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_penn_treebank_evaluation.json +0 -30
- wisent/examples/scripts/results/test_penn_treebank_pairs.json +0 -8
- wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +0 -30
- wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +0 -8
- wisent/examples/scripts/results/test_piqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_piqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_polemo2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polemo2_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_en_high_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +0 -8
- wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_portuguese_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_prost_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prost_pairs.json +0 -8
- wisent/examples/scripts/results/test_ptb_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ptb_pairs.json +0 -8
- wisent/examples/scripts/results/test_pubmedqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_pubmedqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_pythia_evaluation.json +0 -51
- wisent/examples/scripts/results/test_pythia_pairs.json +0 -14
- wisent/examples/scripts/results/test_qa4mre_evaluation.json +0 -30
- wisent/examples/scripts/results/test_qa4mre_pairs.json +0 -8
- wisent/examples/scripts/results/test_qasper_evaluation.json +0 -30
- wisent/examples/scripts/results/test_qasper_pairs.json +0 -8
- wisent/examples/scripts/results/test_race_evaluation.json +0 -30
- wisent/examples/scripts/results/test_race_pairs.json +0 -8
- wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +0 -30
- wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +0 -8
- wisent/examples/scripts/results/test_recode_evaluation.json +0 -30
- wisent/examples/scripts/results/test_recode_pairs.json +0 -8
- wisent/examples/scripts/results/test_record_evaluation.json +0 -30
- wisent/examples/scripts/results/test_record_pairs.json +0 -8
- wisent/examples/scripts/results/test_ruler_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ruler_pairs.json +0 -14
- wisent/examples/scripts/results/test_sciq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_sciq_pairs.json +0 -8
- wisent/examples/scripts/results/test_score_evaluation.json +0 -51
- wisent/examples/scripts/results/test_score_pairs.json +0 -14
- wisent/examples/scripts/results/test_self_consistency_evaluation.json +0 -30
- wisent/examples/scripts/results/test_self_consistency_pairs.json +0 -8
- wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_siqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_siqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_spanish_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_spanish_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_squad2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_squad2_pairs.json +0 -8
- wisent/examples/scripts/results/test_squadv2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_squadv2_pairs.json +0 -8
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +0 -8
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +0 -51
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +0 -14
- wisent/examples/scripts/results/test_swag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_swag_pairs.json +0 -8
- wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_tmmluplus_evaluation.json +0 -51
- wisent/examples/scripts/results/test_tmmluplus_pairs.json +0 -14
- wisent/examples/scripts/results/test_translation_evaluation.json +0 -51
- wisent/examples/scripts/results/test_translation_pairs.json +0 -14
- wisent/examples/scripts/results/test_triviaqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_triviaqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +0 -51
- wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +0 -14
- wisent/examples/scripts/results/test_truthfulqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_turkishmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_unfair_tos_evaluation.json +0 -30
- wisent/examples/scripts/results/test_unfair_tos_pairs.json +0 -8
- wisent/examples/scripts/results/test_unscramble_evaluation.json +0 -51
- wisent/examples/scripts/results/test_unscramble_pairs.json +0 -14
- wisent/examples/scripts/results/test_webqs_evaluation.json +0 -30
- wisent/examples/scripts/results/test_webqs_pairs.json +0 -8
- wisent/examples/scripts/results/test_wikitext103_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wikitext103_pairs.json +0 -8
- wisent/examples/scripts/results/test_wikitext_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wikitext_pairs.json +0 -8
- wisent/examples/scripts/results/test_winogender_evaluation.json +0 -51
- wisent/examples/scripts/results/test_winogender_pairs.json +0 -14
- wisent/examples/scripts/results/test_winogrande_evaluation.json +0 -30
- wisent/examples/scripts/results/test_winogrande_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmdp_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmdp_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_wsc273_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wsc273_pairs.json +0 -8
- wisent/examples/scripts/results/test_xcopa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xcopa_pairs.json +0 -14
- wisent/examples/scripts/results/test_xnli_eu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_xnli_eu_pairs.json +0 -8
- wisent/examples/scripts/results/test_xnli_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xnli_pairs.json +0 -14
- wisent/examples/scripts/results/test_xquad_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xquad_pairs.json +0 -14
- wisent/examples/scripts/results/test_xstorycloze_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xstorycloze_pairs.json +0 -14
- wisent/examples/scripts/results/test_xsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_xsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_xwinograd_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xwinograd_pairs.json +0 -14
- wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +0 -30
- wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +0 -8
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/WHEEL +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/entry_points.txt +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/top_level.txt +0 -0
|
@@ -8,8 +8,7 @@ from enum import Enum
|
|
|
8
8
|
|
|
9
9
|
import torch
|
|
10
10
|
|
|
11
|
-
from wisent.core.activations.
|
|
12
|
-
from wisent.core.activations.prompt_construction_strategy import PromptConstructionStrategy
|
|
11
|
+
from wisent.core.activations.extraction_strategy import ExtractionStrategy
|
|
13
12
|
|
|
14
13
|
|
|
15
14
|
class LinearityVerdict(Enum):
|
|
@@ -35,11 +34,8 @@ class LinearityConfig:
|
|
|
35
34
|
layers_to_test: Optional[List[int]] = None
|
|
36
35
|
"""Specific layers to test. If None, tests sample across depth."""
|
|
37
36
|
|
|
38
|
-
|
|
39
|
-
"""
|
|
40
|
-
|
|
41
|
-
prompt_strategies: Optional[List[PromptConstructionStrategy]] = None
|
|
42
|
-
"""Prompt strategies to test. If None, tests all."""
|
|
37
|
+
extraction_strategies: Optional[List[ExtractionStrategy]] = None
|
|
38
|
+
"""Extraction strategies to test. If None, tests default set."""
|
|
43
39
|
|
|
44
40
|
normalize_options: List[bool] = field(default_factory=lambda: [False, True])
|
|
45
41
|
"""Normalization options to test."""
|
|
@@ -128,24 +124,15 @@ def check_linearity(
|
|
|
128
124
|
else:
|
|
129
125
|
layers_to_test = cfg.layers_to_test
|
|
130
126
|
|
|
131
|
-
# Determine
|
|
132
|
-
if cfg.
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
]
|
|
138
|
-
else:
|
|
139
|
-
aggregation_strategies = cfg.aggregation_strategies
|
|
140
|
-
|
|
141
|
-
# Determine prompt strategies
|
|
142
|
-
if cfg.prompt_strategies is None:
|
|
143
|
-
prompt_strategies = [
|
|
144
|
-
PromptConstructionStrategy.CHAT_TEMPLATE,
|
|
145
|
-
PromptConstructionStrategy.DIRECT_COMPLETION,
|
|
127
|
+
# Determine extraction strategies
|
|
128
|
+
if cfg.extraction_strategies is None:
|
|
129
|
+
extraction_strategies = [
|
|
130
|
+
ExtractionStrategy.CHAT_LAST,
|
|
131
|
+
ExtractionStrategy.CHAT_MEAN,
|
|
132
|
+
ExtractionStrategy.CHAT_MAX_NORM,
|
|
146
133
|
]
|
|
147
134
|
else:
|
|
148
|
-
|
|
135
|
+
extraction_strategies = cfg.extraction_strategies
|
|
149
136
|
|
|
150
137
|
# Limit pairs
|
|
151
138
|
test_pairs = pairs[:cfg.max_pairs]
|
|
@@ -157,62 +144,59 @@ def check_linearity(
|
|
|
157
144
|
|
|
158
145
|
all_results = []
|
|
159
146
|
|
|
160
|
-
for
|
|
161
|
-
for
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
prompt_strategy=prompt_strategy,
|
|
175
|
-
)
|
|
176
|
-
|
|
177
|
-
pos_la = pair_with_acts.positive_response.layers_activations
|
|
178
|
-
neg_la = pair_with_acts.negative_response.layers_activations
|
|
179
|
-
|
|
180
|
-
if pos_la and neg_la:
|
|
181
|
-
for layer in layers_to_test:
|
|
182
|
-
pos_t = pos_la.get(str(layer))
|
|
183
|
-
neg_t = neg_la.get(str(layer))
|
|
184
|
-
if pos_t is not None and neg_t is not None:
|
|
185
|
-
pos_activations[layer].append(pos_t.flatten().cpu())
|
|
186
|
-
neg_activations[layer].append(neg_t.flatten().cpu())
|
|
187
|
-
except Exception:
|
|
188
|
-
continue
|
|
189
|
-
|
|
190
|
-
# Analyze each layer
|
|
191
|
-
for layer in layers_to_test:
|
|
192
|
-
pos_list = pos_activations[layer]
|
|
193
|
-
neg_list = neg_activations[layer]
|
|
194
|
-
|
|
195
|
-
if len(pos_list) < 10 or len(neg_list) < 10:
|
|
196
|
-
continue
|
|
147
|
+
for strategy in extraction_strategies:
|
|
148
|
+
for normalize in cfg.normalize_options:
|
|
149
|
+
# Collect activations
|
|
150
|
+
pos_activations = {l: [] for l in layers_to_test}
|
|
151
|
+
neg_activations = {l: [] for l in layers_to_test}
|
|
152
|
+
|
|
153
|
+
for pair in test_pairs:
|
|
154
|
+
try:
|
|
155
|
+
pair_with_acts = collector.collect(
|
|
156
|
+
pair,
|
|
157
|
+
strategy=strategy,
|
|
158
|
+
layers=[str(l) for l in layers_to_test],
|
|
159
|
+
normalize=normalize,
|
|
160
|
+
)
|
|
197
161
|
|
|
198
|
-
|
|
199
|
-
|
|
162
|
+
pos_la = pair_with_acts.positive_response.layers_activations
|
|
163
|
+
neg_la = pair_with_acts.negative_response.layers_activations
|
|
200
164
|
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
165
|
+
if pos_la and neg_la:
|
|
166
|
+
for layer in layers_to_test:
|
|
167
|
+
pos_t = pos_la.get(str(layer))
|
|
168
|
+
neg_t = neg_la.get(str(layer))
|
|
169
|
+
if pos_t is not None and neg_t is not None:
|
|
170
|
+
pos_activations[layer].append(pos_t.flatten().cpu())
|
|
171
|
+
neg_activations[layer].append(neg_t.flatten().cpu())
|
|
172
|
+
except Exception:
|
|
173
|
+
continue
|
|
174
|
+
|
|
175
|
+
# Analyze each layer
|
|
176
|
+
for layer in layers_to_test:
|
|
177
|
+
pos_list = pos_activations[layer]
|
|
178
|
+
neg_list = neg_activations[layer]
|
|
179
|
+
|
|
180
|
+
if len(pos_list) < 10 or len(neg_list) < 10:
|
|
181
|
+
continue
|
|
182
|
+
|
|
183
|
+
pos_tensor = torch.stack(pos_list)
|
|
184
|
+
neg_tensor = torch.stack(neg_list)
|
|
185
|
+
|
|
186
|
+
result = detect_geometry_structure(pos_tensor, neg_tensor, geo_config)
|
|
187
|
+
|
|
188
|
+
linear_score = result.all_scores["linear"].score
|
|
189
|
+
linear_details = result.all_scores["linear"].details
|
|
190
|
+
|
|
191
|
+
all_results.append({
|
|
192
|
+
"extraction_strategy": strategy.value,
|
|
193
|
+
"normalize": normalize,
|
|
194
|
+
"layer": layer,
|
|
195
|
+
"linear_score": linear_score,
|
|
196
|
+
"cohens_d": linear_details.get("cohens_d", 0),
|
|
197
|
+
"variance_explained": linear_details.get("variance_explained", 0),
|
|
198
|
+
"best_structure": result.best_structure.value,
|
|
199
|
+
})
|
|
216
200
|
|
|
217
201
|
if not all_results:
|
|
218
202
|
return LinearityResult(
|
|
@@ -234,7 +218,7 @@ def check_linearity(
|
|
|
234
218
|
verdict = LinearityVerdict.LINEAR
|
|
235
219
|
recommendation = (
|
|
236
220
|
f"Use CAA (single-direction steering) on layer {best['layer']} "
|
|
237
|
-
f"with {best['
|
|
221
|
+
f"with {best['extraction_strategy']} strategy."
|
|
238
222
|
)
|
|
239
223
|
elif best["linear_score"] >= cfg.weak_threshold and best["cohens_d"] >= cfg.min_cohens_d:
|
|
240
224
|
verdict = LinearityVerdict.WEAKLY_LINEAR
|
|
@@ -254,8 +238,7 @@ def check_linearity(
|
|
|
254
238
|
verdict=verdict,
|
|
255
239
|
best_linear_score=best["linear_score"],
|
|
256
240
|
best_config={
|
|
257
|
-
"
|
|
258
|
-
"aggregation": best["aggregation"],
|
|
241
|
+
"extraction_strategy": best["extraction_strategy"],
|
|
259
242
|
"normalize": best["normalize"],
|
|
260
243
|
},
|
|
261
244
|
best_layer=best["layer"],
|
|
@@ -281,7 +281,8 @@ def _compute_pca(
|
|
|
281
281
|
|
|
282
282
|
n_components = min(5, n - 1)
|
|
283
283
|
pca = PCA(n_components=n_components)
|
|
284
|
-
|
|
284
|
+
# Convert to float32 for sklearn compatibility (BFloat16 not supported)
|
|
285
|
+
pca.fit(difference_vectors.float().numpy())
|
|
285
286
|
|
|
286
287
|
pc1_var = pca.explained_variance_ratio_[0]
|
|
287
288
|
pc2_var = pca.explained_variance_ratio_[1] if n_components > 1 else 0.0
|
|
@@ -372,7 +373,7 @@ def _compute_clustering(
|
|
|
372
373
|
try:
|
|
373
374
|
from sklearn.metrics import silhouette_score
|
|
374
375
|
|
|
375
|
-
all_activations = torch.cat([positive_activations, negative_activations], dim=0).numpy()
|
|
376
|
+
all_activations = torch.cat([positive_activations, negative_activations], dim=0).float().numpy()
|
|
376
377
|
labels = [0] * n_pos + [1] * n_neg
|
|
377
378
|
|
|
378
379
|
silhouette = silhouette_score(all_activations, labels)
|
|
@@ -436,7 +437,7 @@ def _compute_cv_classification(
|
|
|
436
437
|
from sklearn.linear_model import LogisticRegression
|
|
437
438
|
from sklearn.model_selection import cross_val_score
|
|
438
439
|
|
|
439
|
-
X = torch.cat([positive_activations, negative_activations], dim=0).numpy()
|
|
440
|
+
X = torch.cat([positive_activations, negative_activations], dim=0).float().numpy()
|
|
440
441
|
y = np.array([1] * n_pos + [0] * n_neg)
|
|
441
442
|
|
|
442
443
|
n_folds = min(config.cv_folds, min(n_pos, n_neg))
|
|
@@ -473,8 +474,8 @@ def _compute_cohens_d(
|
|
|
473
474
|
direction = direction / direction_norm
|
|
474
475
|
|
|
475
476
|
# Project all activations onto this direction
|
|
476
|
-
pos_proj = (positive_activations @ direction).numpy()
|
|
477
|
-
neg_proj = (negative_activations @ direction).numpy()
|
|
477
|
+
pos_proj = (positive_activations @ direction).float().numpy()
|
|
478
|
+
neg_proj = (negative_activations @ direction).float().numpy()
|
|
478
479
|
|
|
479
480
|
# Cohen's d = (mean1 - mean2) / pooled_std
|
|
480
481
|
mean_diff = pos_proj.mean() - neg_proj.mean()
|
|
@@ -133,14 +133,11 @@ EXTRACTORS: dict[str, str] = {
|
|
|
133
133
|
|
|
134
134
|
# Coding benchmarks
|
|
135
135
|
"humaneval": f"{base_import}humaneval:HumanEvalExtractor",
|
|
136
|
-
"
|
|
137
|
-
"
|
|
138
|
-
"humaneval_instruct": f"{base_import}
|
|
139
|
-
"
|
|
140
|
-
"
|
|
141
|
-
"mbpp": f"{base_import}mbpp:MBPPExtractor",
|
|
142
|
-
"mbpp_plus": f"{base_import}mbpp:MBPPExtractor",
|
|
143
|
-
"instruct_humaneval": f"{base_import}instructhumaneval:InstructHumanEvalExtractor",
|
|
136
|
+
"humaneval_64": f"{base_import}humaneval:HumanEval64Extractor",
|
|
137
|
+
"humaneval_plus": f"{base_import}humaneval:HumanEvalPlusExtractor",
|
|
138
|
+
"humaneval_instruct": f"{base_import}humaneval:HumanEvalInstructExtractor",
|
|
139
|
+
"humaneval_64_instruct": f"{base_import}humaneval:HumanEval64InstructExtractor",
|
|
140
|
+
"humanevalpack": f"{base_import}humanevalpack:HumanevalpackExtractor",
|
|
144
141
|
"apps": f"{base_import}apps:AppsExtractor",
|
|
145
142
|
"conala": f"{base_import}conala:ConalaExtractor",
|
|
146
143
|
"concode": f"{base_import}concode:ConcodeExtractor",
|
|
@@ -156,13 +153,6 @@ EXTRACTORS: dict[str, str] = {
|
|
|
156
153
|
"multiple_rs": f"{base_import}multipl_e:MultiplEExtractor",
|
|
157
154
|
"multiple_go": f"{base_import}multipl_e:MultiplEExtractor",
|
|
158
155
|
"codexglue": f"{base_import}codexglue:CodexglueExtractor",
|
|
159
|
-
"code_x_glue": f"{base_import}codexglue:CodexglueExtractor",
|
|
160
|
-
"codexglue_code_to_text_python": f"{base_import}codexglue:CodexglueExtractor",
|
|
161
|
-
"codexglue_code_to_text_go": f"{base_import}codexglue:CodexglueExtractor",
|
|
162
|
-
"codexglue_code_to_text_ruby": f"{base_import}codexglue:CodexglueExtractor",
|
|
163
|
-
"codexglue_code_to_text_java": f"{base_import}codexglue:CodexglueExtractor",
|
|
164
|
-
"codexglue_code_to_text_javascript": f"{base_import}codexglue:CodexglueExtractor",
|
|
165
|
-
"codexglue_code_to_text_php": f"{base_import}codexglue:CodexglueExtractor",
|
|
166
156
|
"livecodebench": f"{base_import}livecodebench:LivecodebenchExtractor",
|
|
167
157
|
|
|
168
158
|
# Reasoning benchmarks
|
|
@@ -203,7 +193,6 @@ EXTRACTORS: dict[str, str] = {
|
|
|
203
193
|
"ds1000": f"{base_import}ds1000:Ds1000Extractor",
|
|
204
194
|
"evalita_mp": f"{base_import}evalita_mp:EvalitaMpExtractor",
|
|
205
195
|
"flores": f"{base_import}flores:FloresExtractor",
|
|
206
|
-
"freebase": f"{base_import}freebase:FreebaseExtractor",
|
|
207
196
|
"humanevalpack": f"{base_import}humanevalpack:HumanevalpackExtractor",
|
|
208
197
|
"iwslt2017_ar_en": f"{base_import}iwslt2017_ar_en:Iwslt2017ArEnExtractor",
|
|
209
198
|
"iwslt2017_en_ar": f"{base_import}iwslt2017_en_ar:Iwslt2017EnArExtractor",
|
|
@@ -229,11 +218,8 @@ EXTRACTORS: dict[str, str] = {
|
|
|
229
218
|
"flan_held_in": f"{base_import}flan_held_in:FlanHeldInExtractor",
|
|
230
219
|
"gpt3_translation_benchmarks": f"{base_import}gpt3_translation_benchmarks:Gpt3TranslationBenchmarksExtractor",
|
|
231
220
|
"multiple_choice": f"{base_import}multiple_choice:MultipleChoiceExtractor",
|
|
232
|
-
"non_greedy_robustness_agieval_aqua_rat": f"{base_import}non_greedy_robustness_agieval_aqua_rat:NonGreedyRobustnessAgievalAquaRatExtractor",
|
|
233
|
-
"option_order_robustness_agieval_aqua_rat": f"{base_import}option_order_robustness_agieval_aqua_rat:OptionOrderRobustnessAgievalAquaRatExtractor",
|
|
234
221
|
"penn_treebank": f"{base_import}penn_treebank:PennTreebankExtractor",
|
|
235
222
|
"ptb": f"{base_import}penn_treebank:PennTreebankExtractor",
|
|
236
|
-
"prompt_robustness_agieval_aqua_rat": f"{base_import}prompt_robustness_agieval_aqua_rat:PromptRobustnessAgievalAquaRatExtractor",
|
|
237
223
|
"self_consistency": f"{base_import}self_consistency:SelfConsistencyExtractor",
|
|
238
224
|
"t0_eval": f"{base_import}t0_eval:T0EvalExtractor",
|
|
239
225
|
"vaxx_stance": f"{base_import}vaxx_stance:VaxxStanceExtractor",
|
|
@@ -8,12 +8,16 @@ from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.concode
|
|
|
8
8
|
from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.ds_1000 import Ds1000Extractor
|
|
9
9
|
from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.hle import HleExtractor
|
|
10
10
|
from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.hmmt import HMMTExtractor
|
|
11
|
-
from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.humaneval import
|
|
12
|
-
|
|
11
|
+
from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.humaneval import (
|
|
12
|
+
HumanEvalExtractor,
|
|
13
|
+
HumanEval64Extractor,
|
|
14
|
+
HumanEvalPlusExtractor,
|
|
15
|
+
HumanEvalInstructExtractor,
|
|
16
|
+
HumanEval64InstructExtractor,
|
|
17
|
+
)
|
|
13
18
|
from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.livecodebench import LivecodebenchExtractor
|
|
14
19
|
from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.livemathbench import LiveMathBenchExtractor
|
|
15
20
|
from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.math500 import MATH500Extractor
|
|
16
|
-
from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.mbpp import MBPPExtractor
|
|
17
21
|
from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.mercury import MercuryExtractor
|
|
18
22
|
from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.multipl_e import MultiplEExtractor
|
|
19
23
|
from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.polymath import PolyMathExtractor
|
|
@@ -30,11 +34,13 @@ __all__ = [
|
|
|
30
34
|
"HleExtractor",
|
|
31
35
|
"HMMTExtractor",
|
|
32
36
|
"HumanEvalExtractor",
|
|
33
|
-
"
|
|
37
|
+
"HumanEval64Extractor",
|
|
38
|
+
"HumanEvalPlusExtractor",
|
|
39
|
+
"HumanEvalInstructExtractor",
|
|
40
|
+
"HumanEval64InstructExtractor",
|
|
34
41
|
"LivecodebenchExtractor",
|
|
35
42
|
"LiveMathBenchExtractor",
|
|
36
43
|
"MATH500Extractor",
|
|
37
|
-
"MBPPExtractor",
|
|
38
44
|
"MercuryExtractor",
|
|
39
45
|
"MultiplEExtractor",
|
|
40
46
|
"PolyMathExtractor",
|
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import json
|
|
4
|
+
import random
|
|
5
|
+
import re
|
|
3
6
|
from typing import Any
|
|
7
|
+
|
|
4
8
|
from wisent.core.cli_logger import setup_logger
|
|
5
|
-
import json
|
|
6
9
|
|
|
7
10
|
from wisent.core.contrastive_pairs.core.pair import ContrastivePair
|
|
8
11
|
from wisent.core.contrastive_pairs.huggingface_pairs.atoms import HuggingFaceBenchmarkExtractor
|
|
@@ -88,6 +91,9 @@ class AppsExtractor(HuggingFaceBenchmarkExtractor):
|
|
|
88
91
|
log.debug(f"Could not parse solutions array: {e}")
|
|
89
92
|
return None
|
|
90
93
|
|
|
94
|
+
# Prepend common imports (APPS solutions assume LeetCode-style environment)
|
|
95
|
+
correct_answer = self._prepend_imports(correct_answer)
|
|
96
|
+
|
|
91
97
|
# Create incorrect answer (modify or corrupt)
|
|
92
98
|
incorrect_answer = self._create_incorrect_answer(correct_answer)
|
|
93
99
|
|
|
@@ -96,10 +102,11 @@ class AppsExtractor(HuggingFaceBenchmarkExtractor):
|
|
|
96
102
|
|
|
97
103
|
# Parse input_output JSON to create test code
|
|
98
104
|
test_code = None
|
|
105
|
+
entry_point = None
|
|
99
106
|
if input_output:
|
|
100
107
|
try:
|
|
101
108
|
io_data = json.loads(input_output) if isinstance(input_output, str) else input_output
|
|
102
|
-
test_code = self._build_test_code_from_io(io_data)
|
|
109
|
+
test_code, entry_point = self._build_test_code_from_io(io_data)
|
|
103
110
|
except (json.JSONDecodeError, TypeError) as e:
|
|
104
111
|
log.debug(f"Could not parse input_output: {e}")
|
|
105
112
|
|
|
@@ -107,6 +114,8 @@ class AppsExtractor(HuggingFaceBenchmarkExtractor):
|
|
|
107
114
|
"label": "apps",
|
|
108
115
|
"source": "codeparrot/apps",
|
|
109
116
|
"test_code": test_code,
|
|
117
|
+
"entry_point": entry_point,
|
|
118
|
+
"language": "python",
|
|
110
119
|
}
|
|
111
120
|
|
|
112
121
|
return self._build_pair(
|
|
@@ -120,29 +129,82 @@ class AppsExtractor(HuggingFaceBenchmarkExtractor):
|
|
|
120
129
|
log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
|
|
121
130
|
return None
|
|
122
131
|
|
|
123
|
-
|
|
132
|
+
@staticmethod
|
|
133
|
+
def _build_test_code_from_io(io_data: dict) -> tuple[str, str | None]:
|
|
124
134
|
"""Build test code from input/output data.
|
|
125
|
-
|
|
126
|
-
APPS
|
|
127
|
-
|
|
135
|
+
|
|
136
|
+
APPS has two types of problems:
|
|
137
|
+
1. stdin/stdout: No fn_name, run via subprocess
|
|
138
|
+
2. call-based: Has fn_name, import and call Solution().fn_name()
|
|
139
|
+
|
|
140
|
+
Returns:
|
|
141
|
+
Tuple of (test_code, entry_point)
|
|
128
142
|
"""
|
|
129
143
|
inputs = io_data.get("inputs", [])
|
|
130
144
|
outputs = io_data.get("outputs", [])
|
|
131
|
-
|
|
132
|
-
if not inputs or not outputs:
|
|
133
|
-
return None
|
|
145
|
+
fn_name = io_data.get("fn_name")
|
|
134
146
|
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
147
|
+
if not inputs or not outputs:
|
|
148
|
+
return None, None
|
|
149
|
+
|
|
150
|
+
if fn_name:
|
|
151
|
+
return AppsExtractor._build_call_based_test_code(inputs, outputs, fn_name)
|
|
152
|
+
else:
|
|
153
|
+
return AppsExtractor._build_stdin_test_code(inputs, outputs)
|
|
154
|
+
|
|
155
|
+
@staticmethod
|
|
156
|
+
def _build_call_based_test_code(
|
|
157
|
+
inputs: list, outputs: list, fn_name: str
|
|
158
|
+
) -> tuple[str, None]:
|
|
159
|
+
"""Build test code for call-based (LeetCode-style) problems."""
|
|
160
|
+
total = len(inputs)
|
|
161
|
+
test_code = f'''import sys
|
|
162
|
+
from solution import Solution
|
|
163
|
+
from typing import List, Optional, Dict, Tuple, Set, Any
|
|
164
|
+
|
|
165
|
+
def compare_outputs(actual, expected):
|
|
166
|
+
"""Compare outputs, handling floating point and nested structures."""
|
|
167
|
+
if isinstance(expected, float) and isinstance(actual, float):
|
|
168
|
+
return abs(actual - expected) < 1e-6
|
|
169
|
+
if isinstance(expected, list) and isinstance(actual, list):
|
|
170
|
+
if len(expected) != len(actual):
|
|
171
|
+
return False
|
|
172
|
+
return all(compare_outputs(a, e) for a, e in zip(actual, expected))
|
|
173
|
+
return actual == expected
|
|
174
|
+
|
|
175
|
+
if __name__ == '__main__':
|
|
176
|
+
sol = Solution()
|
|
177
|
+
passed = 0
|
|
178
|
+
total = {total}
|
|
179
|
+
'''
|
|
180
|
+
for i, (inp, out) in enumerate(zip(inputs, outputs)):
|
|
181
|
+
# inp is typically a list of arguments
|
|
182
|
+
if isinstance(inp, list):
|
|
183
|
+
args_repr = ", ".join(repr(arg) for arg in inp)
|
|
184
|
+
else:
|
|
185
|
+
args_repr = repr(inp)
|
|
186
|
+
test_code += f" # Test case {i+1}\n"
|
|
187
|
+
test_code += f" try:\n"
|
|
188
|
+
test_code += f" result = sol.{fn_name}({args_repr})\n"
|
|
189
|
+
test_code += f" expected = {repr(out)}\n"
|
|
190
|
+
test_code += f" if compare_outputs(result, expected):\n"
|
|
191
|
+
test_code += f" passed += 1\n"
|
|
192
|
+
test_code += f" except Exception:\n"
|
|
193
|
+
test_code += f" pass\n\n"
|
|
194
|
+
|
|
195
|
+
test_code += " print(f'PASSED:{passed}/{total}')\n"
|
|
196
|
+
test_code += " sys.exit(0 if passed == total else 1)\n"
|
|
197
|
+
return test_code, None
|
|
198
|
+
|
|
199
|
+
@staticmethod
|
|
200
|
+
def _build_stdin_test_code(inputs: list, outputs: list) -> tuple[str, None]:
|
|
201
|
+
"""Build test code for stdin/stdout style problems."""
|
|
202
|
+
total = len(inputs)
|
|
203
|
+
test_code = f'''import subprocess
|
|
138
204
|
import sys
|
|
139
205
|
|
|
140
206
|
def normalize_output(s):
|
|
141
|
-
"""Normalize output by stripping trailing whitespace from each line.
|
|
142
|
-
|
|
143
|
-
APPS dataset has inconsistent trailing whitespace in expected outputs.
|
|
144
|
-
This normalizes both actual and expected to enable fair comparison.
|
|
145
|
-
"""
|
|
207
|
+
"""Normalize output by stripping trailing whitespace from each line."""
|
|
146
208
|
lines = s.split('\\n')
|
|
147
209
|
normalized = '\\n'.join(line.rstrip() for line in lines)
|
|
148
210
|
return normalized.strip()
|
|
@@ -157,26 +219,78 @@ def run_solution(input_str):
|
|
|
157
219
|
timeout=10
|
|
158
220
|
)
|
|
159
221
|
if result.returncode != 0:
|
|
160
|
-
raise RuntimeError(f"Solution failed: {result.stderr}")
|
|
222
|
+
raise RuntimeError(f"Solution failed: {{result.stderr}}")
|
|
161
223
|
return result.stdout
|
|
162
224
|
|
|
225
|
+
if __name__ == '__main__':
|
|
226
|
+
passed = 0
|
|
227
|
+
total = {total}
|
|
163
228
|
'''
|
|
164
|
-
test_code += "if __name__ == '__main__':\n"
|
|
165
|
-
|
|
166
229
|
for i, (inp, out) in enumerate(zip(inputs, outputs)):
|
|
167
230
|
test_code += f" # Test case {i+1}\n"
|
|
168
|
-
test_code += f"
|
|
169
|
-
test_code += f"
|
|
170
|
-
test_code += f"
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
231
|
+
test_code += f" try:\n"
|
|
232
|
+
test_code += f" result = run_solution({repr(inp)})\n"
|
|
233
|
+
test_code += f" expected = {repr(out)}\n"
|
|
234
|
+
test_code += f" if normalize_output(result) == normalize_output(expected):\n"
|
|
235
|
+
test_code += f" passed += 1\n"
|
|
236
|
+
test_code += f" except Exception:\n"
|
|
237
|
+
test_code += f" pass\n\n"
|
|
238
|
+
|
|
239
|
+
test_code += " print(f'PASSED:{passed}/{total}')\n"
|
|
240
|
+
test_code += " sys.exit(0 if passed == total else 1)\n"
|
|
241
|
+
return test_code, None
|
|
242
|
+
|
|
243
|
+
# Common imports for LeetCode-style solutions
|
|
244
|
+
COMMON_IMPORTS = """\
|
|
245
|
+
from typing import List, Optional, Dict, Tuple, Set, Any
|
|
246
|
+
import collections
|
|
247
|
+
import heapq
|
|
248
|
+
import itertools
|
|
249
|
+
import functools
|
|
250
|
+
import math
|
|
251
|
+
import bisect
|
|
252
|
+
from collections import defaultdict, Counter, deque
|
|
253
|
+
"""
|
|
254
|
+
|
|
255
|
+
@staticmethod
|
|
256
|
+
def _prepend_imports(code: str) -> str:
|
|
257
|
+
"""Prepend common imports to solution code.
|
|
258
|
+
|
|
259
|
+
APPS solutions assume LeetCode-style environment where
|
|
260
|
+
List, collections, heapq, etc. are pre-imported.
|
|
261
|
+
"""
|
|
262
|
+
# Skip if code already has typing imports
|
|
263
|
+
if "from typing import" in code or "import typing" in code:
|
|
264
|
+
return code
|
|
265
|
+
return AppsExtractor.COMMON_IMPORTS + code
|
|
175
266
|
|
|
176
267
|
def _create_incorrect_answer(self, correct: str) -> str:
|
|
177
|
-
"""Create an incorrect answer by
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
268
|
+
"""Create an incorrect answer by shuffling letters in words.
|
|
269
|
+
|
|
270
|
+
This reliably breaks code by corrupting variable/function names,
|
|
271
|
+
causing NameError or SyntaxError.
|
|
272
|
+
"""
|
|
273
|
+
def shuffle_word(word: str) -> str:
|
|
274
|
+
"""Shuffle all letters in a word."""
|
|
275
|
+
if len(word) <= 2:
|
|
276
|
+
return word
|
|
277
|
+
letters = list(word)
|
|
278
|
+
random.shuffle(letters)
|
|
279
|
+
shuffled = ''.join(letters)
|
|
280
|
+
if shuffled == word:
|
|
281
|
+
return word[::-1] # Reverse if shuffle didn't change
|
|
282
|
+
return shuffled
|
|
283
|
+
|
|
284
|
+
def replace_word(match: re.Match) -> str:
|
|
285
|
+
word = match.group(0)
|
|
286
|
+
return shuffle_word(word)
|
|
287
|
+
|
|
288
|
+
# Shuffle words with 3+ characters
|
|
289
|
+
result = re.sub(r'[A-Za-z]{3,}', replace_word, correct)
|
|
290
|
+
|
|
291
|
+
# If nothing changed (all short words), append syntax error
|
|
292
|
+
if result == correct:
|
|
293
|
+
result = correct + "\n!!SYNTAX_ERROR!!"
|
|
294
|
+
|
|
295
|
+
return result
|
|
182
296
|
|
|
@@ -20,7 +20,7 @@ class CodexglueExtractor(HuggingFaceBenchmarkExtractor):
|
|
|
20
20
|
- code: str (code answer/solution)
|
|
21
21
|
"""
|
|
22
22
|
|
|
23
|
-
evaluator_name = "generation"
|
|
23
|
+
evaluator_name = "generation"
|
|
24
24
|
|
|
25
25
|
def extract_contrastive_pairs(
|
|
26
26
|
self,
|
|
@@ -82,7 +82,7 @@ class CodexglueExtractor(HuggingFaceBenchmarkExtractor):
|
|
|
82
82
|
incorrect_answer = self._create_incorrect_answer(correct_answer)
|
|
83
83
|
|
|
84
84
|
# Format the question
|
|
85
|
-
formatted_question = f"
|
|
85
|
+
formatted_question = f"{question}\n\nGenerate code based on description:"
|
|
86
86
|
|
|
87
87
|
metadata = {
|
|
88
88
|
"label": "codexglue",
|