wisent 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wisent/__init__.py +1 -1
- wisent/core/activations/__init__.py +22 -6
- wisent/core/activations/activations.py +21 -39
- wisent/core/activations/activations_collector.py +141 -373
- wisent/core/activations/classifier_inference_strategy.py +194 -0
- wisent/core/activations/core/atoms.py +8 -92
- wisent/core/activations/extraction_strategy.py +308 -0
- wisent/core/agent/diagnose/response_diagnostics.py +3 -3
- wisent/core/agent/diagnose.py +3 -3
- wisent/core/autonomous_agent.py +2 -2
- wisent/core/cli/agent/apply_steering.py +23 -27
- wisent/core/cli/agent/evaluate_response.py +18 -20
- wisent/core/cli/agent/train_classifier.py +18 -20
- wisent/core/cli/cluster_benchmarks.py +472 -0
- wisent/core/cli/create_steering_vector.py +13 -5
- wisent/core/cli/generate_vector_from_task.py +4 -0
- wisent/core/cli/get_activations.py +12 -36
- wisent/core/cli/method_optimizer.py +859 -0
- wisent/core/cli/optimize.py +44 -5
- wisent/core/cli/optimize_classification.py +5 -6
- wisent/core/cli/optimize_sample_size.py +8 -22
- wisent/core/cli/optimize_steering.py +429 -153
- wisent/core/cli/optimize_weights.py +65 -6
- wisent/core/cli/steering_method_trainer.py +5 -4
- wisent/core/cli/steering_search_space.py +20 -15
- wisent/core/cli/tasks.py +14 -43
- wisent/core/cli/train_unified_goodness.py +17 -18
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1578 -173
- wisent/core/contrastive_pairs/diagnostics/linearity.py +63 -80
- wisent/core/contrastive_pairs/diagnostics/vector_quality.py +6 -5
- wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +5 -19
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +11 -5
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +146 -32
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +2 -2
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +98 -57
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +8 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval_aqua_rat.py +129 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +11 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +47 -6
- wisent/core/evaluators/benchmark_specific/apps_evaluator.py +133 -0
- wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +6 -1
- wisent/core/evaluators/benchmark_specific/conala_evaluator.py +31 -168
- wisent/core/evaluators/custom/examples/humanization_coherent.py +89 -35
- wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +2 -20
- wisent/core/evaluators/personalization/coherence.py +46 -0
- wisent/core/hyperparameter_optimizer.py +13 -13
- wisent/core/lm_eval_harness_ground_truth.py +7 -11
- wisent/core/main.py +3 -0
- wisent/core/models/wisent_model.py +8 -7
- wisent/core/opti/methods/opti_weights.py +29 -2
- wisent/core/optuna/classifier/activation_generator.py +14 -12
- wisent/core/optuna/steering/steering_optimization.py +14 -9
- wisent/core/parser_arguments/cluster_benchmarks_parser.py +31 -0
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +20 -0
- wisent/core/parser_arguments/main_parser.py +8 -0
- wisent/core/parser_arguments/optimize_steering_parser.py +117 -10
- wisent/core/parser_arguments/optimize_weights_parser.py +6 -0
- wisent/core/parser_arguments/tasks_parser.py +7 -19
- wisent/core/steering_methods/core/atoms.py +1 -2
- wisent/core/steering_methods/methods/caa.py +1 -1
- wisent/core/steering_methods/methods/hyperplane.py +74 -0
- wisent/core/steering_methods/methods/prism.py +1 -2
- wisent/core/steering_methods/methods/pulse.py +39 -8
- wisent/core/steering_methods/methods/titan.py +59 -14
- wisent/core/steering_methods/registry.py +52 -12
- wisent/core/steering_optimizer.py +15 -15
- wisent/core/trainers/steering_trainer.py +9 -18
- wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +19 -70
- wisent/scripts/run_quality_metrics_sweep.sh +22 -27
- wisent/tests/test_aggregation_geometry.py +236 -0
- wisent/tests/test_detector_accuracy.py +163 -0
- wisent/tests/test_geometry_exhaustive.py +1202 -0
- wisent/tests/visualize_geometry.py +255 -61
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/METADATA +1 -1
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/RECORD +82 -714
- wisent/core/activations/prompt_construction_strategy.py +0 -47
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +0 -15
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +0 -64
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +0 -99
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +0 -180
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +0 -129
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +0 -142
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +0 -155
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +0 -161
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +0 -107
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +0 -155
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +0 -155
- wisent/examples/scripts/results/benchmark_descriptions.json +0 -1244
- wisent/examples/scripts/results/benchmark_evaluation_methods.json +0 -66
- wisent/examples/scripts/results/benchmark_evaluator_mapping.json +0 -2781
- wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +0 -30536
- wisent/examples/scripts/results/benchmark_evaluators_clean.json +0 -469
- wisent/examples/scripts/results/benchmark_methods_summary.json +0 -260
- wisent/examples/scripts/results/benchmark_pair_creation_methods.json +0 -66
- wisent/examples/scripts/results/benchmark_pair_totals.json +0 -269
- wisent/examples/scripts/results/benchmark_tags.json +0 -917
- wisent/examples/scripts/results/benchmark_test_summary_nov4.json +0 -71
- wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +0 -150
- wisent/examples/scripts/results/failing_benchmarks.json +0 -946
- wisent/examples/scripts/results/failing_benchmarks_list.json +0 -41
- wisent/examples/scripts/results/failing_benchmarks_test_results.json +0 -945
- wisent/examples/scripts/results/missing_benchmark_tags.json +0 -341
- wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +0 -30
- wisent/examples/scripts/results/test_20_newsgroups_pairs.json +0 -8
- wisent/examples/scripts/results/test_AraDICE_evaluation.json +0 -51
- wisent/examples/scripts/results/test_AraDICE_pairs.json +0 -14
- wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +0 -8
- wisent/examples/scripts/results/test_ArabCulture_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ArabCulture_pairs.json +0 -14
- wisent/examples/scripts/results/test_Tag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_Tag_pairs.json +0 -8
- wisent/examples/scripts/results/test_aclue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aclue_pairs.json +0 -14
- wisent/examples/scripts/results/test_acp_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +0 -14
- wisent/examples/scripts/results/test_acp_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +0 -51
- wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +0 -14
- wisent/examples/scripts/results/test_aexams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aexams_pairs.json +0 -14
- wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_ag_news_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ag_news_pairs.json +0 -8
- wisent/examples/scripts/results/test_agieval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_agieval_pairs.json +0 -14
- wisent/examples/scripts/results/test_aime2024_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime2024_pairs.json +0 -8
- wisent/examples/scripts/results/test_aime2025_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime2025_pairs.json +0 -8
- wisent/examples/scripts/results/test_aime_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime_pairs.json +0 -8
- wisent/examples/scripts/results/test_anagrams1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anagrams1_pairs.json +0 -8
- wisent/examples/scripts/results/test_anagrams2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anagrams2_pairs.json +0 -8
- wisent/examples/scripts/results/test_anli_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anli_pairs.json +0 -8
- wisent/examples/scripts/results/test_apps_evaluation.json +0 -30
- wisent/examples/scripts/results/test_apps_pairs.json +0 -8
- wisent/examples/scripts/results/test_arabic_exams_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arabic_exams_pairs.json +0 -8
- wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +0 -14
- wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +0 -14
- wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabicmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +0 -14
- wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +0 -14
- wisent/examples/scripts/results/test_arc_ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_arc_challenge_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_challenge_pairs.json +0 -8
- wisent/examples/scripts/results/test_arc_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_argument_topic_evaluation.json +0 -30
- wisent/examples/scripts/results/test_argument_topic_pairs.json +0 -8
- wisent/examples/scripts/results/test_arithmetic_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arithmetic_pairs.json +0 -14
- wisent/examples/scripts/results/test_asdiv_evaluation.json +0 -30
- wisent/examples/scripts/results/test_asdiv_pairs.json +0 -8
- wisent/examples/scripts/results/test_assin_entailment_evaluation.json +0 -30
- wisent/examples/scripts/results/test_assin_entailment_pairs.json +0 -8
- wisent/examples/scripts/results/test_atis_evaluation.json +0 -30
- wisent/examples/scripts/results/test_atis_pairs.json +0 -8
- wisent/examples/scripts/results/test_babi_evaluation.json +0 -30
- wisent/examples/scripts/results/test_babi_pairs.json +0 -8
- wisent/examples/scripts/results/test_babilong_evaluation.json +0 -30
- wisent/examples/scripts/results/test_babilong_pairs.json +0 -8
- wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +0 -8
- wisent/examples/scripts/results/test_banking77_evaluation.json +0 -30
- wisent/examples/scripts/results/test_banking77_pairs.json +0 -8
- wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basqueglue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basqueglue_pairs.json +0 -14
- wisent/examples/scripts/results/test_bbh_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bbh_pairs.json +0 -14
- wisent/examples/scripts/results/test_bbq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bbq_pairs.json +0 -8
- wisent/examples/scripts/results/test_bec2016eu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bec2016eu_pairs.json +0 -14
- wisent/examples/scripts/results/test_belebele_evaluation.json +0 -51
- wisent/examples/scripts/results/test_belebele_pairs.json +0 -14
- wisent/examples/scripts/results/test_benchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_benchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_bertaqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bertaqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bhtc_v2_pairs.json +0 -8
- wisent/examples/scripts/results/test_bigbench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bigbench_pairs.json +0 -14
- wisent/examples/scripts/results/test_blimp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_blimp_pairs.json +0 -14
- wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +0 -8
- wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +0 -8
- wisent/examples/scripts/results/test_boolq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq_pairs.json +0 -8
- wisent/examples/scripts/results/test_c4_evaluation.json +0 -30
- wisent/examples/scripts/results/test_c4_pairs.json +0 -8
- wisent/examples/scripts/results/test_cabreu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cabreu_pairs.json +0 -8
- wisent/examples/scripts/results/test_careqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_careqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_catalan_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_catalan_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_catalanqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_catalanqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_catcola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_catcola_pairs.json +0 -8
- wisent/examples/scripts/results/test_cb_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cb_pairs.json +0 -8
- wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +0 -14
- wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +0 -8
- wisent/examples/scripts/results/test_ceval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval_pairs.json +0 -14
- wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +0 -14
- wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +0 -51
- wisent/examples/scripts/results/test_chain_of_thought_pairs.json +0 -14
- wisent/examples/scripts/results/test_chartqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_chartqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +0 -30
- wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +0 -8
- wisent/examples/scripts/results/test_cmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_cmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +0 -8
- wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cocoteros_es_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +0 -8
- wisent/examples/scripts/results/test_coedit_gec_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coedit_gec_pairs.json +0 -8
- wisent/examples/scripts/results/test_cola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cola_pairs.json +0 -8
- wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_commonsense_qa_pairs.json +0 -8
- wisent/examples/scripts/results/test_conala_evaluation.json +0 -30
- wisent/examples/scripts/results/test_conala_pairs.json +0 -8
- wisent/examples/scripts/results/test_concode_evaluation.json +0 -30
- wisent/examples/scripts/results/test_concode_pairs.json +0 -8
- wisent/examples/scripts/results/test_copa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_copa_pairs.json +0 -8
- wisent/examples/scripts/results/test_copal_id_evaluation.json +0 -30
- wisent/examples/scripts/results/test_copal_id_pairs.json +0 -8
- wisent/examples/scripts/results/test_coqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_coqcat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coqcat_pairs.json +0 -8
- wisent/examples/scripts/results/test_crows_pairs_evaluation.json +0 -51
- wisent/examples/scripts/results/test_crows_pairs_pairs.json +0 -14
- wisent/examples/scripts/results/test_csatqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_csatqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_cycle_letters_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cycle_letters_pairs.json +0 -8
- wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_darija_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darija_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_darijahellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_darijammlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darijammlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +0 -30
- wisent/examples/scripts/results/test_dbpedia_14_pairs.json +0 -8
- wisent/examples/scripts/results/test_drop_evaluation.json +0 -30
- wisent/examples/scripts/results/test_drop_pairs.json +0 -8
- wisent/examples/scripts/results/test_ds1000_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ds1000_pairs.json +0 -8
- wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_egyhellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_egymmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_egymmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +0 -30
- wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +0 -8
- wisent/examples/scripts/results/test_eq_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eq_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_escola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_escola_pairs.json +0 -8
- wisent/examples/scripts/results/test_ethics_cm_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ethics_cm_pairs.json +0 -8
- wisent/examples/scripts/results/test_ethos_binary_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ethos_binary_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams_es_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_exams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_proficiency_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_reading_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_reading_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_trivia_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_trivia_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita-mp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita-mp_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_LLM_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
- wisent/examples/scripts/results/test_fda_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fda_pairs.json +0 -8
- wisent/examples/scripts/results/test_financial_tweets_evaluation.json +0 -30
- wisent/examples/scripts/results/test_financial_tweets_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_flores_evaluation.json +0 -51
- wisent/examples/scripts/results/test_flores_pairs.json +0 -14
- wisent/examples/scripts/results/test_freebase_evaluation.json +0 -30
- wisent/examples/scripts/results/test_freebase_pairs.json +0 -8
- wisent/examples/scripts/results/test_french_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_french_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_galcola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_galcola_pairs.json +0 -8
- wisent/examples/scripts/results/test_galician_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_galician_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_glianorex_evaluation.json +0 -30
- wisent/examples/scripts/results/test_glianorex_pairs.json +0 -8
- wisent/examples/scripts/results/test_global_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_global_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_gpqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_gpqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_groundcocoa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_groundcocoa_pairs.json +0 -8
- wisent/examples/scripts/results/test_gsm8k_evaluation.json +0 -30
- wisent/examples/scripts/results/test_gsm8k_pairs.json +0 -8
- wisent/examples/scripts/results/test_haerae_evaluation.json +0 -51
- wisent/examples/scripts/results/test_haerae_pairs.json +0 -14
- wisent/examples/scripts/results/test_headqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_headqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_hellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +0 -14
- wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hendrycks_math_pairs.json +0 -14
- wisent/examples/scripts/results/test_histoires_morales_evaluation.json +0 -30
- wisent/examples/scripts/results/test_histoires_morales_pairs.json +0 -8
- wisent/examples/scripts/results/test_hmmt_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +0 -8
- wisent/examples/scripts/results/test_hmmt_pairs.json +0 -8
- wisent/examples/scripts/results/test_hrm8k_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hrm8k_pairs.json +0 -14
- wisent/examples/scripts/results/test_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +0 -30
- wisent/examples/scripts/results/test_humaneval_plus_pairs.json +0 -8
- wisent/examples/scripts/results/test_ifeval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ifeval_pairs.json +0 -8
- wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +0 -51
- wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +0 -30
- wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +0 -8
- wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +0 -51
- wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +0 -14
- wisent/examples/scripts/results/test_inverse_scaling_pairs.json +0 -14
- wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +0 -8
- wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +0 -14
- wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_kbl_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +0 -14
- wisent/examples/scripts/results/test_kbl_pairs.json +0 -14
- wisent/examples/scripts/results/test_kmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_kobest_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kobest_pairs.json +0 -14
- wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +0 -8
- wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_cloze_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_openai_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_openai_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_standard_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_standard_pairs.json +0 -8
- wisent/examples/scripts/results/test_leaderboard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_leaderboard_pairs.json +0 -14
- wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +0 -51
- wisent/examples/scripts/results/test_libra/test_libra_pairs.json +0 -14
- wisent/examples/scripts/results/test_libra_evaluation.json +0 -51
- wisent/examples/scripts/results/test_libra_pairs.json +0 -14
- wisent/examples/scripts/results/test_lingoly_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lingoly_pairs.json +0 -8
- wisent/examples/scripts/results/test_livecodebench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livecodebench_pairs.json +0 -8
- wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +0 -8
- wisent/examples/scripts/results/test_llama_evaluation.json +0 -30
- wisent/examples/scripts/results/test_llama_pairs.json +0 -8
- wisent/examples/scripts/results/test_logiqa2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_logiqa2_pairs.json +0 -8
- wisent/examples/scripts/results/test_logiqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_logiqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_m_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_m_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +0 -14
- wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_mastermind_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mastermind_pairs.json +0 -14
- wisent/examples/scripts/results/test_math500_evaluation.json +0 -30
- wisent/examples/scripts/results/test_math500_pairs.json +0 -8
- wisent/examples/scripts/results/test_math_evaluation.json +0 -30
- wisent/examples/scripts/results/test_math_pairs.json +0 -8
- wisent/examples/scripts/results/test_mathqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mathqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_mbpp_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mbpp_pairs.json +0 -8
- wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mbpp_plus_pairs.json +0 -8
- wisent/examples/scripts/results/test_mc_taco_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mc_taco_pairs.json +0 -8
- wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +0 -14
- wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +0 -14
- wisent/examples/scripts/results/test_meddialog_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meddialog_pairs.json +0 -8
- wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +0 -8
- wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +0 -8
- wisent/examples/scripts/results/test_medmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_medqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_medtext_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medtext_pairs.json +0 -8
- wisent/examples/scripts/results/test_mela_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mela_pairs.json +0 -14
- wisent/examples/scripts/results/test_meqsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meqsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_mercury_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mercury_pairs.json +0 -8
- wisent/examples/scripts/results/test_metabench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_metabench_pairs.json +0 -14
- wisent/examples/scripts/results/test_mgsm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mgsm_pairs.json +0 -14
- wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mimic_repsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_minerva_math_evaluation.json +0 -51
- wisent/examples/scripts/results/test_minerva_math_pairs.json +0 -14
- wisent/examples/scripts/results/test_mlqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mlqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_pro_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_prox_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlusr_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mmlusr_pairs.json +0 -8
- wisent/examples/scripts/results/test_mmmu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmmu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mnli_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mnli_pairs.json +0 -8
- wisent/examples/scripts/results/test_model_written_evals_evaluation.json +0 -51
- wisent/examples/scripts/results/test_model_written_evals_pairs.json +0 -14
- wisent/examples/scripts/results/test_moral_stories_evaluation.json +0 -30
- wisent/examples/scripts/results/test_moral_stories_pairs.json +0 -8
- wisent/examples/scripts/results/test_mts_dialog_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mts_dialog_pairs.json +0 -8
- wisent/examples/scripts/results/test_multiblimp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_multiblimp_pairs.json +0 -14
- wisent/examples/scripts/results/test_multimedqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_multimedqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_multipl_e_evaluation.json +0 -30
- wisent/examples/scripts/results/test_multipl_e_pairs.json +0 -8
- wisent/examples/scripts/results/test_mutual_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mutual_pairs.json +0 -8
- wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_noreval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_noreval_pairs.json +0 -14
- wisent/examples/scripts/results/test_noticia_evaluation.json +0 -30
- wisent/examples/scripts/results/test_noticia_pairs.json +0 -8
- wisent/examples/scripts/results/test_nq_open_evaluation.json +0 -30
- wisent/examples/scripts/results/test_nq_open_pairs.json +0 -8
- wisent/examples/scripts/results/test_olaph_evaluation.json +0 -30
- wisent/examples/scripts/results/test_olaph_pairs.json +0 -8
- wisent/examples/scripts/results/test_openbookqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_openbookqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_openllm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_openllm_pairs.json +0 -14
- wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_paloma_evaluation.json +0 -51
- wisent/examples/scripts/results/test_paloma_pairs.json +0 -14
- wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +0 -30
- wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +0 -8
- wisent/examples/scripts/results/test_paws-x_evaluation.json +0 -51
- wisent/examples/scripts/results/test_paws-x_pairs.json +0 -14
- wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_penn_treebank_evaluation.json +0 -30
- wisent/examples/scripts/results/test_penn_treebank_pairs.json +0 -8
- wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +0 -30
- wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +0 -8
- wisent/examples/scripts/results/test_piqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_piqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_polemo2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polemo2_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_en_high_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +0 -8
- wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_portuguese_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_prost_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prost_pairs.json +0 -8
- wisent/examples/scripts/results/test_ptb_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ptb_pairs.json +0 -8
- wisent/examples/scripts/results/test_pubmedqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_pubmedqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_pythia_evaluation.json +0 -51
- wisent/examples/scripts/results/test_pythia_pairs.json +0 -14
- wisent/examples/scripts/results/test_qa4mre_evaluation.json +0 -30
- wisent/examples/scripts/results/test_qa4mre_pairs.json +0 -8
- wisent/examples/scripts/results/test_qasper_evaluation.json +0 -30
- wisent/examples/scripts/results/test_qasper_pairs.json +0 -8
- wisent/examples/scripts/results/test_race_evaluation.json +0 -30
- wisent/examples/scripts/results/test_race_pairs.json +0 -8
- wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +0 -30
- wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +0 -8
- wisent/examples/scripts/results/test_recode_evaluation.json +0 -30
- wisent/examples/scripts/results/test_recode_pairs.json +0 -8
- wisent/examples/scripts/results/test_record_evaluation.json +0 -30
- wisent/examples/scripts/results/test_record_pairs.json +0 -8
- wisent/examples/scripts/results/test_ruler_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ruler_pairs.json +0 -14
- wisent/examples/scripts/results/test_sciq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_sciq_pairs.json +0 -8
- wisent/examples/scripts/results/test_score_evaluation.json +0 -51
- wisent/examples/scripts/results/test_score_pairs.json +0 -14
- wisent/examples/scripts/results/test_self_consistency_evaluation.json +0 -30
- wisent/examples/scripts/results/test_self_consistency_pairs.json +0 -8
- wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_siqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_siqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_spanish_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_spanish_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_squad2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_squad2_pairs.json +0 -8
- wisent/examples/scripts/results/test_squadv2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_squadv2_pairs.json +0 -8
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +0 -8
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +0 -51
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +0 -14
- wisent/examples/scripts/results/test_swag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_swag_pairs.json +0 -8
- wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_tmmluplus_evaluation.json +0 -51
- wisent/examples/scripts/results/test_tmmluplus_pairs.json +0 -14
- wisent/examples/scripts/results/test_translation_evaluation.json +0 -51
- wisent/examples/scripts/results/test_translation_pairs.json +0 -14
- wisent/examples/scripts/results/test_triviaqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_triviaqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +0 -51
- wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +0 -14
- wisent/examples/scripts/results/test_truthfulqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_turkishmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_unfair_tos_evaluation.json +0 -30
- wisent/examples/scripts/results/test_unfair_tos_pairs.json +0 -8
- wisent/examples/scripts/results/test_unscramble_evaluation.json +0 -51
- wisent/examples/scripts/results/test_unscramble_pairs.json +0 -14
- wisent/examples/scripts/results/test_webqs_evaluation.json +0 -30
- wisent/examples/scripts/results/test_webqs_pairs.json +0 -8
- wisent/examples/scripts/results/test_wikitext103_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wikitext103_pairs.json +0 -8
- wisent/examples/scripts/results/test_wikitext_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wikitext_pairs.json +0 -8
- wisent/examples/scripts/results/test_winogender_evaluation.json +0 -51
- wisent/examples/scripts/results/test_winogender_pairs.json +0 -14
- wisent/examples/scripts/results/test_winogrande_evaluation.json +0 -30
- wisent/examples/scripts/results/test_winogrande_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmdp_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmdp_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_wsc273_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wsc273_pairs.json +0 -8
- wisent/examples/scripts/results/test_xcopa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xcopa_pairs.json +0 -14
- wisent/examples/scripts/results/test_xnli_eu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_xnli_eu_pairs.json +0 -8
- wisent/examples/scripts/results/test_xnli_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xnli_pairs.json +0 -14
- wisent/examples/scripts/results/test_xquad_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xquad_pairs.json +0 -14
- wisent/examples/scripts/results/test_xstorycloze_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xstorycloze_pairs.json +0 -14
- wisent/examples/scripts/results/test_xsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_xsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_xwinograd_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xwinograd_pairs.json +0 -14
- wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +0 -30
- wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +0 -8
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/WHEEL +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/entry_points.txt +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/top_level.txt +0 -0
|
@@ -1,946 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"total_failures": 235,
|
|
3
|
-
"failures": [
|
|
4
|
-
{
|
|
5
|
-
"benchmark": "math500",
|
|
6
|
-
"error": "No extractor registered for task 'math500'. Known: 20_newsgroups, aclue, acp_bench, acp_bench_hard, advanced, aexams, afrimgsm, afrimmlu, afrixnli, ag, ag_news, agieval, ai2_arc, aime, anagrams1, anagrams2, anli, anli_r1, anli_r2, anli_r3, apps, arabculture, arabic, arabic_leaderboard_complete, arabic_leaderboard_light, arabicmmlu, aradice, arc, arc_challenge, arc_easy, argument, argument_topic, arithmetic, asdiv, assin, assin_entailment, atis, babi, babilong, bangla_mmlu, banking77, basque_benc..."
|
|
7
|
-
},
|
|
8
|
-
{
|
|
9
|
-
"benchmark": "aime2024",
|
|
10
|
-
"error": "No extractor registered for task 'aime2024'. Known: 20_newsgroups, aclue, acp_bench, acp_bench_hard, advanced, aexams, afrimgsm, afrimmlu, afrixnli, ag, ag_news, agieval, ai2_arc, aime, anagrams1, anagrams2, anli, anli_r1, anli_r2, anli_r3, apps, arabculture, arabic, arabic_leaderboard_complete, arabic_leaderboard_light, arabicmmlu, aradice, arc, arc_challenge, arc_easy, argument, argument_topic, arithmetic, asdiv, assin, assin_entailment, atis, babi, babilong, bangla_mmlu, banking77, basque_ben..."
|
|
11
|
-
},
|
|
12
|
-
{
|
|
13
|
-
"benchmark": "aime2025",
|
|
14
|
-
"error": "No extractor registered for task 'aime2025'. Known: 20_newsgroups, aclue, acp_bench, acp_bench_hard, advanced, aexams, afrimgsm, afrimmlu, afrixnli, ag, ag_news, agieval, ai2_arc, aime, anagrams1, anagrams2, anli, anli_r1, anli_r2, anli_r3, apps, arabculture, arabic, arabic_leaderboard_complete, arabic_leaderboard_light, arabicmmlu, aradice, arc, arc_challenge, arc_easy, argument, argument_topic, arithmetic, asdiv, assin, assin_entailment, atis, babi, babilong, bangla_mmlu, banking77, basque_ben..."
|
|
15
|
-
},
|
|
16
|
-
{
|
|
17
|
-
"benchmark": "hmmt",
|
|
18
|
-
"error": "No extractor registered for task 'hmmt'. Known: 20_newsgroups, aclue, acp_bench, acp_bench_hard, advanced, aexams, afrimgsm, afrimmlu, afrixnli, ag, ag_news, agieval, ai2_arc, aime, anagrams1, anagrams2, anli, anli_r1, anli_r2, anli_r3, apps, arabculture, arabic, arabic_leaderboard_complete, arabic_leaderboard_light, arabicmmlu, aradice, arc, arc_challenge, arc_easy, argument, argument_topic, arithmetic, asdiv, assin, assin_entailment, atis, babi, babilong, bangla_mmlu, banking77, basque_bench, ..."
|
|
19
|
-
},
|
|
20
|
-
{
|
|
21
|
-
"benchmark": "hmmt_feb_2025",
|
|
22
|
-
"error": "No extractor registered for task 'hmmt_feb_2025'. Known: 20_newsgroups, aclue, acp_bench, acp_bench_hard, advanced, aexams, afrimgsm, afrimmlu, afrixnli, ag, ag_news, agieval, ai2_arc, aime, anagrams1, anagrams2, anli, anli_r1, anli_r2, anli_r3, apps, arabculture, arabic, arabic_leaderboard_complete, arabic_leaderboard_light, arabicmmlu, aradice, arc, arc_challenge, arc_easy, argument, argument_topic, arithmetic, asdiv, assin, assin_entailment, atis, babi, babilong, bangla_mmlu, banking77, basqu..."
|
|
23
|
-
},
|
|
24
|
-
{
|
|
25
|
-
"benchmark": "polymath_en_medium",
|
|
26
|
-
"error": "No extractor registered for task 'polymath_en_medium'. Known: 20_newsgroups, aclue, acp_bench, acp_bench_hard, advanced, aexams, afrimgsm, afrimmlu, afrixnli, ag, ag_news, agieval, ai2_arc, aime, anagrams1, anagrams2, anli, anli_r1, anli_r2, anli_r3, apps, arabculture, arabic, arabic_leaderboard_complete, arabic_leaderboard_light, arabicmmlu, aradice, arc, arc_challenge, arc_easy, argument, argument_topic, arithmetic, asdiv, assin, assin_entailment, atis, babi, babilong, bangla_mmlu, banking77, ..."
|
|
27
|
-
},
|
|
28
|
-
{
|
|
29
|
-
"benchmark": "polymath_zh_medium",
|
|
30
|
-
"error": "No extractor registered for task 'polymath_zh_medium'. Known: 20_newsgroups, aclue, acp_bench, acp_bench_hard, advanced, aexams, afrimgsm, afrimmlu, afrixnli, ag, ag_news, agieval, ai2_arc, aime, anagrams1, anagrams2, anli, anli_r1, anli_r2, anli_r3, apps, arabculture, arabic, arabic_leaderboard_complete, arabic_leaderboard_light, arabicmmlu, aradice, arc, arc_challenge, arc_easy, argument, argument_topic, arithmetic, asdiv, assin, assin_entailment, atis, babi, babilong, bangla_mmlu, banking77, ..."
|
|
31
|
-
},
|
|
32
|
-
{
|
|
33
|
-
"benchmark": "polymath_en_high",
|
|
34
|
-
"error": "No extractor registered for task 'polymath_en_high'. Known: 20_newsgroups, aclue, acp_bench, acp_bench_hard, advanced, aexams, afrimgsm, afrimmlu, afrixnli, ag, ag_news, agieval, ai2_arc, aime, anagrams1, anagrams2, anli, anli_r1, anli_r2, anli_r3, apps, arabculture, arabic, arabic_leaderboard_complete, arabic_leaderboard_light, arabicmmlu, aradice, arc, arc_challenge, arc_easy, argument, argument_topic, arithmetic, asdiv, assin, assin_entailment, atis, babi, babilong, bangla_mmlu, banking77, ba..."
|
|
35
|
-
},
|
|
36
|
-
{
|
|
37
|
-
"benchmark": "polymath_zh_high",
|
|
38
|
-
"error": "No extractor registered for task 'polymath_zh_high'. Known: 20_newsgroups, aclue, acp_bench, acp_bench_hard, advanced, aexams, afrimgsm, afrimmlu, afrixnli, ag, ag_news, agieval, ai2_arc, aime, anagrams1, anagrams2, anli, anli_r1, anli_r2, anli_r3, apps, arabculture, arabic, arabic_leaderboard_complete, arabic_leaderboard_light, arabicmmlu, aradice, arc, arc_challenge, arc_easy, argument, argument_topic, arithmetic, asdiv, assin, assin_entailment, atis, babi, babilong, bangla_mmlu, banking77, ba..."
|
|
39
|
-
},
|
|
40
|
-
{
|
|
41
|
-
"benchmark": "livemathbench_cnmo_en",
|
|
42
|
-
"error": "No extractor registered for task 'livemathbench_cnmo_en'. Known: 20_newsgroups, aclue, acp_bench, acp_bench_hard, advanced, aexams, afrimgsm, afrimmlu, afrixnli, ag, ag_news, agieval, ai2_arc, aime, anagrams1, anagrams2, anli, anli_r1, anli_r2, anli_r3, apps, arabculture, arabic, arabic_leaderboard_complete, arabic_leaderboard_light, arabicmmlu, aradice, arc, arc_challenge, arc_easy, argument, argument_topic, arithmetic, asdiv, assin, assin_entailment, atis, babi, babilong, bangla_mmlu, banking7..."
|
|
43
|
-
},
|
|
44
|
-
{
|
|
45
|
-
"benchmark": "livemathbench_cnmo_zh",
|
|
46
|
-
"error": "Failed to extract pairs from 'livemathbench_cnmo_zh': Failed to load HuggingFace dataset 'opencompass/LiveMathBench'. Arguments were: config='v202412_CNMO_cn', split='test'. Underlying error: Feature type 'List' not found. Available feature types: ['Value', 'ClassLabel', 'Translation', 'TranslationVariableLanguages', 'LargeList', 'Sequence', 'Array2D', 'Array3D', 'Array4D', 'Array5D', 'Audio', 'Image', 'Video', 'Pdf']"
|
|
47
|
-
},
|
|
48
|
-
{
|
|
49
|
-
"benchmark": "instruct_humaneval",
|
|
50
|
-
"error": "No extractor registered for task 'instruct_humaneval'. Known: 20_newsgroups, aclue, acp_bench, acp_bench_hard, advanced, aexams, afrimgsm, afrimmlu, afrixnli, ag, ag_news, agieval, ai2_arc, aime, anagrams1, anagrams2, anli, anli_r1, anli_r2, anli_r3, apps, arabculture, arabic, arabic_leaderboard_complete, arabic_leaderboard_light, arabicmmlu, aradice, arc, arc_challenge, arc_easy, argument, argument_topic, arithmetic, asdiv, assin, assin_entailment, atis, babi, babilong, bangla_mmlu, banking77, ..."
|
|
51
|
-
},
|
|
52
|
-
{
|
|
53
|
-
"benchmark": "recode",
|
|
54
|
-
"error": "Failed to extract pairs from 'recode': Failed to load HuggingFace dataset 'code_x_glue_tc_nl_code_search_adv'. Arguments were: config='default', split='train'. Underlying error: Feature type 'List' not found. Available feature types: ['Value', 'ClassLabel', 'Translation', 'TranslationVariableLanguages', 'LargeList', 'Sequence', 'Array2D', 'Array3D', 'Array4D', 'Array5D', 'Audio', 'Image', 'Video', 'Pdf']"
|
|
55
|
-
},
|
|
56
|
-
{
|
|
57
|
-
"benchmark": "bbh",
|
|
58
|
-
"error": "(MaxRetryError('HTTPSConnectionPool(host=\\'huggingface.co\\', port=443): Max retries exceeded with url: /api/datasets/SaylorTwift/bbh/tree/b5306be6f827cfafbb545ff5a51f96916029b0fd/snarks?recursive=True&expand=False (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x358b1cf10>: Failed to resolve \\'huggingface.co\\' ([Errno 8] nodename nor servname provided, or not known)\"))'), '(Request ID: 191b7b73-810f-41c6-88b6-3206e1aefc69)')"
|
|
59
|
-
},
|
|
60
|
-
{
|
|
61
|
-
"benchmark": "siqa",
|
|
62
|
-
"error": "'siqa'"
|
|
63
|
-
},
|
|
64
|
-
{
|
|
65
|
-
"benchmark": "AraDICE",
|
|
66
|
-
"error": "'AraDICE'"
|
|
67
|
-
},
|
|
68
|
-
{
|
|
69
|
-
"benchmark": "ArabCulture",
|
|
70
|
-
"error": "'ArabCulture'"
|
|
71
|
-
},
|
|
72
|
-
{
|
|
73
|
-
"benchmark": "darija_bench",
|
|
74
|
-
"error": "'darija_bench'"
|
|
75
|
-
},
|
|
76
|
-
{
|
|
77
|
-
"benchmark": "ceval",
|
|
78
|
-
"error": "'ceval'"
|
|
79
|
-
},
|
|
80
|
-
{
|
|
81
|
-
"benchmark": "turkishmmlu",
|
|
82
|
-
"error": "Feature type 'List' not found. Available feature types: ['Value', 'ClassLabel', 'Translation', 'TranslationVariableLanguages', 'LargeList', 'Sequence', 'Array2D', 'Array3D', 'Array4D', 'Array5D', 'Audio', 'Image', 'Video', 'Pdf']"
|
|
83
|
-
},
|
|
84
|
-
{
|
|
85
|
-
"benchmark": "bangla_mmlu",
|
|
86
|
-
"error": "'bangla_mmlu'"
|
|
87
|
-
},
|
|
88
|
-
{
|
|
89
|
-
"benchmark": "japanese_leaderboard",
|
|
90
|
-
"error": "`fugashi` is required for `japanese_leaderboard`, please install `fugashi` via pip install lm_eval[japanese_leaderboard] or pip install -e .[japanese_leaderboard]"
|
|
91
|
-
},
|
|
92
|
-
{
|
|
93
|
-
"benchmark": "kormedmcqa",
|
|
94
|
-
"error": "No pairs could be loaded from any subtask of 'kormedmcqa'"
|
|
95
|
-
},
|
|
96
|
-
{
|
|
97
|
-
"benchmark": "basqueglue",
|
|
98
|
-
"error": "'basqueglue'"
|
|
99
|
-
},
|
|
100
|
-
{
|
|
101
|
-
"benchmark": "eus_exams",
|
|
102
|
-
"error": "'eus_exams'"
|
|
103
|
-
},
|
|
104
|
-
{
|
|
105
|
-
"benchmark": "catalan_bench",
|
|
106
|
-
"error": "(MaxRetryError('HTTPSConnectionPool(host=\\'huggingface.co\\', port=443): Max retries exceeded with url: /api/datasets/gplsi/cocoteros_va/revision/652b16cb3cd15c8a695e39ffdd483b552f233319 (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x41d1da6b0>: Failed to resolve \\'huggingface.co\\' ([Errno 8] nodename nor servname provided, or not known)\"))'), '(Request ID: 98ddb951-e4e5-48d9-8c75-a69a29fbdcd3)')"
|
|
107
|
-
},
|
|
108
|
-
{
|
|
109
|
-
"benchmark": "french_bench",
|
|
110
|
-
"error": "Feature type 'List' not found. Available feature types: ['Value', 'ClassLabel', 'Translation', 'TranslationVariableLanguages', 'LargeList', 'Sequence', 'Array2D', 'Array3D', 'Array4D', 'Array5D', 'Audio', 'Image', 'Video', 'Pdf']"
|
|
111
|
-
},
|
|
112
|
-
{
|
|
113
|
-
"benchmark": "icelandic_winogrande",
|
|
114
|
-
"error": "'icelandic_winogrande'"
|
|
115
|
-
},
|
|
116
|
-
{
|
|
117
|
-
"benchmark": "global_mmlu",
|
|
118
|
-
"error": "'global_mmlu'"
|
|
119
|
-
},
|
|
120
|
-
{
|
|
121
|
-
"benchmark": "global_piqa",
|
|
122
|
-
"error": "'global_piqa'"
|
|
123
|
-
},
|
|
124
|
-
{
|
|
125
|
-
"benchmark": "belebele",
|
|
126
|
-
"error": "No pairs could be loaded from any subtask of 'belebele'"
|
|
127
|
-
},
|
|
128
|
-
{
|
|
129
|
-
"benchmark": "mlqa",
|
|
130
|
-
"error": "'mlqa'"
|
|
131
|
-
},
|
|
132
|
-
{
|
|
133
|
-
"benchmark": "xquad",
|
|
134
|
-
"error": "No pairs could be loaded from any subtask of 'xquad'"
|
|
135
|
-
},
|
|
136
|
-
{
|
|
137
|
-
"benchmark": "xcopa",
|
|
138
|
-
"error": "No pairs could be loaded from any subtask of 'xcopa'"
|
|
139
|
-
},
|
|
140
|
-
{
|
|
141
|
-
"benchmark": "okapi/arc_multilingual",
|
|
142
|
-
"error": "'okapi/arc_multilingual'"
|
|
143
|
-
},
|
|
144
|
-
{
|
|
145
|
-
"benchmark": "okapi/hellaswag_multilingual",
|
|
146
|
-
"error": "'okapi/hellaswag_multilingual'"
|
|
147
|
-
},
|
|
148
|
-
{
|
|
149
|
-
"benchmark": "okapi/mmlu_multilingual",
|
|
150
|
-
"error": "'okapi/mmlu_multilingual'"
|
|
151
|
-
},
|
|
152
|
-
{
|
|
153
|
-
"benchmark": "okapi/truthfulqa_multilingual",
|
|
154
|
-
"error": "'okapi/truthfulqa_multilingual'"
|
|
155
|
-
},
|
|
156
|
-
{
|
|
157
|
-
"benchmark": "toxigen",
|
|
158
|
-
"error": "One of the splits is empty after splitting."
|
|
159
|
-
},
|
|
160
|
-
{
|
|
161
|
-
"benchmark": "moral_stories",
|
|
162
|
-
"error": "One of the splits is empty after splitting."
|
|
163
|
-
},
|
|
164
|
-
{
|
|
165
|
-
"benchmark": "realtoxicityprompts",
|
|
166
|
-
"error": "One of the splits is empty after splitting."
|
|
167
|
-
},
|
|
168
|
-
{
|
|
169
|
-
"benchmark": "discrim_eval",
|
|
170
|
-
"error": "'discrim_eval'"
|
|
171
|
-
},
|
|
172
|
-
{
|
|
173
|
-
"benchmark": "esbbq",
|
|
174
|
-
"error": "'esbbq'"
|
|
175
|
-
},
|
|
176
|
-
{
|
|
177
|
-
"benchmark": "cabbq",
|
|
178
|
-
"error": "'cabbq'"
|
|
179
|
-
},
|
|
180
|
-
{
|
|
181
|
-
"benchmark": "simple_cooccurrence_bias",
|
|
182
|
-
"error": "One of the splits is empty after splitting."
|
|
183
|
-
},
|
|
184
|
-
{
|
|
185
|
-
"benchmark": "winogender",
|
|
186
|
-
"error": "No pairs could be loaded from any subtask of 'winogender'"
|
|
187
|
-
},
|
|
188
|
-
{
|
|
189
|
-
"benchmark": "medmcqa",
|
|
190
|
-
"error": "One of the splits is empty after splitting."
|
|
191
|
-
},
|
|
192
|
-
{
|
|
193
|
-
"benchmark": "med_concepts_qa",
|
|
194
|
-
"error": "No pairs could be loaded from any subtask of 'med_concepts_qa'"
|
|
195
|
-
},
|
|
196
|
-
{
|
|
197
|
-
"benchmark": "meddialog",
|
|
198
|
-
"error": "'meddialog'"
|
|
199
|
-
},
|
|
200
|
-
{
|
|
201
|
-
"benchmark": "mediqa_qa2019",
|
|
202
|
-
"error": "Error loading evaluation metrics: Couldn't find a module script at /Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/examples/scripts/bleurt/bleurt.py. Module 'bleurt' doesn't exist on the Hugging Face Hub either.. Please check your installation."
|
|
203
|
-
},
|
|
204
|
-
{
|
|
205
|
-
"benchmark": "medtext",
|
|
206
|
-
"error": "Error loading evaluation metrics: Couldn't find a module script at /Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/examples/scripts/bleurt/bleurt.py. Module 'bleurt' doesn't exist on the Hugging Face Hub either.. Please check your installation."
|
|
207
|
-
},
|
|
208
|
-
{
|
|
209
|
-
"benchmark": "meqsum",
|
|
210
|
-
"error": "Error loading evaluation metrics: Couldn't find a module script at /Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/examples/scripts/bleurt/bleurt.py. Module 'bleurt' doesn't exist on the Hugging Face Hub either.. Please check your installation."
|
|
211
|
-
},
|
|
212
|
-
{
|
|
213
|
-
"benchmark": "mimic_repsum",
|
|
214
|
-
"error": "Please install evaluation metrics via pip install evaluate bert-score rouge_score>=0.1.2 nltk absl-py radgraphgit+https://github.com/google-research/bleurt.git"
|
|
215
|
-
},
|
|
216
|
-
{
|
|
217
|
-
"benchmark": "minerva_math",
|
|
218
|
-
"error": "Unsupported ANTLR version 4.7.2, only 4.9.3, 4.11.0, and 4.13.2 runtime versions are supported."
|
|
219
|
-
},
|
|
220
|
-
{
|
|
221
|
-
"benchmark": "mgsm",
|
|
222
|
-
"error": "'mgsm'"
|
|
223
|
-
},
|
|
224
|
-
{
|
|
225
|
-
"benchmark": "hrm8k",
|
|
226
|
-
"error": "No pairs could be loaded from any subtask of 'hrm8k'"
|
|
227
|
-
},
|
|
228
|
-
{
|
|
229
|
-
"benchmark": "babi",
|
|
230
|
-
"error": "'valid'"
|
|
231
|
-
},
|
|
232
|
-
{
|
|
233
|
-
"benchmark": "babilong",
|
|
234
|
-
"error": "'babilong'"
|
|
235
|
-
},
|
|
236
|
-
{
|
|
237
|
-
"benchmark": "longbench",
|
|
238
|
-
"error": "(MaxRetryError('HTTPSConnectionPool(host=\\'huggingface.co\\', port=443): Max retries exceeded with url: /api/datasets/THUDM/LongBench/revision/5e628be450b7e67fb7ae6e201bd6d8f7056f7672 (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x3741478b0>: Failed to resolve \\'huggingface.co\\' ([Errno 8] nodename nor servname provided, or not known)\"))'), '(Request ID: 0435569f-7b06-41dd-866e-d040c34fee01)')"
|
|
239
|
-
},
|
|
240
|
-
{
|
|
241
|
-
"benchmark": "longbenchv2",
|
|
242
|
-
"error": "'longbenchv2'"
|
|
243
|
-
},
|
|
244
|
-
{
|
|
245
|
-
"benchmark": "ruler",
|
|
246
|
-
"error": "HTTPConnectionPool(host='curtis.ml.cmu.edu', port=80): Max retries exceeded with url: /datasets/hotpot/hotpot_dev_distractor_v1.json (Caused by NameResolutionError(\"<urllib3.connection.HTTPConnection object at 0x364c5c7f0>: Failed to resolve 'curtis.ml.cmu.edu' ([Errno 8] nodename nor servname provided, or not known)\"))"
|
|
247
|
-
},
|
|
248
|
-
{
|
|
249
|
-
"benchmark": "scrolls",
|
|
250
|
-
"error": "'scrolls'"
|
|
251
|
-
},
|
|
252
|
-
{
|
|
253
|
-
"benchmark": "inverse_scaling",
|
|
254
|
-
"error": "'inverse_scaling'"
|
|
255
|
-
},
|
|
256
|
-
{
|
|
257
|
-
"benchmark": "storycloze",
|
|
258
|
-
"error": "The dataset story_cloze with config 2016 requires manual data."
|
|
259
|
-
},
|
|
260
|
-
{
|
|
261
|
-
"benchmark": "histoires_morales",
|
|
262
|
-
"error": "One of the splits is empty after splitting."
|
|
263
|
-
},
|
|
264
|
-
{
|
|
265
|
-
"benchmark": "groundcocoa",
|
|
266
|
-
"error": "(MaxRetryError('HTTPSConnectionPool(host=\\'huggingface.co\\', port=443): Max retries exceeded with url: /api/datasets/harsh147/GroundCocoa/revision/2f4bc9a10c17062001870e2e2d0518cb4fc9740e (Caused by NameResolutionError(\"<urllib3.connection.HTTPSConnection object at 0x3840a3c10>: Failed to resolve \\'huggingface.co\\' ([Errno 8] nodename nor servname provided, or not known)\"))'), '(Request ID: 175bd0e6-809e-459e-ba1a-9ba9408b36e2)')"
|
|
267
|
-
},
|
|
268
|
-
{
|
|
269
|
-
"benchmark": "blimp",
|
|
270
|
-
"error": "(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: df2181ba-3259-4ebc-ac3d-cb2fd6a67f01)')"
|
|
271
|
-
},
|
|
272
|
-
{
|
|
273
|
-
"benchmark": "blimp_nl",
|
|
274
|
-
"error": "'blimp_nl'"
|
|
275
|
-
},
|
|
276
|
-
{
|
|
277
|
-
"benchmark": "multiblimp",
|
|
278
|
-
"error": "(ReadTimeoutError(\"HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)\"), '(Request ID: 1808dfbf-bac5-491c-a884-e993184ee3ac)')"
|
|
279
|
-
},
|
|
280
|
-
{
|
|
281
|
-
"benchmark": "turblimp_core",
|
|
282
|
-
"error": "'turblimp_core'"
|
|
283
|
-
},
|
|
284
|
-
{
|
|
285
|
-
"benchmark": "zhoblimp",
|
|
286
|
-
"error": "'zhoblimp'"
|
|
287
|
-
},
|
|
288
|
-
{
|
|
289
|
-
"benchmark": "lambada_multilingual_stablelm",
|
|
290
|
-
"error": "'lambada_multilingual_stablelm'"
|
|
291
|
-
},
|
|
292
|
-
{
|
|
293
|
-
"benchmark": "paws-x",
|
|
294
|
-
"error": "'paws-x'"
|
|
295
|
-
},
|
|
296
|
-
{
|
|
297
|
-
"benchmark": "translation",
|
|
298
|
-
"error": "(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 8c72b72b-e44f-4764-bb98-d2bb995dcf35)')"
|
|
299
|
-
},
|
|
300
|
-
{
|
|
301
|
-
"benchmark": "wmt2016",
|
|
302
|
-
"error": "'wmt2016'"
|
|
303
|
-
},
|
|
304
|
-
{
|
|
305
|
-
"benchmark": "code_x_glue",
|
|
306
|
-
"error": "'code_x_glue'"
|
|
307
|
-
},
|
|
308
|
-
{
|
|
309
|
-
"benchmark": "humaneval_infilling",
|
|
310
|
-
"error": "'humaneval_infilling'"
|
|
311
|
-
},
|
|
312
|
-
{
|
|
313
|
-
"benchmark": "bertaqa",
|
|
314
|
-
"error": "Feature type 'List' not found. Available feature types: ['Value', 'ClassLabel', 'Translation', 'TranslationVariableLanguages', 'LargeList', 'Sequence', 'Array2D', 'Array3D', 'Array4D', 'Array5D', 'Audio', 'Image', 'Video', 'Pdf']"
|
|
315
|
-
},
|
|
316
|
-
{
|
|
317
|
-
"benchmark": "careqa",
|
|
318
|
-
"error": "'careqa'"
|
|
319
|
-
},
|
|
320
|
-
{
|
|
321
|
-
"benchmark": "acp_bench",
|
|
322
|
-
"error": "No pairs could be loaded from any subtask of 'acp_bench'"
|
|
323
|
-
},
|
|
324
|
-
{
|
|
325
|
-
"benchmark": "acp_bench_hard",
|
|
326
|
-
"error": "No pairs could be loaded from any subtask of 'acp_bench_hard'"
|
|
327
|
-
},
|
|
328
|
-
{
|
|
329
|
-
"benchmark": "aexams",
|
|
330
|
-
"error": "Some evaluations incorrect"
|
|
331
|
-
},
|
|
332
|
-
{
|
|
333
|
-
"benchmark": "benchmarks",
|
|
334
|
-
"error": "'benchmarks'"
|
|
335
|
-
},
|
|
336
|
-
{
|
|
337
|
-
"benchmark": "bhs",
|
|
338
|
-
"error": "'bhs'"
|
|
339
|
-
},
|
|
340
|
-
{
|
|
341
|
-
"benchmark": "c4",
|
|
342
|
-
"error": "One of the splits is empty after splitting."
|
|
343
|
-
},
|
|
344
|
-
{
|
|
345
|
-
"benchmark": "chartqa",
|
|
346
|
-
"error": "One of the splits is empty after splitting."
|
|
347
|
-
},
|
|
348
|
-
{
|
|
349
|
-
"benchmark": "click",
|
|
350
|
-
"error": "'click'"
|
|
351
|
-
},
|
|
352
|
-
{
|
|
353
|
-
"benchmark": "eq-bench_ca",
|
|
354
|
-
"error": "'eq-bench_ca'"
|
|
355
|
-
},
|
|
356
|
-
{
|
|
357
|
-
"benchmark": "eq-bench_es",
|
|
358
|
-
"error": "'eq-bench_es'"
|
|
359
|
-
},
|
|
360
|
-
{
|
|
361
|
-
"benchmark": "evalita_LLM",
|
|
362
|
-
"error": "'evalita_LLM'"
|
|
363
|
-
},
|
|
364
|
-
{
|
|
365
|
-
"benchmark": "fda",
|
|
366
|
-
"error": "Some evaluations incorrect"
|
|
367
|
-
},
|
|
368
|
-
{
|
|
369
|
-
"benchmark": "fld",
|
|
370
|
-
"error": "'fld'"
|
|
371
|
-
},
|
|
372
|
-
{
|
|
373
|
-
"benchmark": "jsonschema_bench",
|
|
374
|
-
"error": "No pairs could be loaded from any subtask of 'jsonschema_bench'"
|
|
375
|
-
},
|
|
376
|
-
{
|
|
377
|
-
"benchmark": "kbl",
|
|
378
|
-
"error": "No pairs could be loaded from any subtask of 'kbl'"
|
|
379
|
-
},
|
|
380
|
-
{
|
|
381
|
-
"benchmark": "leaderboard",
|
|
382
|
-
"error": "Unsupported ANTLR version 4.7.2, only 4.9.3, 4.11.0, and 4.13.2 runtime versions are supported."
|
|
383
|
-
},
|
|
384
|
-
{
|
|
385
|
-
"benchmark": "libra",
|
|
386
|
-
"error": "No pairs could be loaded from any subtask of 'libra'"
|
|
387
|
-
},
|
|
388
|
-
{
|
|
389
|
-
"benchmark": "lingoly",
|
|
390
|
-
"error": "Dataset 'ambean/lingOly' is a gated dataset on the Hub. Visit the dataset page at https://huggingface.co/datasets/ambean/lingOly to ask for access."
|
|
391
|
-
},
|
|
392
|
-
{
|
|
393
|
-
"benchmark": "llama3",
|
|
394
|
-
"error": "'llama3'"
|
|
395
|
-
},
|
|
396
|
-
{
|
|
397
|
-
"benchmark": "lm_syneval",
|
|
398
|
-
"error": "'lm_syneval'"
|
|
399
|
-
},
|
|
400
|
-
{
|
|
401
|
-
"benchmark": "mastermind",
|
|
402
|
-
"error": "No pairs could be loaded from any subtask of 'mastermind'"
|
|
403
|
-
},
|
|
404
|
-
{
|
|
405
|
-
"benchmark": "mmlusr",
|
|
406
|
-
"error": "BuilderConfig 'question_and_answer_philosophy' not found. Available: ['answer_only', 'question_only', 'question_and_answer']"
|
|
407
|
-
},
|
|
408
|
-
{
|
|
409
|
-
"benchmark": "mmmu",
|
|
410
|
-
"error": "'mmmu'"
|
|
411
|
-
},
|
|
412
|
-
{
|
|
413
|
-
"benchmark": "model_written_evals",
|
|
414
|
-
"error": "'model_written_evals'"
|
|
415
|
-
},
|
|
416
|
-
{
|
|
417
|
-
"benchmark": "mts_dialog",
|
|
418
|
-
"error": "Please install evaluation metrics via pip install evaluate bert-score rouge_score>=0.1.2 nltk absl-py git+https://github.com/google-research/bleurt.git"
|
|
419
|
-
},
|
|
420
|
-
{
|
|
421
|
-
"benchmark": "multiblimp",
|
|
422
|
-
"error": "Couldn't find cache for jumelet/multiblimp for config 'gle'"
|
|
423
|
-
},
|
|
424
|
-
{
|
|
425
|
-
"benchmark": "noreval",
|
|
426
|
-
"error": "'noreval'"
|
|
427
|
-
},
|
|
428
|
-
{
|
|
429
|
-
"benchmark": "olaph",
|
|
430
|
-
"error": "Please install evaluation metrics via pip install evaluate bert-score rouge_score>=0.1.2 nltk absl-py git+https://github.com/google-research/bleurt.git"
|
|
431
|
-
},
|
|
432
|
-
{
|
|
433
|
-
"benchmark": "paloma",
|
|
434
|
-
"error": "Dataset 'allenai/paloma' is a gated dataset on the Hub. Visit the dataset page at https://huggingface.co/datasets/allenai/paloma to ask for access."
|
|
435
|
-
},
|
|
436
|
-
{
|
|
437
|
-
"benchmark": "pile",
|
|
438
|
-
"error": "'pile'"
|
|
439
|
-
},
|
|
440
|
-
{
|
|
441
|
-
"benchmark": "pile_10k",
|
|
442
|
-
"error": "One of the splits is empty after splitting."
|
|
443
|
-
},
|
|
444
|
-
{
|
|
445
|
-
"benchmark": "polemo2",
|
|
446
|
-
"error": "No pairs could be loaded from any subtask of 'polemo2'"
|
|
447
|
-
},
|
|
448
|
-
{
|
|
449
|
-
"benchmark": "score",
|
|
450
|
-
"error": "'score'"
|
|
451
|
-
},
|
|
452
|
-
{
|
|
453
|
-
"benchmark": "squad_completion",
|
|
454
|
-
"error": "One of the splits is empty after splitting."
|
|
455
|
-
},
|
|
456
|
-
{
|
|
457
|
-
"benchmark": "super_glue",
|
|
458
|
-
"error": "'super_glue'"
|
|
459
|
-
},
|
|
460
|
-
{
|
|
461
|
-
"benchmark": "swde",
|
|
462
|
-
"error": "One of the splits is empty after splitting."
|
|
463
|
-
},
|
|
464
|
-
{
|
|
465
|
-
"benchmark": "tinyBenchmarks",
|
|
466
|
-
"error": "`tinyBenchmarks` is required for tinyBenchmarks task metric calculation, install via `pip install git+https://github.com/felipemaiapolo/tinyBenchmarks`"
|
|
467
|
-
},
|
|
468
|
-
{
|
|
469
|
-
"benchmark": "truthfulqa-multi",
|
|
470
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
471
|
-
},
|
|
472
|
-
{
|
|
473
|
-
"benchmark": "unitxt",
|
|
474
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
475
|
-
},
|
|
476
|
-
{
|
|
477
|
-
"benchmark": "unscramble",
|
|
478
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
479
|
-
},
|
|
480
|
-
{
|
|
481
|
-
"benchmark": "wmdp",
|
|
482
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
483
|
-
},
|
|
484
|
-
{
|
|
485
|
-
"benchmark": "wsc273",
|
|
486
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
487
|
-
},
|
|
488
|
-
{
|
|
489
|
-
"benchmark": "global_mmlu_ar",
|
|
490
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
491
|
-
},
|
|
492
|
-
{
|
|
493
|
-
"benchmark": "arabic_exams",
|
|
494
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
495
|
-
},
|
|
496
|
-
{
|
|
497
|
-
"benchmark": "persona",
|
|
498
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
499
|
-
},
|
|
500
|
-
{
|
|
501
|
-
"benchmark": "afrixnli_en_direct_amh",
|
|
502
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
503
|
-
},
|
|
504
|
-
{
|
|
505
|
-
"benchmark": "evalita_mp",
|
|
506
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
507
|
-
},
|
|
508
|
-
{
|
|
509
|
-
"benchmark": "truthfulqa",
|
|
510
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
511
|
-
},
|
|
512
|
-
{
|
|
513
|
-
"benchmark": "eus_exams_es",
|
|
514
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
515
|
-
},
|
|
516
|
-
{
|
|
517
|
-
"benchmark": "flores",
|
|
518
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
519
|
-
},
|
|
520
|
-
{
|
|
521
|
-
"benchmark": "afrimgsm_direct_amh",
|
|
522
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
523
|
-
},
|
|
524
|
-
{
|
|
525
|
-
"benchmark": "ceval_valid",
|
|
526
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
527
|
-
},
|
|
528
|
-
{
|
|
529
|
-
"benchmark": "advanced_ai_risk",
|
|
530
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
531
|
-
},
|
|
532
|
-
{
|
|
533
|
-
"benchmark": "tmlu",
|
|
534
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
535
|
-
},
|
|
536
|
-
{
|
|
537
|
-
"benchmark": "arc_ar",
|
|
538
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
539
|
-
},
|
|
540
|
-
{
|
|
541
|
-
"benchmark": "afrimmlu_direct_amh",
|
|
542
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
543
|
-
},
|
|
544
|
-
{
|
|
545
|
-
"benchmark": "m_mmlu",
|
|
546
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
547
|
-
},
|
|
548
|
-
{
|
|
549
|
-
"benchmark": "non_greedy_robustness_agieval_aqua_rat",
|
|
550
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
551
|
-
},
|
|
552
|
-
{
|
|
553
|
-
"benchmark": "prompt_robustness_agieval_aqua_rat",
|
|
554
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
555
|
-
},
|
|
556
|
-
{
|
|
557
|
-
"benchmark": "inverse_scaling_hindsight_neglect_10shot",
|
|
558
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
559
|
-
},
|
|
560
|
-
{
|
|
561
|
-
"benchmark": "mela",
|
|
562
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
563
|
-
},
|
|
564
|
-
{
|
|
565
|
-
"benchmark": "paws_ca",
|
|
566
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
567
|
-
},
|
|
568
|
-
{
|
|
569
|
-
"benchmark": "ja_leaderboard_jaqket_v2",
|
|
570
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
571
|
-
},
|
|
572
|
-
{
|
|
573
|
-
"benchmark": "super_glue-boolq-t5-prompt",
|
|
574
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
575
|
-
},
|
|
576
|
-
{
|
|
577
|
-
"benchmark": "multiple_choice",
|
|
578
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
579
|
-
},
|
|
580
|
-
{
|
|
581
|
-
"benchmark": "option_order_robustness_agieval_aqua_rat",
|
|
582
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
583
|
-
},
|
|
584
|
-
{
|
|
585
|
-
"benchmark": "phrases_ca-va",
|
|
586
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
587
|
-
},
|
|
588
|
-
{
|
|
589
|
-
"benchmark": "code2text_go",
|
|
590
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
591
|
-
},
|
|
592
|
-
{
|
|
593
|
-
"benchmark": "ethics_cm",
|
|
594
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
595
|
-
},
|
|
596
|
-
{
|
|
597
|
-
"benchmark": "cabreu",
|
|
598
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
599
|
-
},
|
|
600
|
-
{
|
|
601
|
-
"benchmark": "sycophancy",
|
|
602
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
603
|
-
},
|
|
604
|
-
{
|
|
605
|
-
"benchmark": "evalita_sp_sum_task_fp-small_p1",
|
|
606
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
607
|
-
},
|
|
608
|
-
{
|
|
609
|
-
"benchmark": "glianorex",
|
|
610
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
611
|
-
},
|
|
612
|
-
{
|
|
613
|
-
"benchmark": "flan_held_in",
|
|
614
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
615
|
-
},
|
|
616
|
-
{
|
|
617
|
-
"benchmark": "assin_entailment",
|
|
618
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
619
|
-
},
|
|
620
|
-
{
|
|
621
|
-
"benchmark": "gsm_plus",
|
|
622
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
623
|
-
},
|
|
624
|
-
{
|
|
625
|
-
"benchmark": "mnli",
|
|
626
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
627
|
-
},
|
|
628
|
-
{
|
|
629
|
-
"benchmark": "tinyTruthfulQA",
|
|
630
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
631
|
-
},
|
|
632
|
-
{
|
|
633
|
-
"benchmark": "multimedqa",
|
|
634
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
635
|
-
},
|
|
636
|
-
{
|
|
637
|
-
"benchmark": "openllm",
|
|
638
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
639
|
-
},
|
|
640
|
-
{
|
|
641
|
-
"benchmark": "pythia",
|
|
642
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
643
|
-
},
|
|
644
|
-
{
|
|
645
|
-
"benchmark": "t0_eval",
|
|
646
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
647
|
-
},
|
|
648
|
-
{
|
|
649
|
-
"benchmark": "Tag",
|
|
650
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
651
|
-
},
|
|
652
|
-
{
|
|
653
|
-
"benchmark": "basque-glue",
|
|
654
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
655
|
-
},
|
|
656
|
-
{
|
|
657
|
-
"benchmark": "chain_of_thought",
|
|
658
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
659
|
-
},
|
|
660
|
-
{
|
|
661
|
-
"benchmark": "freebase",
|
|
662
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
663
|
-
},
|
|
664
|
-
{
|
|
665
|
-
"benchmark": "gpt3_translation_benchmarks",
|
|
666
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
667
|
-
},
|
|
668
|
-
{
|
|
669
|
-
"benchmark": "iwslt2017",
|
|
670
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
671
|
-
},
|
|
672
|
-
{
|
|
673
|
-
"benchmark": "llama",
|
|
674
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
675
|
-
},
|
|
676
|
-
{
|
|
677
|
-
"benchmark": "self_consistency",
|
|
678
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
679
|
-
},
|
|
680
|
-
{
|
|
681
|
-
"benchmark": "super-glue-lm-eval-v1",
|
|
682
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
683
|
-
},
|
|
684
|
-
{
|
|
685
|
-
"benchmark": "super-glue-lm-eval-v1-seq2seq",
|
|
686
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
687
|
-
},
|
|
688
|
-
{
|
|
689
|
-
"benchmark": "super-glue-t5-prompt",
|
|
690
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
691
|
-
},
|
|
692
|
-
{
|
|
693
|
-
"benchmark": "wmt14",
|
|
694
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
695
|
-
},
|
|
696
|
-
{
|
|
697
|
-
"benchmark": "wmt14_en_fr",
|
|
698
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
699
|
-
},
|
|
700
|
-
{
|
|
701
|
-
"benchmark": "wmt14_fr_en",
|
|
702
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
703
|
-
},
|
|
704
|
-
{
|
|
705
|
-
"benchmark": "wmt16_de_en",
|
|
706
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
707
|
-
},
|
|
708
|
-
{
|
|
709
|
-
"benchmark": "wmt16_en_de",
|
|
710
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
711
|
-
},
|
|
712
|
-
{
|
|
713
|
-
"benchmark": "wmt16_en_ro",
|
|
714
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
715
|
-
},
|
|
716
|
-
{
|
|
717
|
-
"benchmark": "wmt16_ro_en",
|
|
718
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
719
|
-
},
|
|
720
|
-
{
|
|
721
|
-
"benchmark": "20_newsgroups",
|
|
722
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
723
|
-
},
|
|
724
|
-
{
|
|
725
|
-
"benchmark": "ag_news",
|
|
726
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
727
|
-
},
|
|
728
|
-
{
|
|
729
|
-
"benchmark": "anagrams1",
|
|
730
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
731
|
-
},
|
|
732
|
-
{
|
|
733
|
-
"benchmark": "anagrams2",
|
|
734
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
735
|
-
},
|
|
736
|
-
{
|
|
737
|
-
"benchmark": "argument_topic",
|
|
738
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
739
|
-
},
|
|
740
|
-
{
|
|
741
|
-
"benchmark": "atis",
|
|
742
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
743
|
-
},
|
|
744
|
-
{
|
|
745
|
-
"benchmark": "banking77",
|
|
746
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
747
|
-
},
|
|
748
|
-
{
|
|
749
|
-
"benchmark": "bec2016eu",
|
|
750
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
751
|
-
},
|
|
752
|
-
{
|
|
753
|
-
"benchmark": "bhtc_v2",
|
|
754
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
755
|
-
},
|
|
756
|
-
{
|
|
757
|
-
"benchmark": "boolq-seq2seq",
|
|
758
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
759
|
-
},
|
|
760
|
-
{
|
|
761
|
-
"benchmark": "catalanqa",
|
|
762
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
763
|
-
},
|
|
764
|
-
{
|
|
765
|
-
"benchmark": "catcola",
|
|
766
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
767
|
-
},
|
|
768
|
-
{
|
|
769
|
-
"benchmark": "claim_stance_topic",
|
|
770
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
771
|
-
},
|
|
772
|
-
{
|
|
773
|
-
"benchmark": "cnn_dailymail",
|
|
774
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
775
|
-
},
|
|
776
|
-
{
|
|
777
|
-
"benchmark": "cocoteros_es",
|
|
778
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
779
|
-
},
|
|
780
|
-
{
|
|
781
|
-
"benchmark": "coedit_gec",
|
|
782
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
783
|
-
},
|
|
784
|
-
{
|
|
785
|
-
"benchmark": "cola",
|
|
786
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
787
|
-
},
|
|
788
|
-
{
|
|
789
|
-
"benchmark": "coqcat",
|
|
790
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
791
|
-
},
|
|
792
|
-
{
|
|
793
|
-
"benchmark": "cycle_letters",
|
|
794
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
795
|
-
},
|
|
796
|
-
{
|
|
797
|
-
"benchmark": "dbpedia_14",
|
|
798
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
799
|
-
},
|
|
800
|
-
{
|
|
801
|
-
"benchmark": "doc_vqa",
|
|
802
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
803
|
-
},
|
|
804
|
-
{
|
|
805
|
-
"benchmark": "epec_koref_bin",
|
|
806
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
807
|
-
},
|
|
808
|
-
{
|
|
809
|
-
"benchmark": "escola",
|
|
810
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
811
|
-
},
|
|
812
|
-
{
|
|
813
|
-
"benchmark": "ethos_binary",
|
|
814
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
815
|
-
},
|
|
816
|
-
{
|
|
817
|
-
"benchmark": "financial_tweets",
|
|
818
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
819
|
-
},
|
|
820
|
-
{
|
|
821
|
-
"benchmark": "galcola",
|
|
822
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
823
|
-
},
|
|
824
|
-
{
|
|
825
|
-
"benchmark": "iwslt2017-ar-en",
|
|
826
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
827
|
-
},
|
|
828
|
-
{
|
|
829
|
-
"benchmark": "iwslt2017-en-ar",
|
|
830
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
831
|
-
},
|
|
832
|
-
{
|
|
833
|
-
"benchmark": "law_stack_exchange",
|
|
834
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
835
|
-
},
|
|
836
|
-
{
|
|
837
|
-
"benchmark": "ledgar",
|
|
838
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
839
|
-
},
|
|
840
|
-
{
|
|
841
|
-
"benchmark": "logieval",
|
|
842
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
843
|
-
},
|
|
844
|
-
{
|
|
845
|
-
"benchmark": "medical_abstracts",
|
|
846
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
847
|
-
},
|
|
848
|
-
{
|
|
849
|
-
"benchmark": "noticia",
|
|
850
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
851
|
-
},
|
|
852
|
-
{
|
|
853
|
-
"benchmark": "parafraseja",
|
|
854
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
855
|
-
},
|
|
856
|
-
{
|
|
857
|
-
"benchmark": "parafrases_gl",
|
|
858
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
859
|
-
},
|
|
860
|
-
{
|
|
861
|
-
"benchmark": "qnlieu",
|
|
862
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
863
|
-
},
|
|
864
|
-
{
|
|
865
|
-
"benchmark": "random_insertion",
|
|
866
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
867
|
-
},
|
|
868
|
-
{
|
|
869
|
-
"benchmark": "reversed_words",
|
|
870
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
871
|
-
},
|
|
872
|
-
{
|
|
873
|
-
"benchmark": "sglue_rte",
|
|
874
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
875
|
-
},
|
|
876
|
-
{
|
|
877
|
-
"benchmark": "stsb",
|
|
878
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
879
|
-
},
|
|
880
|
-
{
|
|
881
|
-
"benchmark": "summarization_gl",
|
|
882
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
883
|
-
},
|
|
884
|
-
{
|
|
885
|
-
"benchmark": "teca",
|
|
886
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
887
|
-
},
|
|
888
|
-
{
|
|
889
|
-
"benchmark": "tinyArc",
|
|
890
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
891
|
-
},
|
|
892
|
-
{
|
|
893
|
-
"benchmark": "tinyGSM8k",
|
|
894
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
895
|
-
},
|
|
896
|
-
{
|
|
897
|
-
"benchmark": "tinyHellaswag",
|
|
898
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
899
|
-
},
|
|
900
|
-
{
|
|
901
|
-
"benchmark": "tinyMMLU",
|
|
902
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
903
|
-
},
|
|
904
|
-
{
|
|
905
|
-
"benchmark": "tinyWinogrande",
|
|
906
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
907
|
-
},
|
|
908
|
-
{
|
|
909
|
-
"benchmark": "unfair_tos",
|
|
910
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
911
|
-
},
|
|
912
|
-
{
|
|
913
|
-
"benchmark": "vaxx_stance",
|
|
914
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
915
|
-
},
|
|
916
|
-
{
|
|
917
|
-
"benchmark": "wiceu",
|
|
918
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
919
|
-
},
|
|
920
|
-
{
|
|
921
|
-
"benchmark": "wmt-ro-en-t5-prompt",
|
|
922
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
923
|
-
},
|
|
924
|
-
{
|
|
925
|
-
"benchmark": "xlsum_es",
|
|
926
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
927
|
-
},
|
|
928
|
-
{
|
|
929
|
-
"benchmark": "xsum",
|
|
930
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
931
|
-
},
|
|
932
|
-
{
|
|
933
|
-
"benchmark": "yahoo_answers_topics",
|
|
934
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
935
|
-
},
|
|
936
|
-
{
|
|
937
|
-
"benchmark": "instructhumaneval",
|
|
938
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
939
|
-
},
|
|
940
|
-
{
|
|
941
|
-
"benchmark": "humanevalpack",
|
|
942
|
-
"error": "NOT RUN - Test suite stopped before reaching this benchmark"
|
|
943
|
-
}
|
|
944
|
-
],
|
|
945
|
-
"note": "Includes failures from tests 1-198 and all untested benchmarks (tests 200-318). Test 199 (tmmluplus) got stuck on network I/O."
|
|
946
|
-
}
|