wisent 0.7.379-py3-none-any.whl → 0.7.901-py3-none-any.whl
This diff shows the contents of publicly released package versions from a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
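For readers who want to reproduce a file-level comparison like the listing below, here is a minimal sketch. It assumes both wheels have already been downloaded locally (for example with `pip download wisent==0.7.379 --no-deps`); the wheel filenames are inferred from the versions above and only the standard library is used.

```python
# Minimal sketch: compare the file listings of two locally downloaded wheels.
# Wheel filenames are assumptions inferred from the version numbers in this diff.
from zipfile import ZipFile

OLD_WHEEL = "wisent-0.7.379-py3-none-any.whl"
NEW_WHEEL = "wisent-0.7.901-py3-none-any.whl"

def file_set(wheel_path: str) -> set[str]:
    """Return the set of file paths contained in a wheel (a zip archive)."""
    with ZipFile(wheel_path) as zf:
        return set(zf.namelist())

old_files = file_set(OLD_WHEEL)
new_files = file_set(NEW_WHEEL)

print("Added files:")
for path in sorted(new_files - old_files):
    print(f"  + {path}")

print("Removed files:")
for path in sorted(old_files - new_files):
    print(f"  - {path}")

print("Present in both (possibly modified):")
for path in sorted(old_files & new_files):
    print(f"  ~ {path}")
```

This only reports added, removed, and common paths; the per-file `+N -M` line counts shown below additionally require diffing the extracted file contents.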
- wisent/__init__.py +1 -1
- wisent/core/activations/__init__.py +22 -6
- wisent/core/activations/activation_cache.py +393 -0
- wisent/core/activations/activations.py +22 -40
- wisent/core/activations/activations_collector.py +145 -373
- wisent/core/activations/classifier_inference_strategy.py +195 -0
- wisent/core/activations/core/atoms.py +8 -92
- wisent/core/activations/extraction_strategy.py +480 -0
- wisent/core/agent/diagnose/response_diagnostics.py +3 -3
- wisent/core/agent/diagnose.py +3 -3
- wisent/core/autonomous_agent.py +2 -2
- wisent/core/classifiers/classifiers/core/atoms.py +3 -2
- wisent/core/cli/__init__.py +2 -1
- wisent/core/cli/agent/apply_steering.py +25 -31
- wisent/core/cli/agent/evaluate_response.py +18 -20
- wisent/core/cli/agent/train_classifier.py +36 -26
- wisent/core/cli/check_linearity.py +35 -3
- wisent/core/cli/cluster_benchmarks.py +470 -0
- wisent/core/cli/create_steering_vector.py +19 -9
- wisent/core/cli/diagnose_vectors.py +7 -4
- wisent/core/cli/estimate_unified_goodness_time.py +6 -4
- wisent/core/cli/generate_pairs_from_task.py +9 -56
- wisent/core/cli/generate_vector_from_task.py +4 -0
- wisent/core/cli/geometry_search.py +137 -0
- wisent/core/cli/get_activations.py +13 -37
- wisent/core/cli/method_optimizer.py +860 -0
- wisent/core/cli/modify_weights.py +3 -2
- wisent/core/cli/optimize.py +44 -5
- wisent/core/cli/optimize_classification.py +5 -6
- wisent/core/cli/optimize_sample_size.py +9 -23
- wisent/core/cli/optimize_steering.py +433 -159
- wisent/core/cli/optimize_weights.py +67 -7
- wisent/core/cli/preview_pairs.py +203 -0
- wisent/core/cli/steering_method_trainer.py +8 -7
- wisent/core/cli/steering_search_space.py +20 -15
- wisent/core/cli/tasks.py +31 -117
- wisent/core/cli/train_unified_goodness.py +18 -19
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1582 -177
- wisent/core/contrastive_pairs/diagnostics/linearity.py +70 -80
- wisent/core/contrastive_pairs/diagnostics/vector_quality.py +6 -5
- wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +5 -19
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +11 -5
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +146 -32
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +2 -2
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +98 -57
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +8 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +11 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval_aqua_rat.py +129 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +11 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +47 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
- wisent/core/data_loaders/loaders/lm_loader.py +12 -1
- wisent/core/evaluators/benchmark_specific/apps_evaluator.py +133 -0
- wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +6 -1
- wisent/core/evaluators/benchmark_specific/conala_evaluator.py +31 -168
- wisent/core/evaluators/custom/examples/humanization_coherent.py +89 -35
- wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +2 -20
- wisent/core/evaluators/personalization/coherence.py +46 -0
- wisent/core/geometry_runner.py +995 -0
- wisent/core/geometry_search_space.py +237 -0
- wisent/core/hyperparameter_optimizer.py +14 -14
- wisent/core/lm_eval_harness_ground_truth.py +7 -11
- wisent/core/main.py +6 -0
- wisent/core/models/core/atoms.py +5 -3
- wisent/core/models/wisent_model.py +9 -8
- wisent/core/opti/methods/opti_weights.py +29 -2
- wisent/core/optuna/classifier/activation_generator.py +14 -12
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
- wisent/core/optuna/steering/steering_optimization.py +14 -9
- wisent/core/parser_arguments/check_linearity_parser.py +12 -2
- wisent/core/parser_arguments/cluster_benchmarks_parser.py +31 -0
- wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +22 -2
- wisent/core/parser_arguments/geometry_search_parser.py +61 -0
- wisent/core/parser_arguments/main_parser.py +16 -0
- wisent/core/parser_arguments/optimize_steering_parser.py +117 -10
- wisent/core/parser_arguments/optimize_weights_parser.py +6 -0
- wisent/core/parser_arguments/tasks_parser.py +7 -19
- wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
- wisent/core/steering.py +5 -3
- wisent/core/steering_methods/core/atoms.py +1 -2
- wisent/core/steering_methods/methods/caa.py +1 -1
- wisent/core/steering_methods/methods/hyperplane.py +75 -0
- wisent/core/steering_methods/methods/prism.py +1 -2
- wisent/core/steering_methods/methods/pulse.py +39 -8
- wisent/core/steering_methods/methods/titan.py +59 -14
- wisent/core/steering_methods/registry.py +52 -12
- wisent/core/steering_optimizer.py +15 -15
- wisent/core/synthetic/generators/nonsense_generator.py +30 -18
- wisent/core/trainers/steering_trainer.py +11 -20
- wisent/core/utils/device.py +27 -27
- wisent/core/utils/layer_combinations.py +70 -0
- wisent/examples/__init__.py +1 -0
- wisent/examples/scripts/__init__.py +1 -0
- wisent/examples/scripts/count_all_benchmarks.py +121 -0
- wisent/examples/scripts/discover_directions.py +469 -0
- wisent/examples/scripts/extract_benchmark_info.py +71 -0
- wisent/examples/scripts/generate_paper_data.py +384 -0
- wisent/examples/scripts/intervention_validation.py +626 -0
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
- wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
- wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
- wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
- wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
- wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
- wisent/examples/scripts/search_all_short_names.py +31 -0
- wisent/examples/scripts/test_all_benchmarks.py +138 -0
- wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
- wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
- wisent/examples/scripts/test_nonsense_baseline.py +261 -0
- wisent/examples/scripts/test_one_benchmark.py +324 -0
- wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
- wisent/examples/scripts/threshold_analysis.py +434 -0
- wisent/examples/scripts/visualization_gallery.py +582 -0
- wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
- wisent/parameters/lm_eval/category_directions.json +137 -0
- wisent/parameters/lm_eval/repair_plan.json +282 -0
- wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +19 -70
- wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
- wisent/parameters/lm_eval/working_benchmarks.json +206 -0
- wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
- wisent/scripts/run_quality_metrics_sweep.sh +22 -27
- wisent/tests/test_aggregation_geometry.py +236 -0
- wisent/tests/test_detector_accuracy.py +163 -0
- wisent/tests/test_geometry_exhaustive.py +1202 -0
- wisent/tests/visualize_geometry.py +255 -61
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/RECORD +376 -974
- wisent/core/activations/prompt_construction_strategy.py +0 -47
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +0 -15
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +0 -64
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +0 -65
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +0 -99
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +0 -180
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +0 -129
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +0 -142
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +0 -155
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +0 -161
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +0 -107
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +0 -155
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +0 -155
- wisent/examples/scripts/results/benchmark_descriptions.json +0 -1244
- wisent/examples/scripts/results/benchmark_evaluation_methods.json +0 -66
- wisent/examples/scripts/results/benchmark_evaluator_mapping.json +0 -2781
- wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +0 -30536
- wisent/examples/scripts/results/benchmark_evaluators_clean.json +0 -469
- wisent/examples/scripts/results/benchmark_methods_summary.json +0 -260
- wisent/examples/scripts/results/benchmark_pair_creation_methods.json +0 -66
- wisent/examples/scripts/results/benchmark_pair_totals.json +0 -269
- wisent/examples/scripts/results/benchmark_tags.json +0 -917
- wisent/examples/scripts/results/benchmark_test_summary_nov4.json +0 -71
- wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +0 -150
- wisent/examples/scripts/results/failing_benchmarks.json +0 -946
- wisent/examples/scripts/results/failing_benchmarks_list.json +0 -41
- wisent/examples/scripts/results/failing_benchmarks_test_results.json +0 -945
- wisent/examples/scripts/results/missing_benchmark_tags.json +0 -341
- wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +0 -30
- wisent/examples/scripts/results/test_20_newsgroups_pairs.json +0 -8
- wisent/examples/scripts/results/test_AraDICE_evaluation.json +0 -51
- wisent/examples/scripts/results/test_AraDICE_pairs.json +0 -14
- wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +0 -8
- wisent/examples/scripts/results/test_ArabCulture_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ArabCulture_pairs.json +0 -14
- wisent/examples/scripts/results/test_Tag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_Tag_pairs.json +0 -8
- wisent/examples/scripts/results/test_aclue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aclue_pairs.json +0 -14
- wisent/examples/scripts/results/test_acp_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +0 -14
- wisent/examples/scripts/results/test_acp_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +0 -51
- wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +0 -14
- wisent/examples/scripts/results/test_aexams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aexams_pairs.json +0 -14
- wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/results/test_ag_news_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ag_news_pairs.json +0 -8
- wisent/examples/scripts/results/test_agieval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_agieval_pairs.json +0 -14
- wisent/examples/scripts/results/test_aime2024_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime2024_pairs.json +0 -8
- wisent/examples/scripts/results/test_aime2025_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime2025_pairs.json +0 -8
- wisent/examples/scripts/results/test_aime_evaluation.json +0 -30
- wisent/examples/scripts/results/test_aime_pairs.json +0 -8
- wisent/examples/scripts/results/test_anagrams1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anagrams1_pairs.json +0 -8
- wisent/examples/scripts/results/test_anagrams2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anagrams2_pairs.json +0 -8
- wisent/examples/scripts/results/test_anli_evaluation.json +0 -30
- wisent/examples/scripts/results/test_anli_pairs.json +0 -8
- wisent/examples/scripts/results/test_apps_evaluation.json +0 -30
- wisent/examples/scripts/results/test_apps_pairs.json +0 -8
- wisent/examples/scripts/results/test_arabic_exams_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arabic_exams_pairs.json +0 -8
- wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +0 -14
- wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +0 -14
- wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arabicmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +0 -14
- wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +0 -51
- wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +0 -14
- wisent/examples/scripts/results/test_arc_ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_arc_challenge_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_challenge_pairs.json +0 -8
- wisent/examples/scripts/results/test_arc_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_arc_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_argument_topic_evaluation.json +0 -30
- wisent/examples/scripts/results/test_argument_topic_pairs.json +0 -8
- wisent/examples/scripts/results/test_arithmetic_evaluation.json +0 -51
- wisent/examples/scripts/results/test_arithmetic_pairs.json +0 -14
- wisent/examples/scripts/results/test_asdiv_evaluation.json +0 -30
- wisent/examples/scripts/results/test_asdiv_pairs.json +0 -8
- wisent/examples/scripts/results/test_assin_entailment_evaluation.json +0 -30
- wisent/examples/scripts/results/test_assin_entailment_pairs.json +0 -8
- wisent/examples/scripts/results/test_atis_evaluation.json +0 -30
- wisent/examples/scripts/results/test_atis_pairs.json +0 -8
- wisent/examples/scripts/results/test_babi_evaluation.json +0 -30
- wisent/examples/scripts/results/test_babi_pairs.json +0 -8
- wisent/examples/scripts/results/test_babilong_evaluation.json +0 -30
- wisent/examples/scripts/results/test_babilong_pairs.json +0 -8
- wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +0 -8
- wisent/examples/scripts/results/test_banking77_evaluation.json +0 -30
- wisent/examples/scripts/results/test_banking77_pairs.json +0 -8
- wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_basqueglue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_basqueglue_pairs.json +0 -14
- wisent/examples/scripts/results/test_bbh_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bbh_pairs.json +0 -14
- wisent/examples/scripts/results/test_bbq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bbq_pairs.json +0 -8
- wisent/examples/scripts/results/test_bec2016eu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bec2016eu_pairs.json +0 -14
- wisent/examples/scripts/results/test_belebele_evaluation.json +0 -51
- wisent/examples/scripts/results/test_belebele_pairs.json +0 -14
- wisent/examples/scripts/results/test_benchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_benchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_bertaqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bertaqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_bhtc_v2_pairs.json +0 -8
- wisent/examples/scripts/results/test_bigbench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_bigbench_pairs.json +0 -14
- wisent/examples/scripts/results/test_blimp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_blimp_pairs.json +0 -14
- wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +0 -8
- wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +0 -8
- wisent/examples/scripts/results/test_boolq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_boolq_pairs.json +0 -8
- wisent/examples/scripts/results/test_c4_evaluation.json +0 -30
- wisent/examples/scripts/results/test_c4_pairs.json +0 -8
- wisent/examples/scripts/results/test_cabreu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cabreu_pairs.json +0 -8
- wisent/examples/scripts/results/test_careqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_careqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_catalan_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_catalan_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_catalanqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_catalanqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_catcola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_catcola_pairs.json +0 -8
- wisent/examples/scripts/results/test_cb_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cb_pairs.json +0 -8
- wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +0 -14
- wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +0 -8
- wisent/examples/scripts/results/test_ceval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval_pairs.json +0 -14
- wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +0 -14
- wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +0 -51
- wisent/examples/scripts/results/test_chain_of_thought_pairs.json +0 -14
- wisent/examples/scripts/results/test_chartqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_chartqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +0 -30
- wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +0 -8
- wisent/examples/scripts/results/test_cmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_cmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +0 -8
- wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cocoteros_es_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +0 -8
- wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +0 -30
- wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +0 -8
- wisent/examples/scripts/results/test_coedit_gec_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coedit_gec_pairs.json +0 -8
- wisent/examples/scripts/results/test_cola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cola_pairs.json +0 -8
- wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_commonsense_qa_pairs.json +0 -8
- wisent/examples/scripts/results/test_conala_evaluation.json +0 -30
- wisent/examples/scripts/results/test_conala_pairs.json +0 -8
- wisent/examples/scripts/results/test_concode_evaluation.json +0 -30
- wisent/examples/scripts/results/test_concode_pairs.json +0 -8
- wisent/examples/scripts/results/test_copa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_copa_pairs.json +0 -8
- wisent/examples/scripts/results/test_copal_id_evaluation.json +0 -30
- wisent/examples/scripts/results/test_copal_id_pairs.json +0 -8
- wisent/examples/scripts/results/test_coqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_coqcat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_coqcat_pairs.json +0 -8
- wisent/examples/scripts/results/test_crows_pairs_evaluation.json +0 -51
- wisent/examples/scripts/results/test_crows_pairs_pairs.json +0 -14
- wisent/examples/scripts/results/test_csatqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_csatqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_cycle_letters_evaluation.json +0 -30
- wisent/examples/scripts/results/test_cycle_letters_pairs.json +0 -8
- wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_darija_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darija_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_darijahellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_darijammlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_darijammlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +0 -30
- wisent/examples/scripts/results/test_dbpedia_14_pairs.json +0 -8
- wisent/examples/scripts/results/test_drop_evaluation.json +0 -30
- wisent/examples/scripts/results/test_drop_pairs.json +0 -8
- wisent/examples/scripts/results/test_ds1000_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ds1000_pairs.json +0 -8
- wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_egyhellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_egymmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_egymmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +0 -30
- wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +0 -8
- wisent/examples/scripts/results/test_eq_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eq_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_escola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_escola_pairs.json +0 -8
- wisent/examples/scripts/results/test_ethics_cm_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ethics_cm_pairs.json +0 -8
- wisent/examples/scripts/results/test_ethos_binary_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ethos_binary_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams_es_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_exams_evaluation.json +0 -51
- wisent/examples/scripts/results/test_eus_exams_pairs.json +0 -14
- wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_proficiency_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_reading_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_reading_pairs.json +0 -8
- wisent/examples/scripts/results/test_eus_trivia_evaluation.json +0 -30
- wisent/examples/scripts/results/test_eus_trivia_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita-mp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita-mp_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_LLM_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +0 -8
- wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +0 -14
- wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
- wisent/examples/scripts/results/test_fda_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fda_pairs.json +0 -8
- wisent/examples/scripts/results/test_financial_tweets_evaluation.json +0 -30
- wisent/examples/scripts/results/test_financial_tweets_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +0 -30
- wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_fld_pairs.json +0 -8
- wisent/examples/scripts/results/test_flores_evaluation.json +0 -51
- wisent/examples/scripts/results/test_flores_pairs.json +0 -14
- wisent/examples/scripts/results/test_freebase_evaluation.json +0 -30
- wisent/examples/scripts/results/test_freebase_pairs.json +0 -8
- wisent/examples/scripts/results/test_french_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_french_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_galcola_evaluation.json +0 -30
- wisent/examples/scripts/results/test_galcola_pairs.json +0 -8
- wisent/examples/scripts/results/test_galician_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_galician_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_glianorex_evaluation.json +0 -30
- wisent/examples/scripts/results/test_glianorex_pairs.json +0 -8
- wisent/examples/scripts/results/test_global_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_global_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_glue_evaluation.json +0 -51
- wisent/examples/scripts/results/test_glue_pairs.json +0 -14
- wisent/examples/scripts/results/test_gpqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_gpqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_groundcocoa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_groundcocoa_pairs.json +0 -8
- wisent/examples/scripts/results/test_gsm8k_evaluation.json +0 -30
- wisent/examples/scripts/results/test_gsm8k_pairs.json +0 -8
- wisent/examples/scripts/results/test_haerae_evaluation.json +0 -51
- wisent/examples/scripts/results/test_haerae_pairs.json +0 -14
- wisent/examples/scripts/results/test_headqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_headqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_hellaswag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hellaswag_pairs.json +0 -8
- wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +0 -14
- wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hendrycks_math_pairs.json +0 -14
- wisent/examples/scripts/results/test_histoires_morales_evaluation.json +0 -30
- wisent/examples/scripts/results/test_histoires_morales_pairs.json +0 -8
- wisent/examples/scripts/results/test_hmmt_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +0 -30
- wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +0 -8
- wisent/examples/scripts/results/test_hmmt_pairs.json +0 -8
- wisent/examples/scripts/results/test_hrm8k_evaluation.json +0 -51
- wisent/examples/scripts/results/test_hrm8k_pairs.json +0 -14
- wisent/examples/scripts/results/test_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +0 -30
- wisent/examples/scripts/results/test_humaneval_plus_pairs.json +0 -8
- wisent/examples/scripts/results/test_ifeval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ifeval_pairs.json +0 -8
- wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +0 -30
- wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +0 -8
- wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +0 -51
- wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +0 -30
- wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +0 -8
- wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +0 -51
- wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +0 -14
- wisent/examples/scripts/results/test_inverse_scaling_pairs.json +0 -14
- wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +0 -8
- wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +0 -30
- wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +0 -8
- wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +0 -14
- wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +0 -8
- wisent/examples/scripts/results/test_kbl_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +0 -14
- wisent/examples/scripts/results/test_kbl_pairs.json +0 -14
- wisent/examples/scripts/results/test_kmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_kobest_evaluation.json +0 -51
- wisent/examples/scripts/results/test_kobest_pairs.json +0 -14
- wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +0 -8
- wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_kormedmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_cloze_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +0 -14
- wisent/examples/scripts/results/test_lambada_openai_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_openai_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_lambada_standard_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lambada_standard_pairs.json +0 -8
- wisent/examples/scripts/results/test_leaderboard_evaluation.json +0 -51
- wisent/examples/scripts/results/test_leaderboard_pairs.json +0 -14
- wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +0 -51
- wisent/examples/scripts/results/test_libra/test_libra_pairs.json +0 -14
- wisent/examples/scripts/results/test_libra_evaluation.json +0 -51
- wisent/examples/scripts/results/test_libra_pairs.json +0 -14
- wisent/examples/scripts/results/test_lingoly_evaluation.json +0 -30
- wisent/examples/scripts/results/test_lingoly_pairs.json +0 -8
- wisent/examples/scripts/results/test_livecodebench_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livecodebench_pairs.json +0 -8
- wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +0 -30
- wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +0 -8
- wisent/examples/scripts/results/test_llama_evaluation.json +0 -30
- wisent/examples/scripts/results/test_llama_pairs.json +0 -8
- wisent/examples/scripts/results/test_logiqa2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_logiqa2_pairs.json +0 -8
- wisent/examples/scripts/results/test_logiqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_logiqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_m_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_m_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +0 -14
- wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_mastermind_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mastermind_pairs.json +0 -14
- wisent/examples/scripts/results/test_math500_evaluation.json +0 -30
- wisent/examples/scripts/results/test_math500_pairs.json +0 -8
- wisent/examples/scripts/results/test_math_evaluation.json +0 -30
- wisent/examples/scripts/results/test_math_pairs.json +0 -8
- wisent/examples/scripts/results/test_mathqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mathqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_mbpp_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mbpp_pairs.json +0 -8
- wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mbpp_plus_pairs.json +0 -8
- wisent/examples/scripts/results/test_mc_taco_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mc_taco_pairs.json +0 -8
- wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +0 -14
- wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +0 -30
- wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +0 -8
- wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +0 -14
- wisent/examples/scripts/results/test_meddialog_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meddialog_pairs.json +0 -8
- wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +0 -8
- wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +0 -8
- wisent/examples/scripts/results/test_medmcqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medmcqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_medqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_medtext_evaluation.json +0 -30
- wisent/examples/scripts/results/test_medtext_pairs.json +0 -8
- wisent/examples/scripts/results/test_mela_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mela_pairs.json +0 -14
- wisent/examples/scripts/results/test_meqsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_meqsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_mercury_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mercury_pairs.json +0 -8
- wisent/examples/scripts/results/test_metabench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_metabench_pairs.json +0 -14
- wisent/examples/scripts/results/test_mgsm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mgsm_pairs.json +0 -14
- wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mimic_repsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_minerva_math_evaluation.json +0 -51
- wisent/examples/scripts/results/test_minerva_math_pairs.json +0 -14
- wisent/examples/scripts/results/test_mlqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mlqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_pro_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmlu_prox_pairs.json +0 -14
- wisent/examples/scripts/results/test_mmlusr_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mmlusr_pairs.json +0 -8
- wisent/examples/scripts/results/test_mmmu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_mmmu_pairs.json +0 -14
- wisent/examples/scripts/results/test_mnli_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mnli_pairs.json +0 -8
- wisent/examples/scripts/results/test_model_written_evals_evaluation.json +0 -51
- wisent/examples/scripts/results/test_model_written_evals_pairs.json +0 -14
- wisent/examples/scripts/results/test_moral_stories_evaluation.json +0 -30
- wisent/examples/scripts/results/test_moral_stories_pairs.json +0 -8
- wisent/examples/scripts/results/test_mts_dialog_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mts_dialog_pairs.json +0 -8
- wisent/examples/scripts/results/test_multiblimp_evaluation.json +0 -51
- wisent/examples/scripts/results/test_multiblimp_pairs.json +0 -14
- wisent/examples/scripts/results/test_multimedqa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_multimedqa_pairs.json +0 -14
- wisent/examples/scripts/results/test_multipl_e_evaluation.json +0 -30
- wisent/examples/scripts/results/test_multipl_e_pairs.json +0 -8
- wisent/examples/scripts/results/test_mutual_evaluation.json +0 -30
- wisent/examples/scripts/results/test_mutual_pairs.json +0 -8
- wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_noreval_evaluation.json +0 -51
- wisent/examples/scripts/results/test_noreval_pairs.json +0 -14
- wisent/examples/scripts/results/test_noticia_evaluation.json +0 -30
- wisent/examples/scripts/results/test_noticia_pairs.json +0 -8
- wisent/examples/scripts/results/test_nq_open_evaluation.json +0 -30
- wisent/examples/scripts/results/test_nq_open_pairs.json +0 -8
- wisent/examples/scripts/results/test_olaph_evaluation.json +0 -30
- wisent/examples/scripts/results/test_olaph_pairs.json +0 -8
- wisent/examples/scripts/results/test_openbookqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_openbookqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_openllm_evaluation.json +0 -51
- wisent/examples/scripts/results/test_openllm_pairs.json +0 -14
- wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_paloma_evaluation.json +0 -51
- wisent/examples/scripts/results/test_paloma_pairs.json +0 -14
- wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +0 -30
- wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +0 -8
- wisent/examples/scripts/results/test_paws-x_evaluation.json +0 -51
- wisent/examples/scripts/results/test_paws-x_pairs.json +0 -14
- wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_penn_treebank_evaluation.json +0 -30
- wisent/examples/scripts/results/test_penn_treebank_pairs.json +0 -8
- wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +0 -30
- wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +0 -8
- wisent/examples/scripts/results/test_piqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_piqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_polemo2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polemo2_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_en_high_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +0 -8
- wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +0 -30
- wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +0 -8
- wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_portuguese_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
- wisent/examples/scripts/results/test_prost_evaluation.json +0 -30
- wisent/examples/scripts/results/test_prost_pairs.json +0 -8
- wisent/examples/scripts/results/test_ptb_evaluation.json +0 -30
- wisent/examples/scripts/results/test_ptb_pairs.json +0 -8
- wisent/examples/scripts/results/test_pubmedqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_pubmedqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_pythia_evaluation.json +0 -51
- wisent/examples/scripts/results/test_pythia_pairs.json +0 -14
- wisent/examples/scripts/results/test_qa4mre_evaluation.json +0 -30
- wisent/examples/scripts/results/test_qa4mre_pairs.json +0 -8
- wisent/examples/scripts/results/test_qasper_evaluation.json +0 -30
- wisent/examples/scripts/results/test_qasper_pairs.json +0 -8
- wisent/examples/scripts/results/test_race_evaluation.json +0 -30
- wisent/examples/scripts/results/test_race_pairs.json +0 -8
- wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +0 -30
- wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +0 -8
- wisent/examples/scripts/results/test_recode_evaluation.json +0 -30
- wisent/examples/scripts/results/test_recode_pairs.json +0 -8
- wisent/examples/scripts/results/test_record_evaluation.json +0 -30
- wisent/examples/scripts/results/test_record_pairs.json +0 -8
- wisent/examples/scripts/results/test_ruler_evaluation.json +0 -51
- wisent/examples/scripts/results/test_ruler_pairs.json +0 -14
- wisent/examples/scripts/results/test_sciq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_sciq_pairs.json +0 -8
- wisent/examples/scripts/results/test_score_evaluation.json +0 -51
- wisent/examples/scripts/results/test_score_pairs.json +0 -14
- wisent/examples/scripts/results/test_self_consistency_evaluation.json +0 -30
- wisent/examples/scripts/results/test_self_consistency_pairs.json +0 -8
- wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_siqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_siqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_spanish_bench_evaluation.json +0 -51
- wisent/examples/scripts/results/test_spanish_bench_pairs.json +0 -14
- wisent/examples/scripts/results/test_squad2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_squad2_pairs.json +0 -8
- wisent/examples/scripts/results/test_squadv2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_squadv2_pairs.json +0 -8
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +0 -30
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +0 -8
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +0 -51
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +0 -14
- wisent/examples/scripts/results/test_swag_evaluation.json +0 -30
- wisent/examples/scripts/results/test_swag_pairs.json +0 -8
- wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +0 -51
- wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +0 -14
- wisent/examples/scripts/results/test_tmmluplus_evaluation.json +0 -51
- wisent/examples/scripts/results/test_tmmluplus_pairs.json +0 -14
- wisent/examples/scripts/results/test_translation_evaluation.json +0 -51
- wisent/examples/scripts/results/test_translation_pairs.json +0 -14
- wisent/examples/scripts/results/test_triviaqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_triviaqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +0 -51
- wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +0 -14
- wisent/examples/scripts/results/test_truthfulqa_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +0 -30
- wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +0 -8
- wisent/examples/scripts/results/test_truthfulqa_pairs.json +0 -8
- wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +0 -51
- wisent/examples/scripts/results/test_turkishmmlu_pairs.json +0 -14
- wisent/examples/scripts/results/test_unfair_tos_evaluation.json +0 -30
- wisent/examples/scripts/results/test_unfair_tos_pairs.json +0 -8
- wisent/examples/scripts/results/test_unscramble_evaluation.json +0 -51
- wisent/examples/scripts/results/test_unscramble_pairs.json +0 -14
- wisent/examples/scripts/results/test_webqs_evaluation.json +0 -30
- wisent/examples/scripts/results/test_webqs_pairs.json +0 -8
- wisent/examples/scripts/results/test_wikitext103_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wikitext103_pairs.json +0 -8
- wisent/examples/scripts/results/test_wikitext_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wikitext_pairs.json +0 -8
- wisent/examples/scripts/results/test_winogender_evaluation.json +0 -51
- wisent/examples/scripts/results/test_winogender_pairs.json +0 -14
- wisent/examples/scripts/results/test_winogrande_evaluation.json +0 -30
- wisent/examples/scripts/results/test_winogrande_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmdp_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmdp_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +0 -8
- wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +0 -8
- wisent/examples/scripts/results/test_wsc273_evaluation.json +0 -30
- wisent/examples/scripts/results/test_wsc273_pairs.json +0 -8
- wisent/examples/scripts/results/test_xcopa_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xcopa_pairs.json +0 -14
- wisent/examples/scripts/results/test_xnli_eu_evaluation.json +0 -30
- wisent/examples/scripts/results/test_xnli_eu_pairs.json +0 -8
- wisent/examples/scripts/results/test_xnli_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xnli_pairs.json +0 -14
- wisent/examples/scripts/results/test_xquad_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xquad_pairs.json +0 -14
- wisent/examples/scripts/results/test_xstorycloze_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xstorycloze_pairs.json +0 -14
- wisent/examples/scripts/results/test_xsum_evaluation.json +0 -30
- wisent/examples/scripts/results/test_xsum_pairs.json +0 -8
- wisent/examples/scripts/results/test_xwinograd_evaluation.json +0 -51
- wisent/examples/scripts/results/test_xwinograd_pairs.json +0 -14
- wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +0 -30
- wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +0 -8
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
@@ -1,2781 +0,0 @@
-{
-  "Tag": {
-    "evaluator": null,
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "tag",
-    "benchmark_type": "other",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "aclue": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "aclue",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "acp_bench": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "acp_bench",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "acp_bench_hard": {
-    "evaluator": "generation",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "acp_bench_hard",
-    "benchmark_type": "other",
-    "explanation": "Text generation evaluation - assesses quality of generated text"
-  },
-  "advanced": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "advanced",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "aexams": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "aexams",
-    "benchmark_type": "knowledge",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "afrimgsm": {
-    "evaluator": "generation",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "afrimgsm",
-    "benchmark_type": "mathematics",
-    "explanation": "Text generation evaluation - assesses quality of generated text"
-  },
-  "afrimmlu": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "afrimmlu",
-    "benchmark_type": "knowledge",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "afrixnli": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "afrixnli",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "ag": {
-    "evaluator": "exact_match",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "ag",
-    "benchmark_type": "other",
-    "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
-  },
-  "agieval": {
-    "evaluator": "exact_match",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "agieval",
-    "benchmark_type": "other",
-    "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
-  },
-  "ai2_arc": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "ai2_arc",
-    "benchmark_type": "knowledge",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "aime": {
-    "evaluator": null,
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "aime",
-    "benchmark_type": "mathematics",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "aime2024": {
-    "evaluator": null,
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "aime",
-    "benchmark_type": "mathematics",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "aime2025": {
-    "evaluator": null,
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "aime",
-    "benchmark_type": "mathematics",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "anagrams1": {
-    "evaluator": "exact_match",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "anagrams1",
-    "benchmark_type": "other",
-    "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
-  },
-  "anagrams2": {
-    "evaluator": "exact_match",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "anagrams2",
-    "benchmark_type": "other",
-    "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
-  },
-  "anli": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "anli",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "apps": {
-    "evaluator": null,
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "apps",
-    "benchmark_type": "coding",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "arabculture": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "arabculture",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "arabic": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "arabic",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "arabic_leaderboard_complete": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "arabic_leaderboard_complete",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "arabic_leaderboard_light": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "arabic_leaderboard_light",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "arabicmmlu": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "arabicmmlu",
-    "benchmark_type": "knowledge",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "aradice": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "aradice",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "arc": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "arc",
-    "benchmark_type": "knowledge",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "arc_challenge": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "arc_challenge",
-    "benchmark_type": "knowledge",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "arc_easy": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "arc_easy",
-    "benchmark_type": "knowledge",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "argument": {
-    "evaluator": "exact_match",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "argument",
-    "benchmark_type": "other",
-    "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
-  },
-  "arithmetic": {
-    "evaluator": "exact_match",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "arithmetic",
-    "benchmark_type": "mathematics",
-    "explanation": "Text comparison (WARNING: should use execution for mathematics)"
-  },
-  "asdiv": {
-    "evaluator": "exact_match",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "asdiv",
-    "benchmark_type": "mathematics",
-    "explanation": "Text comparison (WARNING: should use execution for mathematics)"
-  },
-  "asdiv_cot_llama": {
-    "evaluator": null,
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "math",
-    "benchmark_type": "mathematics",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "assin": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "assin",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "atis": {
-    "evaluator": "generation",
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "atis",
-    "benchmark_type": "other",
-    "explanation": "Text generation evaluation - assesses quality of generated text"
-  },
-  "babi": {
-    "evaluator": "generation",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "babi",
-    "benchmark_type": "other",
-    "explanation": "Text generation evaluation - assesses quality of generated text"
-  },
-  "babilong": {
-    "evaluator": "generation",
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "babilong",
-    "benchmark_type": "other",
-    "explanation": "Text generation evaluation - assesses quality of generated text"
-  },
-  "bangla_mmlu": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "bangla_mmlu",
-    "benchmark_type": "knowledge",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "banking77": {
-    "evaluator": "exact_match",
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "banking77",
-    "benchmark_type": "other",
-    "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
-  },
-  "basque_bench": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "basque_bench",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "basque_glue": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "basque_glue",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "basqueglue": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "basqueglue",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "bbh": {
-    "evaluator": "exact_match",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "bbh",
-    "benchmark_type": "other",
-    "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
-  },
-  "bbq": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "bbq",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "bec2016eu": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "bec2016eu",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "belebele": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "belebele",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "benchmarks": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "benchmarks",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "bertaqa": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "bertaqa",
-    "benchmark_type": "question_answering",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "bhs": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "bhs",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "bhtc": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "bhtc",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "bigbench": {
-    "evaluator": "exact_match",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "bigbench",
-    "benchmark_type": "other",
-    "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
-  },
-  "blimp": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "blimp",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "blimp_nl": {
-    "evaluator": null,
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "blimp_nl",
-    "benchmark_type": "other",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "boolq": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "boolq",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "boolq_seq2seq": {
-    "evaluator": null,
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "boolq_seq2seq",
-    "benchmark_type": "other",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "c4": {
-    "evaluator": null,
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "c4",
-    "benchmark_type": "other",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "cabbq": {
-    "evaluator": null,
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "cabbq",
-    "benchmark_type": "other",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "cabreu": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "cabreu",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "careqa": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "careqa",
-    "benchmark_type": "question_answering",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "catalan_bench": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "catalan_bench",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "catalanqa": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "catalanqa",
-    "benchmark_type": "question_answering",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "catcola": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "catcola",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "cb": {
-    "evaluator": null,
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "cb",
-    "benchmark_type": "other",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "ceval": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "ceval",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "ceval_valid": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "ceval_valid",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "chain": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "chain",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "chain_of_thought": {
-    "evaluator": null,
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "math",
-    "benchmark_type": "other",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "chartqa": {
-    "evaluator": "generation",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "chartqa",
-    "benchmark_type": "question_answering",
-    "explanation": "Text generation evaluation - assesses quality of generated text"
-  },
-  "claim": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "claim",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "click": {
-    "evaluator": null,
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "click",
-    "benchmark_type": "other",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "cmmlu": {
-    "evaluator": null,
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "cmmlu",
-    "benchmark_type": "knowledge",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "cnn": {
-    "evaluator": "exact_match",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "cnn",
-    "benchmark_type": "other",
-    "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
-  },
-  "cocoteros": {
-    "evaluator": "generation",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "cocoteros",
-    "benchmark_type": "other",
-    "explanation": "Text generation evaluation - assesses quality of generated text"
-  },
-  "code2text": {
-    "evaluator": "generation",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "code2text",
-    "benchmark_type": "coding",
-    "explanation": "Text generation evaluation - assesses quality of generated text"
-  },
-  "code_x_glue": {
-    "evaluator": "generation",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "code_x_glue",
-    "benchmark_type": "coding",
-    "explanation": "Text generation evaluation - assesses quality of generated text"
-  },
-  "codexglue": {
-    "evaluator": null,
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "codexglue",
-    "benchmark_type": "coding",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "codexglue_code_to_text_go": {
-    "evaluator": null,
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "codexglue",
-    "benchmark_type": "coding",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "codexglue_code_to_text_java": {
-    "evaluator": null,
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "codexglue",
-    "benchmark_type": "coding",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "codexglue_code_to_text_javascript": {
-    "evaluator": null,
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "codexglue",
-    "benchmark_type": "coding",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "codexglue_code_to_text_php": {
-    "evaluator": null,
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "codexglue",
-    "benchmark_type": "coding",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "codexglue_code_to_text_python": {
-    "evaluator": null,
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "codexglue",
-    "benchmark_type": "coding",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "codexglue_code_to_text_ruby": {
-    "evaluator": null,
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "codexglue",
-    "benchmark_type": "coding",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "coedit": {
-    "evaluator": "generation",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "coedit",
-    "benchmark_type": "other",
-    "explanation": "Text generation evaluation - assesses quality of generated text"
-  },
-  "cola": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "cola",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "commonsense": {
-    "evaluator": null,
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "commonsense",
-    "benchmark_type": "other",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "commonsense_qa": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "commonsense_qa",
-    "benchmark_type": "question_answering",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "conala": {
-    "evaluator": null,
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "conala",
-    "benchmark_type": "coding",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "concode": {
-    "evaluator": null,
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "concode",
-    "benchmark_type": "coding",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "copa": {
-    "evaluator": null,
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "copa",
-    "benchmark_type": "other",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "copal_id": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "copal_id",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "coqa": {
-    "evaluator": null,
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "coqa",
-    "benchmark_type": "question_answering",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "coqcat": {
-    "evaluator": "generation",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "coqcat",
-    "benchmark_type": "other",
-    "explanation": "Text generation evaluation - assesses quality of generated text"
-  },
-  "crows_pairs": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "crows_pairs",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "csatqa": {
-    "evaluator": null,
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "csatqa",
-    "benchmark_type": "question_answering",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "cycle": {
-    "evaluator": null,
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "cycle",
-    "benchmark_type": "other",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "cycle_letters": {
-    "evaluator": "exact_match",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "cycle_letters",
-    "benchmark_type": "other",
-    "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
-  },
-  "darija_bench": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "darija_bench",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "darijahellaswag": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "darijahellaswag",
-    "benchmark_type": "knowledge",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "darijammlu": {
-    "evaluator": null,
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "darijammlu",
-    "benchmark_type": "knowledge",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "dbpedia": {
-    "evaluator": "generation",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "dbpedia",
-    "benchmark_type": "other",
-    "explanation": "Text generation evaluation - assesses quality of generated text"
-  },
-  "discrim_eval": {
-    "evaluator": null,
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "discrim_eval",
-    "benchmark_type": "other",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "doc": {
-    "evaluator": null,
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "doc",
-    "benchmark_type": "other",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "doc_vqa": {
-    "evaluator": "generation",
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "doc_vqa",
-    "benchmark_type": "question_answering",
-    "explanation": "Text generation evaluation - assesses quality of generated text"
-  },
-  "drop": {
-    "evaluator": null,
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "drop",
-    "benchmark_type": "other",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "ds1000": {
-    "evaluator": "exact_match",
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "ds1000",
-    "benchmark_type": "coding",
-    "explanation": "Text comparison (WARNING: should use execution for coding)"
-  },
-  "ds_1000": {
-    "evaluator": null,
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "ds_1000",
-    "benchmark_type": "other",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "egyhellaswag": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "egyhellaswag",
-    "benchmark_type": "knowledge",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "egymmlu": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "egymmlu",
-    "benchmark_type": "knowledge",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "epec": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "epec",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "eq": {
-    "evaluator": null,
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "eq",
-    "benchmark_type": "other",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "eq_bench": {
-    "evaluator": "exact_match",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "eq_bench",
-    "benchmark_type": "other",
-    "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
-  },
-  "eq_bench_ca": {
-    "evaluator": null,
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "eq_bench_ca",
-    "benchmark_type": "other",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "eq_bench_es": {
-    "evaluator": null,
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "eq_bench_es",
-    "benchmark_type": "other",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "esbbq": {
-    "evaluator": null,
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "esbbq",
-    "benchmark_type": "other",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "escola": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "escola",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "ethics": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "ethics",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "ethos": {
-    "evaluator": "generation",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "ethos",
-    "benchmark_type": "other",
-    "explanation": "Text generation evaluation - assesses quality of generated text"
-  },
-  "eus": {
-    "evaluator": null,
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "eus",
-    "benchmark_type": "other",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "eus_exams": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "eus_exams",
-    "benchmark_type": "knowledge",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "eus_proficiency": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "eus_proficiency",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "eus_reading": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "eus_reading",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "eus_trivia": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "eus_trivia",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "evalita_llm": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "evalita_llm",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "evalita_mp": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "evalita_mp",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "evalita_sp": {
-    "evaluator": "generation",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "evalita_sp",
-    "benchmark_type": "other",
-    "explanation": "Text generation evaluation - assesses quality of generated text"
-  },
-  "fda": {
-    "evaluator": "generation",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "fda",
-    "benchmark_type": "other",
-    "explanation": "Text generation evaluation - assesses quality of generated text"
-  },
-  "financial": {
-    "evaluator": null,
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "financial",
-    "benchmark_type": "other",
-    "explanation": "NO EVALUATOR DEFINED - will fail with error"
-  },
-  "financial_tweets": {
-    "evaluator": "generation",
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "financial_tweets",
-    "benchmark_type": "other",
-    "explanation": "Text generation evaluation - assesses quality of generated text"
-  },
-  "flan": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "flan",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
-  },
-  "fld": {
-    "evaluator": "exact_match",
-    "extractor_location": "lm_eval_pairs",
-    "extractor_file": "fld",
-    "benchmark_type": "other",
-    "explanation": "Text comparison - checks if generated answer exactly matches expected answer"
-  },
-  "flores": {
-    "evaluator": "generation",
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "flores",
-    "benchmark_type": "translation",
-    "explanation": "Text generation evaluation - assesses quality of generated text"
-  },
-  "freebase": {
-    "evaluator": "log_likelihoods",
-    "extractor_location": "huggingface_pairs",
-    "extractor_file": "freebase",
-    "benchmark_type": "other",
-    "explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
918
|
-
},
|
|
919
|
-
"french_bench": {
|
|
920
|
-
"evaluator": "log_likelihoods",
|
|
921
|
-
"extractor_location": "lm_eval_pairs",
|
|
922
|
-
"extractor_file": "french_bench",
|
|
923
|
-
"benchmark_type": "other",
|
|
924
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
925
|
-
},
|
|
926
|
-
"galcola": {
|
|
927
|
-
"evaluator": "log_likelihoods",
|
|
928
|
-
"extractor_location": "lm_eval_pairs",
|
|
929
|
-
"extractor_file": "galcola",
|
|
930
|
-
"benchmark_type": "other",
|
|
931
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
932
|
-
},
|
|
933
|
-
"galician_bench": {
|
|
934
|
-
"evaluator": "log_likelihoods",
|
|
935
|
-
"extractor_location": "lm_eval_pairs",
|
|
936
|
-
"extractor_file": "galician_bench",
|
|
937
|
-
"benchmark_type": "other",
|
|
938
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
939
|
-
},
|
|
940
|
-
"gaokao": {
|
|
941
|
-
"evaluator": "log_likelihoods",
|
|
942
|
-
"extractor_location": "lm_eval_pairs",
|
|
943
|
-
"extractor_file": "gaokao",
|
|
944
|
-
"benchmark_type": "other",
|
|
945
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
946
|
-
},
|
|
947
|
-
"glianorex": {
|
|
948
|
-
"evaluator": "log_likelihoods",
|
|
949
|
-
"extractor_location": "huggingface_pairs",
|
|
950
|
-
"extractor_file": "glianorex",
|
|
951
|
-
"benchmark_type": "other",
|
|
952
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
953
|
-
},
|
|
954
|
-
"global_mmlu": {
|
|
955
|
-
"evaluator": "log_likelihoods",
|
|
956
|
-
"extractor_location": "lm_eval_pairs",
|
|
957
|
-
"extractor_file": "global_mmlu",
|
|
958
|
-
"benchmark_type": "knowledge",
|
|
959
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
960
|
-
},
|
|
961
|
-
"global_piqa": {
|
|
962
|
-
"evaluator": null,
|
|
963
|
-
"extractor_location": "lm_eval_pairs",
|
|
964
|
-
"extractor_file": "global_piqa",
|
|
965
|
-
"benchmark_type": "question_answering",
|
|
966
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
967
|
-
},
|
|
968
|
-
"glue": {
|
|
969
|
-
"evaluator": "log_likelihoods",
|
|
970
|
-
"extractor_location": "lm_eval_pairs",
|
|
971
|
-
"extractor_file": "glue",
|
|
972
|
-
"benchmark_type": "other",
|
|
973
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
974
|
-
},
|
|
975
|
-
"gpqa": {
|
|
976
|
-
"evaluator": "log_likelihoods",
|
|
977
|
-
"extractor_location": "lm_eval_pairs",
|
|
978
|
-
"extractor_file": "gpqa",
|
|
979
|
-
"benchmark_type": "question_answering",
|
|
980
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
981
|
-
},
|
|
982
|
-
"gpt3": {
|
|
983
|
-
"evaluator": "log_likelihoods",
|
|
984
|
-
"extractor_location": "lm_eval_pairs",
|
|
985
|
-
"extractor_file": "gpt3",
|
|
986
|
-
"benchmark_type": "other",
|
|
987
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
988
|
-
},
|
|
989
|
-
"groundcocoa": {
|
|
990
|
-
"evaluator": "generation",
|
|
991
|
-
"extractor_location": "lm_eval_pairs",
|
|
992
|
-
"extractor_file": "groundcocoa",
|
|
993
|
-
"benchmark_type": "other",
|
|
994
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
995
|
-
},
|
|
996
|
-
"gsm": {
|
|
997
|
-
"evaluator": "exact_match",
|
|
998
|
-
"extractor_location": "lm_eval_pairs",
|
|
999
|
-
"extractor_file": "gsm",
|
|
1000
|
-
"benchmark_type": "mathematics",
|
|
1001
|
-
"explanation": "Text comparison (WARNING: should use execution for mathematics)"
|
|
1002
|
-
},
|
|
1003
|
-
"gsm8k": {
|
|
1004
|
-
"evaluator": "exact_match",
|
|
1005
|
-
"extractor_location": "lm_eval_pairs",
|
|
1006
|
-
"extractor_file": "gsm8k",
|
|
1007
|
-
"benchmark_type": "mathematics",
|
|
1008
|
-
"explanation": "Text comparison (WARNING: should use execution for mathematics)"
|
|
1009
|
-
},
|
|
1010
|
-
"gsm8k_cot": {
|
|
1011
|
-
"evaluator": null,
|
|
1012
|
-
"extractor_location": "huggingface_pairs",
|
|
1013
|
-
"extractor_file": "math",
|
|
1014
|
-
"benchmark_type": "mathematics",
|
|
1015
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1016
|
-
},
|
|
1017
|
-
"gsm8k_cot_llama": {
|
|
1018
|
-
"evaluator": null,
|
|
1019
|
-
"extractor_location": "huggingface_pairs",
|
|
1020
|
-
"extractor_file": "math",
|
|
1021
|
-
"benchmark_type": "mathematics",
|
|
1022
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1023
|
-
},
|
|
1024
|
-
"gsm8k_cot_self_consistency": {
|
|
1025
|
-
"evaluator": null,
|
|
1026
|
-
"extractor_location": "huggingface_pairs",
|
|
1027
|
-
"extractor_file": "math",
|
|
1028
|
-
"benchmark_type": "mathematics",
|
|
1029
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1030
|
-
},
|
|
1031
|
-
"gsm8k_llama": {
|
|
1032
|
-
"evaluator": null,
|
|
1033
|
-
"extractor_location": "huggingface_pairs",
|
|
1034
|
-
"extractor_file": "math",
|
|
1035
|
-
"benchmark_type": "mathematics",
|
|
1036
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1037
|
-
},
|
|
1038
|
-
"gsm8k_platinum_cot": {
|
|
1039
|
-
"evaluator": null,
|
|
1040
|
-
"extractor_location": "huggingface_pairs",
|
|
1041
|
-
"extractor_file": "math",
|
|
1042
|
-
"benchmark_type": "mathematics",
|
|
1043
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1044
|
-
},
|
|
1045
|
-
"gsm8k_platinum_cot_llama": {
|
|
1046
|
-
"evaluator": null,
|
|
1047
|
-
"extractor_location": "huggingface_pairs",
|
|
1048
|
-
"extractor_file": "math",
|
|
1049
|
-
"benchmark_type": "mathematics",
|
|
1050
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1051
|
-
},
|
|
1052
|
-
"gsm8k_platinum_cot_self_consistency": {
|
|
1053
|
-
"evaluator": null,
|
|
1054
|
-
"extractor_location": "huggingface_pairs",
|
|
1055
|
-
"extractor_file": "math",
|
|
1056
|
-
"benchmark_type": "mathematics",
|
|
1057
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1058
|
-
},
|
|
1059
|
-
"gsm_plus": {
|
|
1060
|
-
"evaluator": "exact_match",
|
|
1061
|
-
"extractor_location": "huggingface_pairs",
|
|
1062
|
-
"extractor_file": "gsm_plus",
|
|
1063
|
-
"benchmark_type": "mathematics",
|
|
1064
|
-
"explanation": "Text comparison (WARNING: should use execution for mathematics)"
|
|
1065
|
-
},
|
|
1066
|
-
"haerae": {
|
|
1067
|
-
"evaluator": null,
|
|
1068
|
-
"extractor_location": "lm_eval_pairs",
|
|
1069
|
-
"extractor_file": "haerae",
|
|
1070
|
-
"benchmark_type": "other",
|
|
1071
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1072
|
-
},
|
|
1073
|
-
"headqa": {
|
|
1074
|
-
"evaluator": null,
|
|
1075
|
-
"extractor_location": "lm_eval_pairs",
|
|
1076
|
-
"extractor_file": "headqa",
|
|
1077
|
-
"benchmark_type": "question_answering",
|
|
1078
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1079
|
-
},
|
|
1080
|
-
"hellaswag": {
|
|
1081
|
-
"evaluator": "log_likelihoods",
|
|
1082
|
-
"extractor_location": "lm_eval_pairs",
|
|
1083
|
-
"extractor_file": "hellaswag",
|
|
1084
|
-
"benchmark_type": "knowledge",
|
|
1085
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1086
|
-
},
|
|
1087
|
-
"hendrycks_ethics": {
|
|
1088
|
-
"evaluator": null,
|
|
1089
|
-
"extractor_location": "lm_eval_pairs",
|
|
1090
|
-
"extractor_file": "hendrycks_ethics",
|
|
1091
|
-
"benchmark_type": "other",
|
|
1092
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1093
|
-
},
|
|
1094
|
-
"hendrycks_math": {
|
|
1095
|
-
"evaluator": null,
|
|
1096
|
-
"extractor_location": "lm_eval_pairs",
|
|
1097
|
-
"extractor_file": "hendrycks_math",
|
|
1098
|
-
"benchmark_type": "mathematics",
|
|
1099
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1100
|
-
},
|
|
1101
|
-
"histoires_morales": {
|
|
1102
|
-
"evaluator": "generation",
|
|
1103
|
-
"extractor_location": "lm_eval_pairs",
|
|
1104
|
-
"extractor_file": "histoires_morales",
|
|
1105
|
-
"benchmark_type": "other",
|
|
1106
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
1107
|
-
},
|
|
1108
|
-
"hle": {
|
|
1109
|
-
"evaluator": null,
|
|
1110
|
-
"extractor_location": "huggingface_pairs",
|
|
1111
|
-
"extractor_file": "hle",
|
|
1112
|
-
"benchmark_type": "other",
|
|
1113
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1114
|
-
},
|
|
1115
|
-
"hle_exact_match": {
|
|
1116
|
-
"evaluator": null,
|
|
1117
|
-
"extractor_location": "huggingface_pairs",
|
|
1118
|
-
"extractor_file": "hle",
|
|
1119
|
-
"benchmark_type": "other",
|
|
1120
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1121
|
-
},
|
|
1122
|
-
"hle_multiple_choice": {
|
|
1123
|
-
"evaluator": null,
|
|
1124
|
-
"extractor_location": "huggingface_pairs",
|
|
1125
|
-
"extractor_file": "hle",
|
|
1126
|
-
"benchmark_type": "other",
|
|
1127
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1128
|
-
},
|
|
1129
|
-
"hmmt": {
|
|
1130
|
-
"evaluator": null,
|
|
1131
|
-
"extractor_location": "huggingface_pairs",
|
|
1132
|
-
"extractor_file": "hmmt",
|
|
1133
|
-
"benchmark_type": "mathematics",
|
|
1134
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1135
|
-
},
|
|
1136
|
-
"hmmt_feb_2025": {
|
|
1137
|
-
"evaluator": null,
|
|
1138
|
-
"extractor_location": "huggingface_pairs",
|
|
1139
|
-
"extractor_file": "hmmt",
|
|
1140
|
-
"benchmark_type": "mathematics",
|
|
1141
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1142
|
-
},
|
|
1143
|
-
"hrm8k": {
|
|
1144
|
-
"evaluator": "exact_match",
|
|
1145
|
-
"extractor_location": "lm_eval_pairs",
|
|
1146
|
-
"extractor_file": "hrm8k",
|
|
1147
|
-
"benchmark_type": "other",
|
|
1148
|
-
"explanation": "Text comparison - checks if generated answer exactly matches expected answer"
|
|
1149
|
-
},
|
|
1150
|
-
"humaneval": {
|
|
1151
|
-
"evaluator": null,
|
|
1152
|
-
"extractor_location": "huggingface_pairs",
|
|
1153
|
-
"extractor_file": "humaneval",
|
|
1154
|
-
"benchmark_type": "coding",
|
|
1155
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1156
|
-
},
|
|
1157
|
-
"humaneval_64_instruct": {
|
|
1158
|
-
"evaluator": null,
|
|
1159
|
-
"extractor_location": "huggingface_pairs",
|
|
1160
|
-
"extractor_file": "instructhumaneval",
|
|
1161
|
-
"benchmark_type": "coding",
|
|
1162
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1163
|
-
},
|
|
1164
|
-
"humaneval_infilling": {
|
|
1165
|
-
"evaluator": null,
|
|
1166
|
-
"extractor_location": "lm_eval_pairs",
|
|
1167
|
-
"extractor_file": "humaneval_infilling",
|
|
1168
|
-
"benchmark_type": "coding",
|
|
1169
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1170
|
-
},
|
|
1171
|
-
"humaneval_instruct": {
|
|
1172
|
-
"evaluator": null,
|
|
1173
|
-
"extractor_location": "huggingface_pairs",
|
|
1174
|
-
"extractor_file": "instructhumaneval",
|
|
1175
|
-
"benchmark_type": "coding",
|
|
1176
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1177
|
-
},
|
|
1178
|
-
"humaneval_plus": {
|
|
1179
|
-
"evaluator": null,
|
|
1180
|
-
"extractor_location": "huggingface_pairs",
|
|
1181
|
-
"extractor_file": "humaneval",
|
|
1182
|
-
"benchmark_type": "coding",
|
|
1183
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1184
|
-
},
|
|
1185
|
-
"humanevalpack": {
|
|
1186
|
-
"evaluator": "exact_match",
|
|
1187
|
-
"extractor_location": "huggingface_pairs",
|
|
1188
|
-
"extractor_file": "humanevalpack",
|
|
1189
|
-
"benchmark_type": "coding",
|
|
1190
|
-
"explanation": "Text comparison (WARNING: should use execution for coding)"
|
|
1191
|
-
},
|
|
1192
|
-
"icelandic_winogrande": {
|
|
1193
|
-
"evaluator": null,
|
|
1194
|
-
"extractor_location": "lm_eval_pairs",
|
|
1195
|
-
"extractor_file": "icelandic_winogrande",
|
|
1196
|
-
"benchmark_type": "other",
|
|
1197
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1198
|
-
},
|
|
1199
|
-
"ifeval": {
|
|
1200
|
-
"evaluator": "exact_match",
|
|
1201
|
-
"extractor_location": "lm_eval_pairs",
|
|
1202
|
-
"extractor_file": "ifeval",
|
|
1203
|
-
"benchmark_type": "other",
|
|
1204
|
-
"explanation": "Text comparison - checks if generated answer exactly matches expected answer"
|
|
1205
|
-
},
|
|
1206
|
-
"instruct_humaneval": {
|
|
1207
|
-
"evaluator": null,
|
|
1208
|
-
"extractor_location": "huggingface_pairs",
|
|
1209
|
-
"extractor_file": "instructhumaneval",
|
|
1210
|
-
"benchmark_type": "coding",
|
|
1211
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1212
|
-
},
|
|
1213
|
-
"instructhumaneval": {
|
|
1214
|
-
"evaluator": null,
|
|
1215
|
-
"extractor_location": "huggingface_pairs",
|
|
1216
|
-
"extractor_file": "instructhumaneval",
|
|
1217
|
-
"benchmark_type": "coding",
|
|
1218
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1219
|
-
},
|
|
1220
|
-
"inverse": {
|
|
1221
|
-
"evaluator": null,
|
|
1222
|
-
"extractor_location": "lm_eval_pairs",
|
|
1223
|
-
"extractor_file": "inverse",
|
|
1224
|
-
"benchmark_type": "other",
|
|
1225
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1226
|
-
},
|
|
1227
|
-
"inverse_scaling": {
|
|
1228
|
-
"evaluator": "log_likelihoods",
|
|
1229
|
-
"extractor_location": "lm_eval_pairs",
|
|
1230
|
-
"extractor_file": "inverse_scaling",
|
|
1231
|
-
"benchmark_type": "other",
|
|
1232
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1233
|
-
},
|
|
1234
|
-
"iwslt2017": {
|
|
1235
|
-
"evaluator": "generation",
|
|
1236
|
-
"extractor_location": "lm_eval_pairs",
|
|
1237
|
-
"extractor_file": "iwslt2017",
|
|
1238
|
-
"benchmark_type": "translation",
|
|
1239
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
1240
|
-
},
|
|
1241
|
-
"iwslt2017_ar_en": {
|
|
1242
|
-
"evaluator": "generation",
|
|
1243
|
-
"extractor_location": "huggingface_pairs",
|
|
1244
|
-
"extractor_file": "iwslt2017_ar_en",
|
|
1245
|
-
"benchmark_type": "translation",
|
|
1246
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
1247
|
-
},
|
|
1248
|
-
"iwslt2017_en_ar": {
|
|
1249
|
-
"evaluator": "generation",
|
|
1250
|
-
"extractor_location": "huggingface_pairs",
|
|
1251
|
-
"extractor_file": "iwslt2017_en_ar",
|
|
1252
|
-
"benchmark_type": "translation",
|
|
1253
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
1254
|
-
},
|
|
1255
|
-
"ja": {
|
|
1256
|
-
"evaluator": null,
|
|
1257
|
-
"extractor_location": "lm_eval_pairs",
|
|
1258
|
-
"extractor_file": "ja",
|
|
1259
|
-
"benchmark_type": "other",
|
|
1260
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1261
|
-
},
|
|
1262
|
-
"japanese_leaderboard": {
|
|
1263
|
-
"evaluator": "log_likelihoods",
|
|
1264
|
-
"extractor_location": "lm_eval_pairs",
|
|
1265
|
-
"extractor_file": "japanese_leaderboard",
|
|
1266
|
-
"benchmark_type": "other",
|
|
1267
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1268
|
-
},
|
|
1269
|
-
"jsonschema_bench": {
|
|
1270
|
-
"evaluator": "generation",
|
|
1271
|
-
"extractor_location": "lm_eval_pairs",
|
|
1272
|
-
"extractor_file": "jsonschema_bench",
|
|
1273
|
-
"benchmark_type": "other",
|
|
1274
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
1275
|
-
},
|
|
1276
|
-
"kbl": {
|
|
1277
|
-
"evaluator": "log_likelihoods",
|
|
1278
|
-
"extractor_location": "lm_eval_pairs",
|
|
1279
|
-
"extractor_file": "kbl",
|
|
1280
|
-
"benchmark_type": "other",
|
|
1281
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1282
|
-
},
|
|
1283
|
-
"kmmlu": {
|
|
1284
|
-
"evaluator": "log_likelihoods",
|
|
1285
|
-
"extractor_location": "lm_eval_pairs",
|
|
1286
|
-
"extractor_file": "kmmlu",
|
|
1287
|
-
"benchmark_type": "knowledge",
|
|
1288
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1289
|
-
},
|
|
1290
|
-
"kobest": {
|
|
1291
|
-
"evaluator": null,
|
|
1292
|
-
"extractor_location": "lm_eval_pairs",
|
|
1293
|
-
"extractor_file": "kobest",
|
|
1294
|
-
"benchmark_type": "other",
|
|
1295
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1296
|
-
},
|
|
1297
|
-
"kormedmcqa": {
|
|
1298
|
-
"evaluator": "generation",
|
|
1299
|
-
"extractor_location": "lm_eval_pairs",
|
|
1300
|
-
"extractor_file": "kormedmcqa",
|
|
1301
|
-
"benchmark_type": "question_answering",
|
|
1302
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
1303
|
-
},
|
|
1304
|
-
"lambada": {
|
|
1305
|
-
"evaluator": "exact_match",
|
|
1306
|
-
"extractor_location": "lm_eval_pairs",
|
|
1307
|
-
"extractor_file": "lambada",
|
|
1308
|
-
"benchmark_type": "other",
|
|
1309
|
-
"explanation": "Text comparison - checks if generated answer exactly matches expected answer"
|
|
1310
|
-
},
|
|
1311
|
-
"lambada_cloze": {
|
|
1312
|
-
"evaluator": null,
|
|
1313
|
-
"extractor_location": "lm_eval_pairs",
|
|
1314
|
-
"extractor_file": "lambada_cloze",
|
|
1315
|
-
"benchmark_type": "other",
|
|
1316
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1317
|
-
},
|
|
1318
|
-
"lambada_multilingual": {
|
|
1319
|
-
"evaluator": null,
|
|
1320
|
-
"extractor_location": "lm_eval_pairs",
|
|
1321
|
-
"extractor_file": "lambada_multilingual",
|
|
1322
|
-
"benchmark_type": "other",
|
|
1323
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1324
|
-
},
|
|
1325
|
-
"lambada_multilingual_stablelm": {
|
|
1326
|
-
"evaluator": "log_likelihoods",
|
|
1327
|
-
"extractor_location": "lm_eval_pairs",
|
|
1328
|
-
"extractor_file": "lambada_multilingual_stablelm",
|
|
1329
|
-
"benchmark_type": "other",
|
|
1330
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1331
|
-
},
|
|
1332
|
-
"law": {
|
|
1333
|
-
"evaluator": null,
|
|
1334
|
-
"extractor_location": "lm_eval_pairs",
|
|
1335
|
-
"extractor_file": "law",
|
|
1336
|
-
"benchmark_type": "other",
|
|
1337
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1338
|
-
},
|
|
1339
|
-
"law_stack_exchange": {
|
|
1340
|
-
"evaluator": "generation",
|
|
1341
|
-
"extractor_location": "huggingface_pairs",
|
|
1342
|
-
"extractor_file": "law_stack_exchange",
|
|
1343
|
-
"benchmark_type": "other",
|
|
1344
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
1345
|
-
},
|
|
1346
|
-
"leaderboard": {
|
|
1347
|
-
"evaluator": null,
|
|
1348
|
-
"extractor_location": "lm_eval_pairs",
|
|
1349
|
-
"extractor_file": "leaderboard",
|
|
1350
|
-
"benchmark_type": "other",
|
|
1351
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1352
|
-
},
|
|
1353
|
-
"ledgar": {
|
|
1354
|
-
"evaluator": null,
|
|
1355
|
-
"extractor_location": "huggingface_pairs",
|
|
1356
|
-
"extractor_file": "ledgar",
|
|
1357
|
-
"benchmark_type": "other",
|
|
1358
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1359
|
-
},
|
|
1360
|
-
"libra": {
|
|
1361
|
-
"evaluator": "generation",
|
|
1362
|
-
"extractor_location": "lm_eval_pairs",
|
|
1363
|
-
"extractor_file": "libra",
|
|
1364
|
-
"benchmark_type": "other",
|
|
1365
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
1366
|
-
},
|
|
1367
|
-
"lingoly": {
|
|
1368
|
-
"evaluator": "log_likelihoods",
|
|
1369
|
-
"extractor_location": "lm_eval_pairs",
|
|
1370
|
-
"extractor_file": "lingoly",
|
|
1371
|
-
"benchmark_type": "other",
|
|
1372
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1373
|
-
},
|
|
1374
|
-
"livecodebench": {
|
|
1375
|
-
"evaluator": null,
|
|
1376
|
-
"extractor_location": "huggingface_pairs",
|
|
1377
|
-
"extractor_file": "livecodebench",
|
|
1378
|
-
"benchmark_type": "coding",
|
|
1379
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1380
|
-
},
|
|
1381
|
-
"livemathbench": {
|
|
1382
|
-
"evaluator": null,
|
|
1383
|
-
"extractor_location": "huggingface_pairs",
|
|
1384
|
-
"extractor_file": "livemathbench",
|
|
1385
|
-
"benchmark_type": "mathematics",
|
|
1386
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1387
|
-
},
|
|
1388
|
-
"livemathbench_cnmo_en": {
|
|
1389
|
-
"evaluator": null,
|
|
1390
|
-
"extractor_location": "huggingface_pairs",
|
|
1391
|
-
"extractor_file": "livemathbench_configs",
|
|
1392
|
-
"benchmark_type": "mathematics",
|
|
1393
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1394
|
-
},
|
|
1395
|
-
"livemathbench_cnmo_zh": {
|
|
1396
|
-
"evaluator": null,
|
|
1397
|
-
"extractor_location": "huggingface_pairs",
|
|
1398
|
-
"extractor_file": "livemathbench_configs",
|
|
1399
|
-
"benchmark_type": "mathematics",
|
|
1400
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1401
|
-
},
|
|
1402
|
-
"llama": {
|
|
1403
|
-
"evaluator": null,
|
|
1404
|
-
"extractor_location": "huggingface_pairs",
|
|
1405
|
-
"extractor_file": "llama",
|
|
1406
|
-
"benchmark_type": "other",
|
|
1407
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1408
|
-
},
|
|
1409
|
-
"llama3": {
|
|
1410
|
-
"evaluator": null,
|
|
1411
|
-
"extractor_location": "lm_eval_pairs",
|
|
1412
|
-
"extractor_file": "llama3",
|
|
1413
|
-
"benchmark_type": "other",
|
|
1414
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1415
|
-
},
|
|
1416
|
-
"lm_syneval": {
|
|
1417
|
-
"evaluator": null,
|
|
1418
|
-
"extractor_location": "lm_eval_pairs",
|
|
1419
|
-
"extractor_file": "lm_syneval",
|
|
1420
|
-
"benchmark_type": "other",
|
|
1421
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1422
|
-
},
|
|
1423
|
-
"logieval": {
|
|
1424
|
-
"evaluator": null,
|
|
1425
|
-
"extractor_location": "huggingface_pairs",
|
|
1426
|
-
"extractor_file": "logieval",
|
|
1427
|
-
"benchmark_type": "other",
|
|
1428
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1429
|
-
},
|
|
1430
|
-
"logiqa": {
|
|
1431
|
-
"evaluator": null,
|
|
1432
|
-
"extractor_location": "lm_eval_pairs",
|
|
1433
|
-
"extractor_file": "logiqa",
|
|
1434
|
-
"benchmark_type": "question_answering",
|
|
1435
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1436
|
-
},
|
|
1437
|
-
"logiqa2": {
|
|
1438
|
-
"evaluator": null,
|
|
1439
|
-
"extractor_location": "lm_eval_pairs",
|
|
1440
|
-
"extractor_file": "logiqa2",
|
|
1441
|
-
"benchmark_type": "question_answering",
|
|
1442
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1443
|
-
},
|
|
1444
|
-
"longbench": {
|
|
1445
|
-
"evaluator": null,
|
|
1446
|
-
"extractor_location": "lm_eval_pairs",
|
|
1447
|
-
"extractor_file": "longbench",
|
|
1448
|
-
"benchmark_type": "other",
|
|
1449
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1450
|
-
},
|
|
1451
|
-
"longbenchv2": {
|
|
1452
|
-
"evaluator": null,
|
|
1453
|
-
"extractor_location": "lm_eval_pairs",
|
|
1454
|
-
"extractor_file": "longbenchv2",
|
|
1455
|
-
"benchmark_type": "other",
|
|
1456
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1457
|
-
},
|
|
1458
|
-
"m_mmlu": {
|
|
1459
|
-
"evaluator": null,
|
|
1460
|
-
"extractor_location": "huggingface_pairs",
|
|
1461
|
-
"extractor_file": "m_mmlu",
|
|
1462
|
-
"benchmark_type": "knowledge",
|
|
1463
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1464
|
-
},
|
|
1465
|
-
"mastermind": {
|
|
1466
|
-
"evaluator": "log_likelihoods",
|
|
1467
|
-
"extractor_location": "lm_eval_pairs",
|
|
1468
|
-
"extractor_file": "mastermind",
|
|
1469
|
-
"benchmark_type": "other",
|
|
1470
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1471
|
-
},
|
|
1472
|
-
"math": {
|
|
1473
|
-
"evaluator": null,
|
|
1474
|
-
"extractor_location": "huggingface_pairs",
|
|
1475
|
-
"extractor_file": "math",
|
|
1476
|
-
"benchmark_type": "mathematics",
|
|
1477
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1478
|
-
},
|
|
1479
|
-
"math500": {
|
|
1480
|
-
"evaluator": null,
|
|
1481
|
-
"extractor_location": "huggingface_pairs",
|
|
1482
|
-
"extractor_file": "math",
|
|
1483
|
-
"benchmark_type": "mathematics",
|
|
1484
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1485
|
-
},
|
|
1486
|
-
"math_500": {
|
|
1487
|
-
"evaluator": null,
|
|
1488
|
-
"extractor_location": "huggingface_pairs",
|
|
1489
|
-
"extractor_file": "math",
|
|
1490
|
-
"benchmark_type": "mathematics",
|
|
1491
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1492
|
-
},
|
|
1493
|
-
"mathqa": {
|
|
1494
|
-
"evaluator": null,
|
|
1495
|
-
"extractor_location": "lm_eval_pairs",
|
|
1496
|
-
"extractor_file": "mathqa",
|
|
1497
|
-
"benchmark_type": "mathematics",
|
|
1498
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1499
|
-
},
|
|
1500
|
-
"mbpp": {
|
|
1501
|
-
"evaluator": null,
|
|
1502
|
-
"extractor_location": "huggingface_pairs",
|
|
1503
|
-
"extractor_file": "mbpp",
|
|
1504
|
-
"benchmark_type": "coding",
|
|
1505
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1506
|
-
},
|
|
1507
|
-
"mbpp_plus": {
|
|
1508
|
-
"evaluator": null,
|
|
1509
|
-
"extractor_location": "huggingface_pairs",
|
|
1510
|
-
"extractor_file": "mbpp",
|
|
1511
|
-
"benchmark_type": "coding",
|
|
1512
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1513
|
-
},
|
|
1514
|
-
"mc-taco": {
|
|
1515
|
-
"evaluator": null,
|
|
1516
|
-
"extractor_location": "lm_eval_pairs",
|
|
1517
|
-
"extractor_file": "mc-taco",
|
|
1518
|
-
"benchmark_type": "other",
|
|
1519
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1520
|
-
},
|
|
1521
|
-
"med_concepts_qa": {
|
|
1522
|
-
"evaluator": "log_likelihoods",
|
|
1523
|
-
"extractor_location": "lm_eval_pairs",
|
|
1524
|
-
"extractor_file": "med_concepts_qa",
|
|
1525
|
-
"benchmark_type": "question_answering",
|
|
1526
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1527
|
-
},
|
|
1528
|
-
"meddialog": {
|
|
1529
|
-
"evaluator": null,
|
|
1530
|
-
"extractor_location": "huggingface_pairs",
|
|
1531
|
-
"extractor_file": "meddialog",
|
|
1532
|
-
"benchmark_type": "other",
|
|
1533
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1534
|
-
},
|
|
1535
|
-
"meddialog_qsumm": {
|
|
1536
|
-
"evaluator": null,
|
|
1537
|
-
"extractor_location": "huggingface_pairs",
|
|
1538
|
-
"extractor_file": "meddialog",
|
|
1539
|
-
"benchmark_type": "other",
|
|
1540
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1541
|
-
},
|
|
1542
|
-
"meddialog_qsumm_perplexity": {
|
|
1543
|
-
"evaluator": null,
|
|
1544
|
-
"extractor_location": "huggingface_pairs",
|
|
1545
|
-
"extractor_file": "meddialog",
|
|
1546
|
-
"benchmark_type": "other",
|
|
1547
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1548
|
-
},
|
|
1549
|
-
"meddialog_raw_dialogues": {
|
|
1550
|
-
"evaluator": null,
|
|
1551
|
-
"extractor_location": "huggingface_pairs",
|
|
1552
|
-
"extractor_file": "meddialog",
|
|
1553
|
-
"benchmark_type": "other",
|
|
1554
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1555
|
-
},
|
|
1556
|
-
"meddialog_raw_perplexity": {
|
|
1557
|
-
"evaluator": null,
|
|
1558
|
-
"extractor_location": "huggingface_pairs",
|
|
1559
|
-
"extractor_file": "meddialog",
|
|
1560
|
-
"benchmark_type": "other",
|
|
1561
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1562
|
-
},
|
|
1563
|
-
"medical": {
|
|
1564
|
-
"evaluator": null,
|
|
1565
|
-
"extractor_location": "lm_eval_pairs",
|
|
1566
|
-
"extractor_file": "medical",
|
|
1567
|
-
"benchmark_type": "other",
|
|
1568
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1569
|
-
},
|
|
1570
|
-
"medical_abstracts": {
|
|
1571
|
-
"evaluator": "generation",
|
|
1572
|
-
"extractor_location": "huggingface_pairs",
|
|
1573
|
-
"extractor_file": "medical_abstracts",
|
|
1574
|
-
"benchmark_type": "other",
|
|
1575
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
1576
|
-
},
|
|
1577
|
-
"mediqa_qa2019": {
|
|
1578
|
-
"evaluator": "generation",
|
|
1579
|
-
"extractor_location": "lm_eval_pairs",
|
|
1580
|
-
"extractor_file": "mediqa_qa2019",
|
|
1581
|
-
"benchmark_type": "question_answering",
|
|
1582
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
1583
|
-
},
|
|
1584
|
-
"medmcqa": {
|
|
1585
|
-
"evaluator": "log_likelihoods",
|
|
1586
|
-
"extractor_location": "lm_eval_pairs",
|
|
1587
|
-
"extractor_file": "medmcqa",
|
|
1588
|
-
"benchmark_type": "question_answering",
|
|
1589
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1590
|
-
},
|
|
1591
|
-
"medqa": {
|
|
1592
|
-
"evaluator": null,
|
|
1593
|
-
"extractor_location": "lm_eval_pairs",
|
|
1594
|
-
"extractor_file": "medqa",
|
|
1595
|
-
"benchmark_type": "question_answering",
|
|
1596
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1597
|
-
},
|
|
1598
|
-
"medtext": {
|
|
1599
|
-
"evaluator": "generation",
|
|
1600
|
-
"extractor_location": "lm_eval_pairs",
|
|
1601
|
-
"extractor_file": "medtext",
|
|
1602
|
-
"benchmark_type": "other",
|
|
1603
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
1604
|
-
},
|
|
1605
|
-
"mela": {
|
|
1606
|
-
"evaluator": "log_likelihoods",
|
|
1607
|
-
"extractor_location": "huggingface_pairs",
|
|
1608
|
-
"extractor_file": "mela",
|
|
1609
|
-
"benchmark_type": "other",
|
|
1610
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1611
|
-
},
|
|
1612
|
-
"meqsum": {
|
|
1613
|
-
"evaluator": "generation",
|
|
1614
|
-
"extractor_location": "lm_eval_pairs",
|
|
1615
|
-
"extractor_file": "meqsum",
|
|
1616
|
-
"benchmark_type": "other",
|
|
1617
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
1618
|
-
},
|
|
1619
|
-
"mercury": {
|
|
1620
|
-
"evaluator": null,
|
|
1621
|
-
"extractor_location": "huggingface_pairs",
|
|
1622
|
-
"extractor_file": "mercury",
|
|
1623
|
-
"benchmark_type": "other",
|
|
1624
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1625
|
-
},
|
|
1626
|
-
"metabench": {
|
|
1627
|
-
"evaluator": null,
|
|
1628
|
-
"extractor_location": "lm_eval_pairs",
|
|
1629
|
-
"extractor_file": "metabench",
|
|
1630
|
-
"benchmark_type": "other",
|
|
1631
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1632
|
-
},
|
|
1633
|
-
"mgsm": {
|
|
1634
|
-
"evaluator": "generation",
|
|
1635
|
-
"extractor_location": "lm_eval_pairs",
|
|
1636
|
-
"extractor_file": "mgsm",
|
|
1637
|
-
"benchmark_type": "mathematics",
|
|
1638
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
1639
|
-
},
|
|
1640
|
-
"mimic_repsum": {
|
|
1641
|
-
"evaluator": "generation",
|
|
1642
|
-
"extractor_location": "lm_eval_pairs",
|
|
1643
|
-
"extractor_file": "mimic_repsum",
|
|
1644
|
-
"benchmark_type": "other",
|
|
1645
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
1646
|
-
},
|
|
1647
|
-
"minerva_math": {
|
|
1648
|
-
"evaluator": "generation",
|
|
1649
|
-
"extractor_location": "lm_eval_pairs",
|
|
1650
|
-
"extractor_file": "minerva_math",
|
|
1651
|
-
"benchmark_type": "mathematics",
|
|
1652
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
1653
|
-
},
|
|
1654
|
-
"mlqa": {
|
|
1655
|
-
"evaluator": "generation",
|
|
1656
|
-
"extractor_location": "lm_eval_pairs",
|
|
1657
|
-
"extractor_file": "mlqa",
|
|
1658
|
-
"benchmark_type": "question_answering",
|
|
1659
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
1660
|
-
},
|
|
1661
|
-
"mmlu": {
|
|
1662
|
-
"evaluator": "log_likelihoods",
|
|
1663
|
-
"extractor_location": "lm_eval_pairs",
|
|
1664
|
-
"extractor_file": "mmlu",
|
|
1665
|
-
"benchmark_type": "knowledge",
|
|
1666
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1667
|
-
},
|
|
1668
|
-
"mmlu_pro": {
|
|
1669
|
-
"evaluator": null,
|
|
1670
|
-
"extractor_location": "lm_eval_pairs",
|
|
1671
|
-
"extractor_file": "mmlu_pro",
|
|
1672
|
-
"benchmark_type": "knowledge",
|
|
1673
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1674
|
-
},
|
|
1675
|
-
"mmlusr": {
|
|
1676
|
-
"evaluator": null,
|
|
1677
|
-
"extractor_location": "huggingface_pairs",
|
|
1678
|
-
"extractor_file": "mmlusr",
|
|
1679
|
-
"benchmark_type": "knowledge",
|
|
1680
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1681
|
-
},
|
|
1682
|
-
"mmlusr_answer_only": {
|
|
1683
|
-
"evaluator": null,
|
|
1684
|
-
"extractor_location": "huggingface_pairs",
|
|
1685
|
-
"extractor_file": "mmlusr",
|
|
1686
|
-
"benchmark_type": "knowledge",
|
|
1687
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1688
|
-
},
|
|
1689
|
-
"mmlusr_question_and_answer": {
|
|
1690
|
-
"evaluator": null,
|
|
1691
|
-
"extractor_location": "huggingface_pairs",
|
|
1692
|
-
"extractor_file": "mmlusr",
|
|
1693
|
-
"benchmark_type": "knowledge",
|
|
1694
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1695
|
-
},
|
|
1696
|
-
"mmlusr_question_only": {
|
|
1697
|
-
"evaluator": null,
|
|
1698
|
-
"extractor_location": "huggingface_pairs",
|
|
1699
|
-
"extractor_file": "mmlusr",
|
|
1700
|
-
"benchmark_type": "knowledge",
|
|
1701
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1702
|
-
},
|
|
1703
|
-
"mmmu": {
|
|
1704
|
-
"evaluator": "log_likelihoods",
|
|
1705
|
-
"extractor_location": "lm_eval_pairs",
|
|
1706
|
-
"extractor_file": "mmmu",
|
|
1707
|
-
"benchmark_type": "other",
|
|
1708
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1709
|
-
},
|
|
1710
|
-
"mnli": {
|
|
1711
|
-
"evaluator": "log_likelihoods",
|
|
1712
|
-
"extractor_location": "lm_eval_pairs",
|
|
1713
|
-
"extractor_file": "mnli",
|
|
1714
|
-
"benchmark_type": "other",
|
|
1715
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1716
|
-
},
|
|
1717
|
-
"model_written_evals": {
|
|
1718
|
-
"evaluator": "log_likelihoods",
|
|
1719
|
-
"extractor_location": "lm_eval_pairs",
|
|
1720
|
-
"extractor_file": "model_written_evals",
|
|
1721
|
-
"benchmark_type": "other",
|
|
1722
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1723
|
-
},
|
|
1724
|
-
"moral_stories": {
|
|
1725
|
-
"evaluator": "log_likelihoods",
|
|
1726
|
-
"extractor_location": "lm_eval_pairs",
|
|
1727
|
-
"extractor_file": "moral_stories",
|
|
1728
|
-
"benchmark_type": "other",
|
|
1729
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1730
|
-
},
|
|
1731
|
-
"mrpc": {
|
|
1732
|
-
"evaluator": null,
|
|
1733
|
-
"extractor_location": "lm_eval_pairs",
|
|
1734
|
-
"extractor_file": "mrpc",
|
|
1735
|
-
"benchmark_type": "other",
|
|
1736
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1737
|
-
},
|
|
1738
|
-
"mts_dialog": {
|
|
1739
|
-
"evaluator": "generation",
|
|
1740
|
-
"extractor_location": "lm_eval_pairs",
|
|
1741
|
-
"extractor_file": "mts_dialog",
|
|
1742
|
-
"benchmark_type": "other",
|
|
1743
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
1744
|
-
},
|
|
1745
|
-
"multiblimp": {
|
|
1746
|
-
"evaluator": "log_likelihoods",
|
|
1747
|
-
"extractor_location": "lm_eval_pairs",
|
|
1748
|
-
"extractor_file": "multiblimp",
|
|
1749
|
-
"benchmark_type": "other",
|
|
1750
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1751
|
-
},
|
|
1752
|
-
"multilingual": {
|
|
1753
|
-
"evaluator": null,
|
|
1754
|
-
"extractor_location": "lm_eval_pairs",
|
|
1755
|
-
"extractor_file": "multilingual",
|
|
1756
|
-
"benchmark_type": "other",
|
|
1757
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1758
|
-
},
|
|
1759
|
-
"multimedqa": {
|
|
1760
|
-
"evaluator": "log_likelihoods",
|
|
1761
|
-
"extractor_location": "huggingface_pairs",
|
|
1762
|
-
"extractor_file": "multimedqa",
|
|
1763
|
-
"benchmark_type": "question_answering",
|
|
1764
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1765
|
-
},
|
|
1766
|
-
"multipl_e": {
|
|
1767
|
-
"evaluator": null,
|
|
1768
|
-
"extractor_location": "huggingface_pairs",
|
|
1769
|
-
"extractor_file": "multipl_e",
|
|
1770
|
-
"benchmark_type": "other",
|
|
1771
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1772
|
-
},
|
|
1773
|
-
"multiple": {
|
|
1774
|
-
"evaluator": null,
|
|
1775
|
-
"extractor_location": "huggingface_pairs",
|
|
1776
|
-
"extractor_file": "multiple",
|
|
1777
|
-
"benchmark_type": "other",
|
|
1778
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1779
|
-
},
|
|
1780
|
-
"multiple_cpp": {
|
|
1781
|
-
"evaluator": null,
|
|
1782
|
-
"extractor_location": "huggingface_pairs",
|
|
1783
|
-
"extractor_file": "multipl_e",
|
|
1784
|
-
"benchmark_type": "other",
|
|
1785
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1786
|
-
},
|
|
1787
|
-
"multiple_go": {
|
|
1788
|
-
"evaluator": null,
|
|
1789
|
-
"extractor_location": "huggingface_pairs",
|
|
1790
|
-
"extractor_file": "multipl_e",
|
|
1791
|
-
"benchmark_type": "other",
|
|
1792
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1793
|
-
},
|
|
1794
|
-
"multiple_java": {
|
|
1795
|
-
"evaluator": null,
|
|
1796
|
-
"extractor_location": "huggingface_pairs",
|
|
1797
|
-
"extractor_file": "multipl_e",
|
|
1798
|
-
"benchmark_type": "other",
|
|
1799
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1800
|
-
},
|
|
1801
|
-
"multiple_js": {
|
|
1802
|
-
"evaluator": null,
|
|
1803
|
-
"extractor_location": "huggingface_pairs",
|
|
1804
|
-
"extractor_file": "multipl_e",
|
|
1805
|
-
"benchmark_type": "other",
|
|
1806
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1807
|
-
},
|
|
1808
|
-
"multiple_py": {
|
|
1809
|
-
"evaluator": null,
|
|
1810
|
-
"extractor_location": "huggingface_pairs",
|
|
1811
|
-
"extractor_file": "multipl_e",
|
|
1812
|
-
"benchmark_type": "other",
|
|
1813
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1814
|
-
},
|
|
1815
|
-
"multiple_rs": {
|
|
1816
|
-
"evaluator": null,
|
|
1817
|
-
"extractor_location": "huggingface_pairs",
|
|
1818
|
-
"extractor_file": "multipl_e",
|
|
1819
|
-
"benchmark_type": "other",
|
|
1820
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1821
|
-
},
|
|
1822
|
-
"multirc": {
|
|
1823
|
-
"evaluator": null,
|
|
1824
|
-
"extractor_location": "lm_eval_pairs",
|
|
1825
|
-
"extractor_file": "multirc",
|
|
1826
|
-
"benchmark_type": "other",
|
|
1827
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1828
|
-
},
|
|
1829
|
-
"mutual": {
|
|
1830
|
-
"evaluator": null,
|
|
1831
|
-
"extractor_location": "lm_eval_pairs",
|
|
1832
|
-
"extractor_file": "mutual",
|
|
1833
|
-
"benchmark_type": "other",
|
|
1834
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1835
|
-
},
|
|
1836
|
-
"non": {
|
|
1837
|
-
"evaluator": null,
|
|
1838
|
-
"extractor_location": "lm_eval_pairs",
|
|
1839
|
-
"extractor_file": "non",
|
|
1840
|
-
"benchmark_type": "other",
|
|
1841
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1842
|
-
},
|
|
1843
|
-
"noreval": {
|
|
1844
|
-
"evaluator": "log_likelihoods",
|
|
1845
|
-
"extractor_location": "lm_eval_pairs",
|
|
1846
|
-
"extractor_file": "noreval",
|
|
1847
|
-
"benchmark_type": "other",
|
|
1848
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1849
|
-
},
|
|
1850
|
-
"noreval_gen": {
|
|
1851
|
-
"evaluator": "generation",
|
|
1852
|
-
"extractor_location": "lm_eval_pairs",
|
|
1853
|
-
"extractor_file": "noreval_gen",
|
|
1854
|
-
"benchmark_type": "other",
|
|
1855
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
1856
|
-
},
|
|
1857
|
-
"noreval_mc": {
|
|
1858
|
-
"evaluator": "log_likelihoods",
|
|
1859
|
-
"extractor_location": "lm_eval_pairs",
|
|
1860
|
-
"extractor_file": "noreval_mc",
|
|
1861
|
-
"benchmark_type": "other",
|
|
1862
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1863
|
-
},
|
|
1864
|
-
"noticia": {
|
|
1865
|
-
"evaluator": "generation",
|
|
1866
|
-
"extractor_location": "huggingface_pairs",
|
|
1867
|
-
"extractor_file": "noticia",
|
|
1868
|
-
"benchmark_type": "other",
|
|
1869
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
1870
|
-
},
|
|
1871
|
-
"nq_open": {
|
|
1872
|
-
"evaluator": "generation",
|
|
1873
|
-
"extractor_location": "lm_eval_pairs",
|
|
1874
|
-
"extractor_file": "nq_open",
|
|
1875
|
-
"benchmark_type": "other",
|
|
1876
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
1877
|
-
},
|
|
1878
|
-
"okapi": {
|
|
1879
|
-
"evaluator": "log_likelihoods",
|
|
1880
|
-
"extractor_location": "lm_eval_pairs",
|
|
1881
|
-
"extractor_file": "okapi",
|
|
1882
|
-
"benchmark_type": "other",
|
|
1883
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1884
|
-
},
|
|
1885
|
-
"okapi_arc_multilingual": {
|
|
1886
|
-
"evaluator": "log_likelihoods",
|
|
1887
|
-
"extractor_location": "lm_eval_pairs",
|
|
1888
|
-
"extractor_file": "okapi_arc_multilingual",
|
|
1889
|
-
"benchmark_type": "knowledge",
|
|
1890
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1891
|
-
},
|
|
1892
|
-
"okapi_hellaswag_multilingual": {
|
|
1893
|
-
"evaluator": "log_likelihoods",
|
|
1894
|
-
"extractor_location": "lm_eval_pairs",
|
|
1895
|
-
"extractor_file": "okapi_hellaswag_multilingual",
|
|
1896
|
-
"benchmark_type": "knowledge",
|
|
1897
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1898
|
-
},
|
|
1899
|
-
"okapi_mmlu_multilingual": {
|
|
1900
|
-
"evaluator": "log_likelihoods",
|
|
1901
|
-
"extractor_location": "lm_eval_pairs",
|
|
1902
|
-
"extractor_file": "okapi_mmlu_multilingual",
|
|
1903
|
-
"benchmark_type": "knowledge",
|
|
1904
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1905
|
-
},
|
|
1906
|
-
"okapi_truthfulqa_multilingual": {
|
|
1907
|
-
"evaluator": "log_likelihoods",
|
|
1908
|
-
"extractor_location": "lm_eval_pairs",
|
|
1909
|
-
"extractor_file": "okapi_truthfulqa_multilingual",
|
|
1910
|
-
"benchmark_type": "question_answering",
|
|
1911
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1912
|
-
},
|
|
1913
|
-
"olaph": {
|
|
1914
|
-
"evaluator": "generation",
|
|
1915
|
-
"extractor_location": "lm_eval_pairs",
|
|
1916
|
-
"extractor_file": "olaph",
|
|
1917
|
-
"benchmark_type": "other",
|
|
1918
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
1919
|
-
},
|
|
1920
|
-
"openbookqa": {
|
|
1921
|
-
"evaluator": "log_likelihoods",
|
|
1922
|
-
"extractor_location": "lm_eval_pairs",
|
|
1923
|
-
"extractor_file": "openbookqa",
|
|
1924
|
-
"benchmark_type": "question_answering",
|
|
1925
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1926
|
-
},
|
|
1927
|
-
"openllm": {
|
|
1928
|
-
"evaluator": "log_likelihoods",
|
|
1929
|
-
"extractor_location": "huggingface_pairs",
|
|
1930
|
-
"extractor_file": "openllm",
|
|
1931
|
-
"benchmark_type": "other",
|
|
1932
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1933
|
-
},
|
|
1934
|
-
"option": {
|
|
1935
|
-
"evaluator": null,
|
|
1936
|
-
"extractor_location": "lm_eval_pairs",
|
|
1937
|
-
"extractor_file": "option",
|
|
1938
|
-
"benchmark_type": "other",
|
|
1939
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1940
|
-
},
|
|
1941
|
-
"paloma": {
|
|
1942
|
-
"evaluator": "perplexity",
|
|
1943
|
-
"extractor_location": "lm_eval_pairs",
|
|
1944
|
-
"extractor_file": "paloma",
|
|
1945
|
-
"benchmark_type": "other",
|
|
1946
|
-
"explanation": "Perplexity measurement - evaluates model's prediction confidence"
|
|
1947
|
-
},
|
|
1948
|
-
"parafraseja": {
|
|
1949
|
-
"evaluator": "log_likelihoods",
|
|
1950
|
-
"extractor_location": "lm_eval_pairs",
|
|
1951
|
-
"extractor_file": "parafraseja",
|
|
1952
|
-
"benchmark_type": "other",
|
|
1953
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1954
|
-
},
|
|
1955
|
-
"parafrases": {
|
|
1956
|
-
"evaluator": "log_likelihoods",
|
|
1957
|
-
"extractor_location": "lm_eval_pairs",
|
|
1958
|
-
"extractor_file": "parafrases",
|
|
1959
|
-
"benchmark_type": "other",
|
|
1960
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1961
|
-
},
|
|
1962
|
-
"paws": {
|
|
1963
|
-
"evaluator": null,
|
|
1964
|
-
"extractor_location": "lm_eval_pairs",
|
|
1965
|
-
"extractor_file": "paws",
|
|
1966
|
-
"benchmark_type": "other",
|
|
1967
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1968
|
-
},
|
|
1969
|
-
"paws_x": {
|
|
1970
|
-
"evaluator": "log_likelihoods",
|
|
1971
|
-
"extractor_location": "lm_eval_pairs",
|
|
1972
|
-
"extractor_file": "paws_x",
|
|
1973
|
-
"benchmark_type": "other",
|
|
1974
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1975
|
-
},
|
|
1976
|
-
"pawsx": {
|
|
1977
|
-
"evaluator": "log_likelihoods",
|
|
1978
|
-
"extractor_location": "lm_eval_pairs",
|
|
1979
|
-
"extractor_file": "pawsx",
|
|
1980
|
-
"benchmark_type": "other",
|
|
1981
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1982
|
-
},
|
|
1983
|
-
"persona": {
|
|
1984
|
-
"evaluator": "log_likelihoods",
|
|
1985
|
-
"extractor_location": "lm_eval_pairs",
|
|
1986
|
-
"extractor_file": "persona",
|
|
1987
|
-
"benchmark_type": "other",
|
|
1988
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
1989
|
-
},
|
|
1990
|
-
"phrases": {
|
|
1991
|
-
"evaluator": null,
|
|
1992
|
-
"extractor_location": "lm_eval_pairs",
|
|
1993
|
-
"extractor_file": "phrases",
|
|
1994
|
-
"benchmark_type": "other",
|
|
1995
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
1996
|
-
},
|
|
1997
|
-
"pile": {
|
|
1998
|
-
"evaluator": "exact_match",
|
|
1999
|
-
"extractor_location": "lm_eval_pairs",
|
|
2000
|
-
"extractor_file": "pile",
|
|
2001
|
-
"benchmark_type": "other",
|
|
2002
|
-
"explanation": "Text comparison - checks if generated answer exactly matches expected answer"
|
|
2003
|
-
},
|
|
2004
|
-
"pile_10k": {
|
|
2005
|
-
"evaluator": "generation",
|
|
2006
|
-
"extractor_location": "lm_eval_pairs",
|
|
2007
|
-
"extractor_file": "pile_10k",
|
|
2008
|
-
"benchmark_type": "other",
|
|
2009
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2010
|
-
},
|
|
2011
|
-
"piqa": {
|
|
2012
|
-
"evaluator": "log_likelihoods",
|
|
2013
|
-
"extractor_location": "lm_eval_pairs",
|
|
2014
|
-
"extractor_file": "piqa",
|
|
2015
|
-
"benchmark_type": "question_answering",
|
|
2016
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2017
|
-
},
|
|
2018
|
-
"polemo2": {
|
|
2019
|
-
"evaluator": "generation",
|
|
2020
|
-
"extractor_location": "lm_eval_pairs",
|
|
2021
|
-
"extractor_file": "polemo2",
|
|
2022
|
-
"benchmark_type": "other",
|
|
2023
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2024
|
-
},
|
|
2025
|
-
"polymath": {
|
|
2026
|
-
"evaluator": null,
|
|
2027
|
-
"extractor_location": "huggingface_pairs",
|
|
2028
|
-
"extractor_file": "polymath",
|
|
2029
|
-
"benchmark_type": "mathematics",
|
|
2030
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2031
|
-
},
|
|
2032
|
-
"polymath_en_high": {
|
|
2033
|
-
"evaluator": null,
|
|
2034
|
-
"extractor_location": "huggingface_pairs",
|
|
2035
|
-
"extractor_file": "polymath_configs",
|
|
2036
|
-
"benchmark_type": "mathematics",
|
|
2037
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2038
|
-
},
|
|
2039
|
-
"polymath_en_medium": {
|
|
2040
|
-
"evaluator": null,
|
|
2041
|
-
"extractor_location": "huggingface_pairs",
|
|
2042
|
-
"extractor_file": "polymath_configs",
|
|
2043
|
-
"benchmark_type": "mathematics",
|
|
2044
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2045
|
-
},
|
|
2046
|
-
"polymath_zh_high": {
|
|
2047
|
-
"evaluator": null,
|
|
2048
|
-
"extractor_location": "huggingface_pairs",
|
|
2049
|
-
"extractor_file": "polymath_configs",
|
|
2050
|
-
"benchmark_type": "mathematics",
|
|
2051
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2052
|
-
},
|
|
2053
|
-
"polymath_zh_medium": {
|
|
2054
|
-
"evaluator": null,
|
|
2055
|
-
"extractor_location": "huggingface_pairs",
|
|
2056
|
-
"extractor_file": "polymath_configs",
|
|
2057
|
-
"benchmark_type": "mathematics",
|
|
2058
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2059
|
-
},
|
|
2060
|
-
"portuguese_bench": {
|
|
2061
|
-
"evaluator": "log_likelihoods",
|
|
2062
|
-
"extractor_location": "lm_eval_pairs",
|
|
2063
|
-
"extractor_file": "portuguese_bench",
|
|
2064
|
-
"benchmark_type": "other",
|
|
2065
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2066
|
-
},
|
|
2067
|
-
"prompt": {
|
|
2068
|
-
"evaluator": null,
|
|
2069
|
-
"extractor_location": "lm_eval_pairs",
|
|
2070
|
-
"extractor_file": "prompt",
|
|
2071
|
-
"benchmark_type": "other",
|
|
2072
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2073
|
-
},
|
|
2074
|
-
"prost": {
|
|
2075
|
-
"evaluator": null,
|
|
2076
|
-
"extractor_location": "lm_eval_pairs",
|
|
2077
|
-
"extractor_file": "prost",
|
|
2078
|
-
"benchmark_type": "other",
|
|
2079
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2080
|
-
},
|
|
2081
|
-
"pubmedqa": {
|
|
2082
|
-
"evaluator": null,
|
|
2083
|
-
"extractor_location": "lm_eval_pairs",
|
|
2084
|
-
"extractor_file": "pubmedqa",
|
|
2085
|
-
"benchmark_type": "question_answering",
|
|
2086
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2087
|
-
},
|
|
2088
|
-
"pythia": {
|
|
2089
|
-
"evaluator": "log_likelihoods",
|
|
2090
|
-
"extractor_location": "huggingface_pairs",
|
|
2091
|
-
"extractor_file": "pythia",
|
|
2092
|
-
"benchmark_type": "other",
|
|
2093
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2094
|
-
},
|
|
2095
|
-
"qa4mre": {
|
|
2096
|
-
"evaluator": "log_likelihoods",
|
|
2097
|
-
"extractor_location": "lm_eval_pairs",
|
|
2098
|
-
"extractor_file": "qa4mre",
|
|
2099
|
-
"benchmark_type": "question_answering",
|
|
2100
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2101
|
-
},
|
|
2102
|
-
"qasper": {
|
|
2103
|
-
"evaluator": "generation",
|
|
2104
|
-
"extractor_location": "lm_eval_pairs",
|
|
2105
|
-
"extractor_file": "qasper",
|
|
2106
|
-
"benchmark_type": "question_answering",
|
|
2107
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2108
|
-
},
|
|
2109
|
-
"qnli": {
|
|
2110
|
-
"evaluator": null,
|
|
2111
|
-
"extractor_location": "lm_eval_pairs",
|
|
2112
|
-
"extractor_file": "qnli",
|
|
2113
|
-
"benchmark_type": "other",
|
|
2114
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2115
|
-
},
|
|
2116
|
-
"qnlieu": {
|
|
2117
|
-
"evaluator": null,
|
|
2118
|
-
"extractor_location": "lm_eval_pairs",
|
|
2119
|
-
"extractor_file": "qnlieu",
|
|
2120
|
-
"benchmark_type": "other",
|
|
2121
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2122
|
-
},
|
|
2123
|
-
"qqp": {
|
|
2124
|
-
"evaluator": null,
|
|
2125
|
-
"extractor_location": "lm_eval_pairs",
|
|
2126
|
-
"extractor_file": "qqp",
|
|
2127
|
-
"benchmark_type": "other",
|
|
2128
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2129
|
-
},
|
|
2130
|
-
"quac": {
|
|
2131
|
-
"evaluator": null,
|
|
2132
|
-
"extractor_location": "lm_eval_pairs",
|
|
2133
|
-
"extractor_file": "quac",
|
|
2134
|
-
"benchmark_type": "other",
|
|
2135
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2136
|
-
},
|
|
2137
|
-
"race": {
|
|
2138
|
-
"evaluator": "log_likelihoods",
|
|
2139
|
-
"extractor_location": "lm_eval_pairs",
|
|
2140
|
-
"extractor_file": "race",
|
|
2141
|
-
"benchmark_type": "other",
|
|
2142
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2143
|
-
},
|
|
2144
|
-
"random": {
|
|
2145
|
-
"evaluator": null,
|
|
2146
|
-
"extractor_location": "lm_eval_pairs",
|
|
2147
|
-
"extractor_file": "random",
|
|
2148
|
-
"benchmark_type": "other",
|
|
2149
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2150
|
-
},
|
|
2151
|
-
"realtoxicityprompts": {
|
|
2152
|
-
"evaluator": "generation",
|
|
2153
|
-
"extractor_location": "lm_eval_pairs",
|
|
2154
|
-
"extractor_file": "realtoxicityprompts",
|
|
2155
|
-
"benchmark_type": "other",
|
|
2156
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2157
|
-
},
|
|
2158
|
-
"recode": {
|
|
2159
|
-
"evaluator": null,
|
|
2160
|
-
"extractor_location": "huggingface_pairs",
|
|
2161
|
-
"extractor_file": "recode",
|
|
2162
|
-
"benchmark_type": "coding",
|
|
2163
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2164
|
-
},
|
|
2165
|
-
"record": {
|
|
2166
|
-
"evaluator": null,
|
|
2167
|
-
"extractor_location": "huggingface_pairs",
|
|
2168
|
-
"extractor_file": "record",
|
|
2169
|
-
"benchmark_type": "other",
|
|
2170
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2171
|
-
},
|
|
2172
|
-
"reversed": {
|
|
2173
|
-
"evaluator": "exact_match",
|
|
2174
|
-
"extractor_location": "lm_eval_pairs",
|
|
2175
|
-
"extractor_file": "reversed",
|
|
2176
|
-
"benchmark_type": "other",
|
|
2177
|
-
"explanation": "Text comparison - checks if generated answer exactly matches expected answer"
|
|
2178
|
-
},
|
|
2179
|
-
"rte": {
|
|
2180
|
-
"evaluator": null,
|
|
2181
|
-
"extractor_location": "lm_eval_pairs",
|
|
2182
|
-
"extractor_file": "rte",
|
|
2183
|
-
"benchmark_type": "other",
|
|
2184
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2185
|
-
},
|
|
2186
|
-
"ruler": {
|
|
2187
|
-
"evaluator": null,
|
|
2188
|
-
"extractor_location": "lm_eval_pairs",
|
|
2189
|
-
"extractor_file": "ruler",
|
|
2190
|
-
"benchmark_type": "other",
|
|
2191
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2192
|
-
},
|
|
2193
|
-
"sciq": {
|
|
2194
|
-
"evaluator": "log_likelihoods",
|
|
2195
|
-
"extractor_location": "lm_eval_pairs",
|
|
2196
|
-
"extractor_file": "sciq",
|
|
2197
|
-
"benchmark_type": "other",
|
|
2198
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2199
|
-
},
|
|
2200
|
-
"score": {
|
|
2201
|
-
"evaluator": "log_likelihoods",
|
|
2202
|
-
"extractor_location": "lm_eval_pairs",
|
|
2203
|
-
"extractor_file": "score",
|
|
2204
|
-
"benchmark_type": "other",
|
|
2205
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2206
|
-
},
|
|
2207
|
-
"scrolls": {
|
|
2208
|
-
"evaluator": "generation",
|
|
2209
|
-
"extractor_location": "lm_eval_pairs",
|
|
2210
|
-
"extractor_file": "scrolls",
|
|
2211
|
-
"benchmark_type": "other",
|
|
2212
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2213
|
-
},
|
|
2214
|
-
"self": {
|
|
2215
|
-
"evaluator": "log_likelihoods",
|
|
2216
|
-
"extractor_location": "lm_eval_pairs",
|
|
2217
|
-
"extractor_file": "self",
|
|
2218
|
-
"benchmark_type": "other",
|
|
2219
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2220
|
-
},
|
|
2221
|
-
"sglue": {
|
|
2222
|
-
"evaluator": null,
|
|
2223
|
-
"extractor_location": "lm_eval_pairs",
|
|
2224
|
-
"extractor_file": "sglue",
|
|
2225
|
-
"benchmark_type": "other",
|
|
2226
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2227
|
-
},
|
|
2228
|
-
"simple_cooccurrence_bias": {
|
|
2229
|
-
"evaluator": null,
|
|
2230
|
-
"extractor_location": "lm_eval_pairs",
|
|
2231
|
-
"extractor_file": "simple_cooccurrence_bias",
|
|
2232
|
-
"benchmark_type": "other",
|
|
2233
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2234
|
-
},
|
|
2235
|
-
"siqa": {
|
|
2236
|
-
"evaluator": "log_likelihoods",
|
|
2237
|
-
"extractor_location": "lm_eval_pairs",
|
|
2238
|
-
"extractor_file": "siqa",
|
|
2239
|
-
"benchmark_type": "question_answering",
|
|
2240
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2241
|
-
},
|
|
2242
|
-
"social_iqa": {
|
|
2243
|
-
"evaluator": null,
|
|
2244
|
-
"extractor_location": "lm_eval_pairs",
|
|
2245
|
-
"extractor_file": "social_iqa",
|
|
2246
|
-
"benchmark_type": "question_answering",
|
|
2247
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2248
|
-
},
|
|
2249
|
-
"spanish_bench": {
|
|
2250
|
-
"evaluator": "log_likelihoods",
|
|
2251
|
-
"extractor_location": "lm_eval_pairs",
|
|
2252
|
-
"extractor_file": "spanish_bench",
|
|
2253
|
-
"benchmark_type": "other",
|
|
2254
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2255
|
-
},
|
|
2256
|
-
"squad2": {
|
|
2257
|
-
"evaluator": null,
|
|
2258
|
-
"extractor_location": "huggingface_pairs",
|
|
2259
|
-
"extractor_file": "squad2",
|
|
2260
|
-
"benchmark_type": "question_answering",
|
|
2261
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2262
|
-
},
|
|
2263
|
-
"squad_completion": {
|
|
2264
|
-
"evaluator": "exact_match",
|
|
2265
|
-
"extractor_location": "lm_eval_pairs",
|
|
2266
|
-
"extractor_file": "squad_completion",
|
|
2267
|
-
"benchmark_type": "question_answering",
|
|
2268
|
-
"explanation": "Text comparison - checks if generated answer exactly matches expected answer"
|
|
2269
|
-
},
|
|
2270
|
-
"sst2": {
|
|
2271
|
-
"evaluator": null,
|
|
2272
|
-
"extractor_location": "lm_eval_pairs",
|
|
2273
|
-
"extractor_file": "sst2",
|
|
2274
|
-
"benchmark_type": "other",
|
|
2275
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2276
|
-
},
|
|
2277
|
-
"storycloze": {
|
|
2278
|
-
"evaluator": "log_likelihoods",
|
|
2279
|
-
"extractor_location": "lm_eval_pairs",
|
|
2280
|
-
"extractor_file": "storycloze",
|
|
2281
|
-
"benchmark_type": "other",
|
|
2282
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2283
|
-
},
|
|
2284
|
-
"stsb": {
|
|
2285
|
-
"evaluator": null,
|
|
2286
|
-
"extractor_location": "huggingface_pairs",
|
|
2287
|
-
"extractor_file": "stsb",
|
|
2288
|
-
"benchmark_type": "other",
|
|
2289
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2290
|
-
},
|
|
2291
|
-
"summarization": {
|
|
2292
|
-
"evaluator": null,
|
|
2293
|
-
"extractor_location": "lm_eval_pairs",
|
|
2294
|
-
"extractor_file": "summarization",
|
|
2295
|
-
"benchmark_type": "other",
|
|
2296
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2297
|
-
},
|
|
2298
|
-
"super": {
|
|
2299
|
-
"evaluator": null,
|
|
2300
|
-
"extractor_location": "lm_eval_pairs",
|
|
2301
|
-
"extractor_file": "super",
|
|
2302
|
-
"benchmark_type": "other",
|
|
2303
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2304
|
-
},
|
|
2305
|
-
"super_glue": {
|
|
2306
|
-
"evaluator": "log_likelihoods",
|
|
2307
|
-
"extractor_location": "lm_eval_pairs",
|
|
2308
|
-
"extractor_file": "super_glue",
|
|
2309
|
-
"benchmark_type": "other",
|
|
2310
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2311
|
-
},
|
|
2312
|
-
"super_glue_lm_eval_v1": {
|
|
2313
|
-
"evaluator": "log_likelihoods",
|
|
2314
|
-
"extractor_location": "huggingface_pairs",
|
|
2315
|
-
"extractor_file": "super_glue_lm_eval_v1",
|
|
2316
|
-
"benchmark_type": "other",
|
|
2317
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2318
|
-
},
|
|
2319
|
-
"super_glue_lm_eval_v1_seq2seq": {
|
|
2320
|
-
"evaluator": "generation",
|
|
2321
|
-
"extractor_location": "huggingface_pairs",
|
|
2322
|
-
"extractor_file": "super_glue_lm_eval_v1_seq2seq",
|
|
2323
|
-
"benchmark_type": "other",
|
|
2324
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2325
|
-
},
|
|
2326
|
-
"super_glue_t5_prompt": {
|
|
2327
|
-
"evaluator": "generation",
|
|
2328
|
-
"extractor_location": "huggingface_pairs",
|
|
2329
|
-
"extractor_file": "super_glue_t5_prompt",
|
|
2330
|
-
"benchmark_type": "other",
|
|
2331
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2332
|
-
},
|
|
2333
|
-
"super_gpqa": {
|
|
2334
|
-
"evaluator": null,
|
|
2335
|
-
"extractor_location": "huggingface_pairs",
|
|
2336
|
-
"extractor_file": "super_gpqa",
|
|
2337
|
-
"benchmark_type": "question_answering",
|
|
2338
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2339
|
-
},
|
|
2340
|
-
"superglue": {
|
|
2341
|
-
"evaluator": null,
|
|
2342
|
-
"extractor_location": "lm_eval_pairs",
|
|
2343
|
-
"extractor_file": "superglue",
|
|
2344
|
-
"benchmark_type": "other",
|
|
2345
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2346
|
-
},
|
|
2347
|
-
"supergpqa": {
|
|
2348
|
-
"evaluator": null,
|
|
2349
|
-
"extractor_location": "lm_eval_pairs",
|
|
2350
|
-
"extractor_file": "supergpqa",
|
|
2351
|
-
"benchmark_type": "question_answering",
|
|
2352
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2353
|
-
},
|
|
2354
|
-
"supergpqa_biology": {
|
|
2355
|
-
"evaluator": null,
|
|
2356
|
-
"extractor_location": "huggingface_pairs",
|
|
2357
|
-
"extractor_file": "super_gpqa",
|
|
2358
|
-
"benchmark_type": "question_answering",
|
|
2359
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2360
|
-
},
|
|
2361
|
-
"supergpqa_chemistry": {
|
|
2362
|
-
"evaluator": null,
|
|
2363
|
-
"extractor_location": "huggingface_pairs",
|
|
2364
|
-
"extractor_file": "super_gpqa",
|
|
2365
|
-
"benchmark_type": "question_answering",
|
|
2366
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2367
|
-
},
|
|
2368
|
-
"supergpqa_physics": {
|
|
2369
|
-
"evaluator": null,
|
|
2370
|
-
"extractor_location": "huggingface_pairs",
|
|
2371
|
-
"extractor_file": "super_gpqa",
|
|
2372
|
-
"benchmark_type": "question_answering",
|
|
2373
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2374
|
-
},
|
|
2375
|
-
"swag": {
|
|
2376
|
-
"evaluator": "log_likelihoods",
|
|
2377
|
-
"extractor_location": "lm_eval_pairs",
|
|
2378
|
-
"extractor_file": "swag",
|
|
2379
|
-
"benchmark_type": "other",
|
|
2380
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2381
|
-
},
|
|
2382
|
-
"swde": {
|
|
2383
|
-
"evaluator": null,
|
|
2384
|
-
"extractor_location": "lm_eval_pairs",
|
|
2385
|
-
"extractor_file": "swde",
|
|
2386
|
-
"benchmark_type": "other",
|
|
2387
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2388
|
-
},
|
|
2389
|
-
"sycophancy": {
|
|
2390
|
-
"evaluator": "log_likelihoods",
|
|
2391
|
-
"extractor_location": "lm_eval_pairs",
|
|
2392
|
-
"extractor_file": "sycophancy",
|
|
2393
|
-
"benchmark_type": "other",
|
|
2394
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2395
|
-
},
|
|
2396
|
-
"t0": {
|
|
2397
|
-
"evaluator": "generation",
|
|
2398
|
-
"extractor_location": "lm_eval_pairs",
|
|
2399
|
-
"extractor_file": "t0",
|
|
2400
|
-
"benchmark_type": "other",
|
|
2401
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2402
|
-
},
|
|
2403
|
-
"tag": {
|
|
2404
|
-
"evaluator": null,
|
|
2405
|
-
"extractor_location": "huggingface_pairs",
|
|
2406
|
-
"extractor_file": "tag",
|
|
2407
|
-
"benchmark_type": "other",
|
|
2408
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2409
|
-
},
|
|
2410
|
-
"teca": {
|
|
2411
|
-
"evaluator": "log_likelihoods",
|
|
2412
|
-
"extractor_location": "lm_eval_pairs",
|
|
2413
|
-
"extractor_file": "teca",
|
|
2414
|
-
"benchmark_type": "other",
|
|
2415
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2416
|
-
},
|
|
2417
|
-
"tinyarc": {
|
|
2418
|
-
"evaluator": "log_likelihoods",
|
|
2419
|
-
"extractor_location": "lm_eval_pairs",
|
|
2420
|
-
"extractor_file": "tinyarc",
|
|
2421
|
-
"benchmark_type": "knowledge",
|
|
2422
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2423
|
-
},
|
|
2424
|
-
"tinybenchmarks": {
|
|
2425
|
-
"evaluator": "log_likelihoods",
|
|
2426
|
-
"extractor_location": "lm_eval_pairs",
|
|
2427
|
-
"extractor_file": "tinybenchmarks",
|
|
2428
|
-
"benchmark_type": "other",
|
|
2429
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2430
|
-
},
|
|
2431
|
-
"tinygsm8k": {
|
|
2432
|
-
"evaluator": "generation",
|
|
2433
|
-
"extractor_location": "lm_eval_pairs",
|
|
2434
|
-
"extractor_file": "tinygsm8k",
|
|
2435
|
-
"benchmark_type": "mathematics",
|
|
2436
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2437
|
-
},
|
|
2438
|
-
"tinyhellaswag": {
|
|
2439
|
-
"evaluator": "log_likelihoods",
|
|
2440
|
-
"extractor_location": "lm_eval_pairs",
|
|
2441
|
-
"extractor_file": "tinyhellaswag",
|
|
2442
|
-
"benchmark_type": "knowledge",
|
|
2443
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2444
|
-
},
|
|
2445
|
-
"tinymmlu": {
|
|
2446
|
-
"evaluator": "log_likelihoods",
|
|
2447
|
-
"extractor_location": "lm_eval_pairs",
|
|
2448
|
-
"extractor_file": "tinymmlu",
|
|
2449
|
-
"benchmark_type": "knowledge",
|
|
2450
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2451
|
-
},
|
|
2452
|
-
"tinytruthfulqa": {
|
|
2453
|
-
"evaluator": "log_likelihoods",
|
|
2454
|
-
"extractor_location": "lm_eval_pairs",
|
|
2455
|
-
"extractor_file": "tinytruthfulqa",
|
|
2456
|
-
"benchmark_type": "question_answering",
|
|
2457
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2458
|
-
},
|
|
2459
|
-
"tinywinogrande": {
|
|
2460
|
-
"evaluator": "log_likelihoods",
|
|
2461
|
-
"extractor_location": "lm_eval_pairs",
|
|
2462
|
-
"extractor_file": "tinywinogrande",
|
|
2463
|
-
"benchmark_type": "other",
|
|
2464
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2465
|
-
},
|
|
2466
|
-
"tmlu": {
|
|
2467
|
-
"evaluator": "log_likelihoods",
|
|
2468
|
-
"extractor_location": "huggingface_pairs",
|
|
2469
|
-
"extractor_file": "tmlu",
|
|
2470
|
-
"benchmark_type": "other",
|
|
2471
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2472
|
-
},
|
|
2473
|
-
"tmmluplus": {
|
|
2474
|
-
"evaluator": "log_likelihoods",
|
|
2475
|
-
"extractor_location": "lm_eval_pairs",
|
|
2476
|
-
"extractor_file": "tmmluplus",
|
|
2477
|
-
"benchmark_type": "knowledge",
|
|
2478
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2479
|
-
},
|
|
2480
|
-
"toxigen": {
|
|
2481
|
-
"evaluator": "log_likelihoods",
|
|
2482
|
-
"extractor_location": "lm_eval_pairs",
|
|
2483
|
-
"extractor_file": "toxigen",
|
|
2484
|
-
"benchmark_type": "other",
|
|
2485
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2486
|
-
},
|
|
2487
|
-
"translation": {
|
|
2488
|
-
"evaluator": "generation",
|
|
2489
|
-
"extractor_location": "lm_eval_pairs",
|
|
2490
|
-
"extractor_file": "translation",
|
|
2491
|
-
"benchmark_type": "translation",
|
|
2492
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2493
|
-
},
|
|
2494
|
-
"triviaqa": {
|
|
2495
|
-
"evaluator": "generation",
|
|
2496
|
-
"extractor_location": "lm_eval_pairs",
|
|
2497
|
-
"extractor_file": "triviaqa",
|
|
2498
|
-
"benchmark_type": "question_answering",
|
|
2499
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2500
|
-
},
|
|
2501
|
-
"truthfulqa": {
|
|
2502
|
-
"evaluator": "log_likelihoods",
|
|
2503
|
-
"extractor_location": "lm_eval_pairs",
|
|
2504
|
-
"extractor_file": "truthfulqa",
|
|
2505
|
-
"benchmark_type": "question_answering",
|
|
2506
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2507
|
-
},
|
|
2508
|
-
"truthfulqa_gen": {
|
|
2509
|
-
"evaluator": "generation",
|
|
2510
|
-
"extractor_location": "lm_eval_pairs",
|
|
2511
|
-
"extractor_file": "truthfulqa_gen",
|
|
2512
|
-
"benchmark_type": "question_answering",
|
|
2513
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2514
|
-
},
|
|
2515
|
-
"truthfulqa_mc1": {
|
|
2516
|
-
"evaluator": "log_likelihoods",
|
|
2517
|
-
"extractor_location": "lm_eval_pairs",
|
|
2518
|
-
"extractor_file": "truthfulqa_mc1",
|
|
2519
|
-
"benchmark_type": "question_answering",
|
|
2520
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2521
|
-
},
|
|
2522
|
-
"truthfulqa_mc2": {
|
|
2523
|
-
"evaluator": "log_likelihoods",
|
|
2524
|
-
"extractor_location": "lm_eval_pairs",
|
|
2525
|
-
"extractor_file": "truthfulqa_mc2",
|
|
2526
|
-
"benchmark_type": "question_answering",
|
|
2527
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2528
|
-
},
|
|
2529
|
-
"truthfulqa_multi": {
|
|
2530
|
-
"evaluator": "mixed",
|
|
2531
|
-
"extractor_location": "lm_eval_pairs",
|
|
2532
|
-
"extractor_file": "truthfulqa_multi",
|
|
2533
|
-
"benchmark_type": "question_answering",
|
|
2534
|
-
"explanation": "Uses mixed evaluator"
|
|
2535
|
-
},
|
|
2536
|
-
"turblimp_core": {
|
|
2537
|
-
"evaluator": null,
|
|
2538
|
-
"extractor_location": "lm_eval_pairs",
|
|
2539
|
-
"extractor_file": "turblimp_core",
|
|
2540
|
-
"benchmark_type": "other",
|
|
2541
|
-
"explanation": "NO EVALUATOR DEFINED - will fail with error"
|
|
2542
|
-
},
|
|
2543
|
-
"turkishmmlu": {
|
|
2544
|
-
"evaluator": "log_likelihoods",
|
|
2545
|
-
"extractor_location": "lm_eval_pairs",
|
|
2546
|
-
"extractor_file": "turkishmmlu",
|
|
2547
|
-
"benchmark_type": "knowledge",
|
|
2548
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2549
|
-
},
|
|
2550
|
-
"twenty_newsgroups": {
|
|
2551
|
-
"evaluator": "exact_match",
|
|
2552
|
-
"extractor_location": "lm_eval_pairs",
|
|
2553
|
-
"extractor_file": "twenty_newsgroups",
|
|
2554
|
-
"benchmark_type": "other",
|
|
2555
|
-
"explanation": "Text comparison - checks if generated answer exactly matches expected answer"
|
|
2556
|
-
},
|
|
2557
|
-
"unfair": {
|
|
2558
|
-
"evaluator": "generation",
|
|
2559
|
-
"extractor_location": "lm_eval_pairs",
|
|
2560
|
-
"extractor_file": "unfair",
|
|
2561
|
-
"benchmark_type": "other",
|
|
2562
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2563
|
-
},
|
|
2564
|
-
"unitxt": {
|
|
2565
|
-
"evaluator": "generation",
|
|
2566
|
-
"extractor_location": "lm_eval_pairs",
|
|
2567
|
-
"extractor_file": "unitxt",
|
|
2568
|
-
"benchmark_type": "other",
|
|
2569
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2570
|
-
},
|
|
2571
|
-
"unscramble": {
|
|
2572
|
-
"evaluator": "exact_match",
|
|
2573
|
-
"extractor_location": "lm_eval_pairs",
|
|
2574
|
-
"extractor_file": "unscramble",
|
|
2575
|
-
"benchmark_type": "other",
|
|
2576
|
-
"explanation": "Text comparison - checks if generated answer exactly matches expected answer"
|
|
2577
|
-
},
|
|
2578
|
-
"vaxx": {
|
|
2579
|
-
"evaluator": "log_likelihoods",
|
|
2580
|
-
"extractor_location": "lm_eval_pairs",
|
|
2581
|
-
"extractor_file": "vaxx",
|
|
2582
|
-
"benchmark_type": "other",
|
|
2583
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2584
|
-
},
|
|
2585
|
-
"webqs": {
|
|
2586
|
-
"evaluator": "log_likelihoods",
|
|
2587
|
-
"extractor_location": "lm_eval_pairs",
|
|
2588
|
-
"extractor_file": "webqs",
|
|
2589
|
-
"benchmark_type": "other",
|
|
2590
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2591
|
-
},
|
|
2592
|
-
"wic": {
|
|
2593
|
-
"evaluator": "log_likelihoods",
|
|
2594
|
-
"extractor_location": "lm_eval_pairs",
|
|
2595
|
-
"extractor_file": "wic",
|
|
2596
|
-
"benchmark_type": "other",
|
|
2597
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2598
|
-
},
|
|
2599
|
-
"wiceu": {
|
|
2600
|
-
"evaluator": "log_likelihoods",
|
|
2601
|
-
"extractor_location": "huggingface_pairs",
|
|
2602
|
-
"extractor_file": "wiceu",
|
|
2603
|
-
"benchmark_type": "other",
|
|
2604
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2605
|
-
},
|
|
2606
|
-
"wikitext": {
|
|
2607
|
-
"evaluator": "generation",
|
|
2608
|
-
"extractor_location": "lm_eval_pairs",
|
|
2609
|
-
"extractor_file": "wikitext",
|
|
2610
|
-
"benchmark_type": "other",
|
|
2611
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2612
|
-
},
|
|
2613
|
-
"winogender": {
|
|
2614
|
-
"evaluator": "generation",
|
|
2615
|
-
"extractor_location": "lm_eval_pairs",
|
|
2616
|
-
"extractor_file": "winogender",
|
|
2617
|
-
"benchmark_type": "other",
|
|
2618
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2619
|
-
},
|
|
2620
|
-
"winogrande": {
|
|
2621
|
-
"evaluator": "log_likelihoods",
|
|
2622
|
-
"extractor_location": "lm_eval_pairs",
|
|
2623
|
-
"extractor_file": "winogrande",
|
|
2624
|
-
"benchmark_type": "other",
|
|
2625
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2626
|
-
},
|
|
2627
|
-
"wmdp": {
|
|
2628
|
-
"evaluator": "log_likelihoods",
|
|
2629
|
-
"extractor_location": "lm_eval_pairs",
|
|
2630
|
-
"extractor_file": "wmdp",
|
|
2631
|
-
"benchmark_type": "other",
|
|
2632
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2633
|
-
},
|
|
2634
|
-
"wmt14": {
|
|
2635
|
-
"evaluator": "generation",
|
|
2636
|
-
"extractor_location": "lm_eval_pairs",
|
|
2637
|
-
"extractor_file": "wmt14",
|
|
2638
|
-
"benchmark_type": "translation",
|
|
2639
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2640
|
-
},
|
|
2641
|
-
"wmt14_en_fr": {
|
|
2642
|
-
"evaluator": "generation",
|
|
2643
|
-
"extractor_location": "huggingface_pairs",
|
|
2644
|
-
"extractor_file": "wmt14_en_fr",
|
|
2645
|
-
"benchmark_type": "translation",
|
|
2646
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2647
|
-
},
|
|
2648
|
-
"wmt14_fr_en": {
|
|
2649
|
-
"evaluator": "generation",
|
|
2650
|
-
"extractor_location": "huggingface_pairs",
|
|
2651
|
-
"extractor_file": "wmt14_fr_en",
|
|
2652
|
-
"benchmark_type": "translation",
|
|
2653
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2654
|
-
},
|
|
2655
|
-
"wmt16": {
|
|
2656
|
-
"evaluator": "generation",
|
|
2657
|
-
"extractor_location": "lm_eval_pairs",
|
|
2658
|
-
"extractor_file": "wmt16",
|
|
2659
|
-
"benchmark_type": "translation",
|
|
2660
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2661
|
-
},
|
|
2662
|
-
"wmt16_de_en": {
|
|
2663
|
-
"evaluator": "generation",
|
|
2664
|
-
"extractor_location": "huggingface_pairs",
|
|
2665
|
-
"extractor_file": "wmt16_de_en",
|
|
2666
|
-
"benchmark_type": "translation",
|
|
2667
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2668
|
-
},
|
|
2669
|
-
"wmt16_en_de": {
|
|
2670
|
-
"evaluator": "generation",
|
|
2671
|
-
"extractor_location": "huggingface_pairs",
|
|
2672
|
-
"extractor_file": "wmt16_en_de",
|
|
2673
|
-
"benchmark_type": "translation",
|
|
2674
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2675
|
-
},
|
|
2676
|
-
"wmt16_en_ro": {
|
|
2677
|
-
"evaluator": "generation",
|
|
2678
|
-
"extractor_location": "huggingface_pairs",
|
|
2679
|
-
"extractor_file": "wmt16_en_ro",
|
|
2680
|
-
"benchmark_type": "translation",
|
|
2681
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2682
|
-
},
|
|
2683
|
-
"wmt16_ro_en": {
|
|
2684
|
-
"evaluator": "generation",
|
|
2685
|
-
"extractor_location": "huggingface_pairs",
|
|
2686
|
-
"extractor_file": "wmt16_ro_en",
|
|
2687
|
-
"benchmark_type": "translation",
|
|
2688
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2689
|
-
},
|
|
2690
|
-
"wmt_ro_en_t5_prompt": {
|
|
2691
|
-
"evaluator": "generation",
|
|
2692
|
-
"extractor_location": "huggingface_pairs",
|
|
2693
|
-
"extractor_file": "wmt_ro_en_t5_prompt",
|
|
2694
|
-
"benchmark_type": "translation",
|
|
2695
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2696
|
-
},
|
|
2697
|
-
"wnli": {
|
|
2698
|
-
"evaluator": "log_likelihoods",
|
|
2699
|
-
"extractor_location": "lm_eval_pairs",
|
|
2700
|
-
"extractor_file": "wnli",
|
|
2701
|
-
"benchmark_type": "other",
|
|
2702
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2703
|
-
},
|
|
2704
|
-
"wsc": {
|
|
2705
|
-
"evaluator": "log_likelihoods",
|
|
2706
|
-
"extractor_location": "lm_eval_pairs",
|
|
2707
|
-
"extractor_file": "wsc",
|
|
2708
|
-
"benchmark_type": "other",
|
|
2709
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2710
|
-
},
|
|
2711
|
-
"wsc273": {
|
|
2712
|
-
"evaluator": "log_likelihoods",
|
|
2713
|
-
"extractor_location": "lm_eval_pairs",
|
|
2714
|
-
"extractor_file": "wsc273",
|
|
2715
|
-
"benchmark_type": "other",
|
|
2716
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2717
|
-
},
|
|
2718
|
-
"xcopa": {
|
|
2719
|
-
"evaluator": "log_likelihoods",
|
|
2720
|
-
"extractor_location": "lm_eval_pairs",
|
|
2721
|
-
"extractor_file": "xcopa",
|
|
2722
|
-
"benchmark_type": "other",
|
|
2723
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2724
|
-
},
|
|
2725
|
-
"xlsum": {
|
|
2726
|
-
"evaluator": "generation",
|
|
2727
|
-
"extractor_location": "lm_eval_pairs",
|
|
2728
|
-
"extractor_file": "xlsum",
|
|
2729
|
-
"benchmark_type": "other",
|
|
2730
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2731
|
-
},
|
|
2732
|
-
"xnli": {
|
|
2733
|
-
"evaluator": "log_likelihoods",
|
|
2734
|
-
"extractor_location": "lm_eval_pairs",
|
|
2735
|
-
"extractor_file": "xnli",
|
|
2736
|
-
"benchmark_type": "other",
|
|
2737
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2738
|
-
},
|
|
2739
|
-
"xquad": {
|
|
2740
|
-
"evaluator": "generation",
|
|
2741
|
-
"extractor_location": "lm_eval_pairs",
|
|
2742
|
-
"extractor_file": "xquad",
|
|
2743
|
-
"benchmark_type": "other",
|
|
2744
|
-
"explanation": "Text generation evaluation - assesses quality of generated text"
|
|
2745
|
-
},
|
|
2746
|
-
"xstorycloze": {
|
|
2747
|
-
"evaluator": "log_likelihoods",
|
|
2748
|
-
"extractor_location": "lm_eval_pairs",
|
|
2749
|
-
"extractor_file": "xstorycloze",
|
|
2750
|
-
"benchmark_type": "other",
|
|
2751
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2752
|
-
},
|
|
2753
|
-
"xsum": {
|
|
2754
|
-
"evaluator": "exact_match",
|
|
2755
|
-
"extractor_location": "huggingface_pairs",
|
|
2756
|
-
"extractor_file": "xsum",
|
|
2757
|
-
"benchmark_type": "other",
|
|
2758
|
-
"explanation": "Text comparison - checks if generated answer exactly matches expected answer"
|
|
2759
|
-
},
|
|
2760
|
-
"xwinograd": {
|
|
2761
|
-
"evaluator": "log_likelihoods",
|
|
2762
|
-
"extractor_location": "lm_eval_pairs",
|
|
2763
|
-
"extractor_file": "xwinograd",
|
|
2764
|
-
"benchmark_type": "other",
|
|
2765
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2766
|
-
},
|
|
2767
|
-
"yahoo": {
|
|
2768
|
-
"evaluator": "log_likelihoods",
|
|
2769
|
-
"extractor_location": "lm_eval_pairs",
|
|
2770
|
-
"extractor_file": "yahoo",
|
|
2771
|
-
"benchmark_type": "other",
|
|
2772
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2773
|
-
},
|
|
2774
|
-
"zhoblimp": {
|
|
2775
|
-
"evaluator": "log_likelihoods",
|
|
2776
|
-
"extractor_location": "lm_eval_pairs",
|
|
2777
|
-
"extractor_file": "zhoblimp",
|
|
2778
|
-
"benchmark_type": "other",
|
|
2779
|
-
"explanation": "Probability scoring - compares model's confidence in correct vs incorrect answers"
|
|
2780
|
-
}
|
|
2781
|
-
}
|
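The removed registry above maps each benchmark name to the evaluator that scores it ("log_likelihoods", "generation", "exact_match", or "mixed"), the extractor location and file that build its contrastive pairs, and a benchmark type; entries with "evaluator": null are explicitly flagged as failing at run time. A minimal sketch of how such a mapping could be consumed follows; the file name and helper functions are hypothetical illustrations, not part of the wisent API.

import json

# Hypothetical helpers: load a registry with the shape shown above and
# resolve the evaluator for a given task. Names here are illustrative only.

def load_registry(path: str) -> dict:
    with open(path) as f:
        return json.load(f)

def resolve_evaluator(registry: dict, task: str) -> str:
    entry = registry.get(task)
    if entry is None:
        raise KeyError(f"unknown task: {task}")
    evaluator = entry.get("evaluator")
    if evaluator is None:
        # Mirrors the "NO EVALUATOR DEFINED - will fail with error" entries.
        raise ValueError(f"no evaluator defined for task: {task}")
    return evaluator  # e.g. "log_likelihoods", "generation", "exact_match"

# Example usage (assuming the registry is saved as benchmark_registry.json):
#     registry = load_registry("benchmark_registry.json")
#     resolve_evaluator(registry, "truthfulqa")  # -> "log_likelihoods"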