wisent 0.7.379__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wisent/__init__.py +64 -0
- wisent/cli.py +114 -0
- wisent/core/__init__.py +40 -0
- wisent/core/activations/__init__.py +26 -0
- wisent/core/activations/activations.py +97 -0
- wisent/core/activations/activations_collector.py +506 -0
- wisent/core/activations/core/__init__.py +0 -0
- wisent/core/activations/core/atoms.py +219 -0
- wisent/core/activations/prompt_construction_strategy.py +47 -0
- wisent/core/adapters/__init__.py +22 -0
- wisent/core/adapters/audio.py +616 -0
- wisent/core/adapters/base.py +420 -0
- wisent/core/adapters/multimodal.py +738 -0
- wisent/core/adapters/robotics.py +643 -0
- wisent/core/adapters/text.py +441 -0
- wisent/core/adapters/video.py +555 -0
- wisent/core/agent/__init__.py +1 -0
- wisent/core/agent/budget.py +644 -0
- wisent/core/agent/device_benchmarks.py +691 -0
- wisent/core/agent/diagnose/__init__.py +1 -0
- wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
- wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
- wisent/core/agent/diagnose/create_classifier.py +1155 -0
- wisent/core/agent/diagnose/response_diagnostics.py +273 -0
- wisent/core/agent/diagnose/select_classifiers.py +507 -0
- wisent/core/agent/diagnose/synthetic_classifier_option.py +755 -0
- wisent/core/agent/diagnose/tasks/__init__.py +33 -0
- wisent/core/agent/diagnose/tasks/task_manager.py +1453 -0
- wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
- wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
- wisent/core/agent/diagnose.py +249 -0
- wisent/core/agent/steer.py +215 -0
- wisent/core/agent/timeout.py +134 -0
- wisent/core/autonomous_agent.py +1158 -0
- wisent/core/benchmark_extractors.py +372 -0
- wisent/core/benchmark_registry.py +151 -0
- wisent/core/bigcode_extractors.py +26 -0
- wisent/core/bigcode_integration.py +886 -0
- wisent/core/branding.py +108 -0
- wisent/core/classifier/__init__.py +1 -0
- wisent/core/classifier/models/__init__.py +1 -0
- wisent/core/classifiers/__init__.py +1 -0
- wisent/core/classifiers/classifiers/__init__.py +0 -0
- wisent/core/classifiers/classifiers/core/__init__.py +0 -0
- wisent/core/classifiers/classifiers/core/atoms.py +748 -0
- wisent/core/classifiers/classifiers/models/__init__.py +0 -0
- wisent/core/classifiers/classifiers/models/logistic.py +29 -0
- wisent/core/classifiers/classifiers/models/mlp.py +47 -0
- wisent/core/classifiers/classifiers/rotator.py +137 -0
- wisent/core/classifiers/core/__init__.py +1 -0
- wisent/core/classifiers/models/__init__.py +1 -0
- wisent/core/classifiers/pipeline_steps/__init__.py +1 -0
- wisent/core/cli/__init__.py +26 -0
- wisent/core/cli/agent/__init__.py +15 -0
- wisent/core/cli/agent/apply_steering.py +192 -0
- wisent/core/cli/agent/evaluate_response.py +128 -0
- wisent/core/cli/agent/generate_synthetic_pairs.py +123 -0
- wisent/core/cli/agent/main.py +139 -0
- wisent/core/cli/agent/train_classifier.py +173 -0
- wisent/core/cli/check_linearity.py +126 -0
- wisent/core/cli/create_steering_vector.py +304 -0
- wisent/core/cli/diagnose_pairs.py +153 -0
- wisent/core/cli/diagnose_vectors.py +404 -0
- wisent/core/cli/estimate_unified_goodness_time.py +428 -0
- wisent/core/cli/evaluate_refusal.py +241 -0
- wisent/core/cli/evaluate_responses.py +926 -0
- wisent/core/cli/generate_humanization_pairs.py +128 -0
- wisent/core/cli/generate_pairs.py +175 -0
- wisent/core/cli/generate_pairs_from_task.py +108 -0
- wisent/core/cli/generate_responses.py +160 -0
- wisent/core/cli/generate_vector_from_synthetic.py +217 -0
- wisent/core/cli/generate_vector_from_task.py +248 -0
- wisent/core/cli/get_activations.py +192 -0
- wisent/core/cli/inference_config.py +84 -0
- wisent/core/cli/inference_config_cli.py +54 -0
- wisent/core/cli/modify_weights.py +660 -0
- wisent/core/cli/multi_steer.py +112 -0
- wisent/core/cli/optimization_cache.py +298 -0
- wisent/core/cli/optimize.py +621 -0
- wisent/core/cli/optimize_classification.py +473 -0
- wisent/core/cli/optimize_sample_size.py +390 -0
- wisent/core/cli/optimize_steering.py +3421 -0
- wisent/core/cli/optimize_weights.py +1287 -0
- wisent/core/cli/steering_method_trainer.py +641 -0
- wisent/core/cli/steering_search_space.py +508 -0
- wisent/core/cli/tasks.py +940 -0
- wisent/core/cli/train_unified_goodness.py +681 -0
- wisent/core/cli_logger.py +22 -0
- wisent/core/config_manager.py +1731 -0
- wisent/core/contrastive_pairs/__init__.py +15 -0
- wisent/core/contrastive_pairs/core/__init__.py +0 -0
- wisent/core/contrastive_pairs/core/atoms.py +45 -0
- wisent/core/contrastive_pairs/core/buliders.py +59 -0
- wisent/core/contrastive_pairs/core/pair.py +183 -0
- wisent/core/contrastive_pairs/core/response.py +153 -0
- wisent/core/contrastive_pairs/core/serialization.py +306 -0
- wisent/core/contrastive_pairs/core/set.py +192 -0
- wisent/core/contrastive_pairs/diagnostics/__init__.py +79 -0
- wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
- wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1655 -0
- wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
- wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
- wisent/core/contrastive_pairs/diagnostics/duplicates.py +118 -0
- wisent/core/contrastive_pairs/diagnostics/linearity.py +325 -0
- wisent/core/contrastive_pairs/diagnostics/vector_quality.py +620 -0
- wisent/core/contrastive_pairs/huggingface_pairs/__init__.py +1 -0
- wisent/core/contrastive_pairs/huggingface_pairs/atoms.py +255 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +470 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_registry.py +136 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +44 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentbench.py +225 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentharm.py +267 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +444 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +225 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime.py +118 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime2024.py +74 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime2025.py +73 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/alpaca_eval.py +153 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +182 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/arena_hard.py +179 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/atis.py +89 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/babilong.py +96 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bangla_mmlu.py +108 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/basqueglue.py +217 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bec2016eu.py +99 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bfcl.py +283 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bhtc_v2.py +87 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +245 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/chain_of_thought.py +89 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/chinese_simpleqa.py +209 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/cluewsc.py +177 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/cnn_dailymail.py +92 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +378 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +109 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +15 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +64 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +65 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +65 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +65 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +65 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +65 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +844 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coedit_gec.py +79 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/conala.py +133 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/concode.py +111 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/dbpedia_14.py +91 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/doc_vqa.py +102 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/donotanswer.py +236 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ds1000.py +129 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ds_1000.py +155 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/epec_koref_bin.py +85 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ethos_binary.py +82 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/evalita_mp.py +165 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/evalita_sp_sum_task_fp_small_p1.py +89 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/facts_grounding.py +181 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +295 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/financial_tweets.py +100 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +270 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flan_held_in.py +98 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +572 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +143 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +99 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/get_negative_example_livecodebench.py +146 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/get_positive_example_livecodebench.py +140 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/gpt3_translation_benchmarks.py +98 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +389 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/halueval.py +246 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/harmbench.py +250 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/healthbench.py +181 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hle.py +106 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hmmt.py +117 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +119 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humanevalpack.py +102 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +180 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +129 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/iwslt2017_ar_en.py +98 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/iwslt2017_en_ar.py +98 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/jailbreakbench.py +258 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/law_stack_exchange.py +101 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ledgar.py +118 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench.py +61 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench_contrastive_pair_generator.py +491 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench_v6.py +263 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +230 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/llama.py +96 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +285 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/m_mmlu.py +96 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math.py +186 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +146 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +142 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/meddialog.py +79 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medical_abstracts.py +101 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +787 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +111 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mmlu_redux.py +194 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mmlusr.py +108 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multimedqa.py +99 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multipl_e.py +109 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple.py +96 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_choice.py +87 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_cpp.py +128 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_go.py +128 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_java.py +128 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_js.py +128 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_py.py +15 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_rs.py +128 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/non_greedy_robustness_agieval_aqua_rat.py +92 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +287 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/openllm.py +99 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/option_order_robustness_agieval_aqua_rat.py +92 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/or_bench.py +300 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/penn_treebank.py +80 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +317 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +467 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/prompt_robustness_agieval_aqua_rat.py +92 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/pythia.py +99 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +131 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +280 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/scicode.py +275 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/self_consistency.py +90 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +145 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/sorry_bench.py +211 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/stsb.py +79 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_lm_eval_v1.py +99 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_lm_eval_v1_seq2seq.py +98 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_t5_prompt.py +123 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_gpqa.py +106 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/swe_bench.py +428 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/swe_bench_verified.py +158 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/sycophancy_eval.py +205 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/t0_eval.py +79 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tag.py +98 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +305 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tmlu.py +109 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +360 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +386 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/travelplanner.py +286 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/truthfulqa_generation.py +128 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/unfair_tos.py +83 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/vaxx_stance.py +86 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wiceu.py +85 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wikitext103.py +97 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wildguard.py +280 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt14_en_fr.py +97 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt14_fr_en.py +97 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_de_en.py +90 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_en_de.py +90 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_en_ro.py +90 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_ro_en.py +90 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt_ro_en_t5_prompt.py +90 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/xsum.py +81 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +265 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/__init__.py +472 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aclue.py +24 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/acp.py +33 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/acpbench.py +39 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/advanced_ai_risk.py +59 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aexams.py +14 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrimgsm.py +10 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrimmlu.py +10 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrixnli.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench.py +14 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_adr.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_afriqa.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_afrisenti.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_belebele.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_flores.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_injongointent.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_mafand.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhaner.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhanews.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhapos.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_naijarc.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_nollysenti.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_ntrex.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_openai_mmlu.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_salt.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_sib.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_uhura_arc_easy.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_xlsum.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/agieval.py +33 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/anli.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arab_culture.py +24 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_acva.py +67 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_acva_light.py +67 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_complete.py +24 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_light.py +81 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabicmmlu.py +59 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aradice.py +36 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arc.py +61 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arithmetic.py +19 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/basque_bench.py +37 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bbh.py +121 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bbq.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/belebele.py +293 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bertaqa.py +25 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bigbench.py +300 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/blimp.py +76 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/careqa.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/catalan_bench.py +43 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ceval_valid.py +61 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/cmmlu.py +76 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +16 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/copal_id.py +11 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/crows_pairs.py +31 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/csatqa.py +15 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/darija.py +29 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/darijammlu.py +57 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/egymmlu.py +62 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/eus.py +76 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/evalita_mp.py +93 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/fld.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/flores.py +466 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/french_bench.py +23 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/galician_bench.py +41 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/glianorex.py +11 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/global_mmlu.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gpqa.py +27 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gsm8k.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gsm8k_platinum.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/haerae.py +14 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/headqa.py +11 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hellaswag.py +39 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hendrycks_ethics.py +14 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hendrycks_math.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hrm8k.py +20 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/inverse.py +22 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/japanese_leaderboard.py +20 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/jsonschema_bench.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kbl.py +85 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kmmlu.py +281 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kobest.py +14 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kormedmcqa.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/lambada.py +28 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/leaderboard.py +52 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/libra.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/lingoly.py +11 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/longbench.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/m.py +43 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mastermind.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mathqa.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/med.py +24 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/meddialog.py +12 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/medqa.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mela.py +18 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/metabench.py +36 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mgsm.py +44 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/minerva_math.py +16 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mlqa.py +58 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu.py +70 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_pro.py +23 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_pro_plus.py +23 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_prox.py +191 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlusr.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmmu.py +46 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/model_written_evals.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/multiblimp.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/non.py +23 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/noreval.py +143 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/noridiom.py +20 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/nortruthfulqa.py +32 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/nrk.py +20 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_arc_multilingual.py +10 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_hellaswag_multilingual.py +24 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_mmlu_multilingual.py +24 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_truthfulqa_multilingual.py +34 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/paloma.py +25 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/pawsx.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/persona.py +144 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/pile.py +31 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/polemo2.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/portuguese_bench.py +31 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/prompt.py +23 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/qa4mre.py +12 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/qasper.py +11 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ru.py +19 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ruler.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/score.py +20 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/scrolls.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/self_consistency.py +11 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/spanish_bench.py +38 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/storycloze.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/super_glue_t5_prompt.py +17 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tinyBenchmarks.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tmlu.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tmmluplus.py +80 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/translation.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/truthfulqa.py +76 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/truthfulqa_multi.py +24 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/turkishmmlu.py +30 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/unitxt.py +23 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/unscramble.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/winogender.py +16 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmdp.py +12 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmt14.py +16 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmt16.py +22 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wsc273.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xcopa.py +21 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xnli.py +28 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xnli_eu.py +12 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xquad.py +22 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xstorycloze.py +22 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xwinograd.py +15 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +478 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +140 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +125 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +171 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +207 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +185 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +130 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +184 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimgsm.py +98 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +129 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrobench_cot.py +88 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrobench_mc.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ag.py +134 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ai2_arc.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anagrams1.py +81 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anagrams2.py +81 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anli.py +140 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +180 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +98 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +104 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +168 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +168 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +167 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +268 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +133 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_gen.py +101 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_mc.py +106 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/argument.py +134 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +122 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/assin.py +103 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench_gen.py +168 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench_mc.py +139 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbh.py +133 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +169 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +181 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +165 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +143 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bigbench.py +170 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +171 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +117 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq_seq2seq.py +117 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +150 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabreu.py +127 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +169 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench_gen.py +119 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench_mc.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +171 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +139 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +117 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +223 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +163 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +238 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +151 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +166 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +144 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +148 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +161 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +149 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cola.py +83 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +127 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +124 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +169 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +162 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqcat.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/crows_pairs.py +158 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle_letters.py +81 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +221 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +174 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +157 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +129 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/egyhellaswag.py +125 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/egymmlu.py +180 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +142 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +194 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/escola.py +85 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +135 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethos.py +99 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +225 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +159 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +159 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +159 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +166 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_sp.py +109 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/fda.py +105 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/fld.py +143 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +202 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench_mc.py +98 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench_perplexity.py +86 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galcola.py +109 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench_gen.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench_mc.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +141 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +171 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glue.py +109 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpqa.py +161 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +184 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm.py +108 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +134 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +125 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +225 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +191 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +179 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hle.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +203 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval.py +124 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ifeval.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +192 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/iwslt2017.py +117 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_gen.py +224 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +120 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/jsonschema_bench.py +123 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kbl.py +140 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +168 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu_cot.py +88 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu_mc.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +165 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +160 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada.py +147 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +185 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +185 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual_stablelm.py +141 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +194 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/libra.py +165 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +203 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logieval.py +82 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +203 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mathqa.py +137 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +123 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +224 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +180 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mediqa_qa2019.py +123 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +169 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medtext.py +108 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +96 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meqsum.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +154 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mgsm.py +122 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mimic_repsum.py +140 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +172 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mlqa.py +143 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +144 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_cot.py +88 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_mc.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_pro.py +145 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +189 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmmu.py +150 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mnli.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/model_written_evals.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/moral_stories.py +151 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mts_dialog.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mts_dialog_perplexity.py +97 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +134 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multilingual.py +106 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +173 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +157 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen.py +277 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +165 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +228 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +223 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noticia.py +105 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +135 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi.py +27 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +167 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +174 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +162 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +209 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +186 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph_perplexity.py +97 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paloma.py +205 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +154 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +246 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +144 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases_ca_va.py +82 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +161 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile_10k.py +140 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +116 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polemo2.py +135 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench_gen.py +121 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench_mc.py +103 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +119 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/quac.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +124 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/realtoxicityprompts.py +124 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +125 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +170 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +177 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +161 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +157 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +131 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +119 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/simple_cooccurrence_bias.py +121 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +209 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench_gen.py +117 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench_mc.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad2.py +129 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad_completion.py +121 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +250 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +154 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/superglue.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/supergpqa.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +179 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +117 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +181 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/toxigen.py +91 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/translation.py +149 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +130 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +120 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +140 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_multi.py +142 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +161 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_cot.py +104 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +102 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/twenty_newsgroups.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unitxt.py +131 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +95 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +130 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +122 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wikitext.py +146 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogender.py +139 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmt14.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmt16.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +117 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +180 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +197 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +147 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +131 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +203 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +129 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +124 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/yahoo.py +108 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +56 -0
- wisent/core/data_loaders/__init__.py +235 -0
- wisent/core/data_loaders/core/__init__.py +0 -0
- wisent/core/data_loaders/core/atoms.py +99 -0
- wisent/core/data_loaders/loaders/__init__.py +0 -0
- wisent/core/data_loaders/loaders/custom.py +120 -0
- wisent/core/data_loaders/loaders/huggingface_loader.py +153 -0
- wisent/core/data_loaders/loaders/lm_loader.py +494 -0
- wisent/core/data_loaders/loaders/lm_loader_special_cases.py +496 -0
- wisent/core/data_loaders/loaders/task_interface_loader.py +300 -0
- wisent/core/data_loaders/rotator.py +118 -0
- wisent/core/detection_handling.py +259 -0
- wisent/core/diversity_processors.py +193 -0
- wisent/core/download_full_benchmarks.py +1512 -0
- wisent/core/errors/__init__.py +203 -0
- wisent/core/errors/error_codes.py +763 -0
- wisent/core/errors/error_handler.py +134 -0
- wisent/core/evaluators/__init__.py +0 -0
- wisent/core/evaluators/benchmark_specific/__init__.py +42 -0
- wisent/core/evaluators/benchmark_specific/aime_evaluator.py +90 -0
- wisent/core/evaluators/benchmark_specific/coding/__init__.py +0 -0
- wisent/core/evaluators/benchmark_specific/coding/metrics/__init__.py +0 -0
- wisent/core/evaluators/benchmark_specific/coding/metrics/core/__init__.py +0 -0
- wisent/core/evaluators/benchmark_specific/coding/metrics/core/atoms.py +36 -0
- wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +363 -0
- wisent/core/evaluators/benchmark_specific/coding/metrics/passk.py +67 -0
- wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/__init__.py +0 -0
- wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/core/__init__.py +0 -0
- wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/core/atoms.py +27 -0
- wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/cpp_sanitizer.py +62 -0
- wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/java_sanitizer.py +78 -0
- wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/python_sanitizer.py +94 -0
- wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/utils.py +126 -0
- wisent/core/evaluators/benchmark_specific/coding/providers/__init__.py +18 -0
- wisent/core/evaluators/benchmark_specific/coding/providers/core/__init__.py +0 -0
- wisent/core/evaluators/benchmark_specific/coding/providers/core/atoms.py +31 -0
- wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/__init__.py +3 -0
- wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py +305 -0
- wisent/core/evaluators/benchmark_specific/coding/safe_docker/Dockerfile +31 -0
- wisent/core/evaluators/benchmark_specific/coding/safe_docker/__init__.py +0 -0
- wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/__init__.py +0 -0
- wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/atoms.py +105 -0
- wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/runtime.py +143 -0
- wisent/core/evaluators/benchmark_specific/coding/safe_docker/entrypoint.py +121 -0
- wisent/core/evaluators/benchmark_specific/coding/safe_docker/recipes.py +60 -0
- wisent/core/evaluators/benchmark_specific/coding/solution_generator.py +258 -0
- wisent/core/evaluators/benchmark_specific/conala_evaluator.py +332 -0
- wisent/core/evaluators/benchmark_specific/exact_match_evaluator.py +81 -0
- wisent/core/evaluators/benchmark_specific/f1_evaluator.py +173 -0
- wisent/core/evaluators/benchmark_specific/generation_evaluator.py +488 -0
- wisent/core/evaluators/benchmark_specific/livemathbench_evaluator.py +393 -0
- wisent/core/evaluators/benchmark_specific/log_likelihoods_evaluator.py +202 -0
- wisent/core/evaluators/benchmark_specific/math_evaluator.py +119 -0
- wisent/core/evaluators/benchmark_specific/math_parsing/__init__.py +1 -0
- wisent/core/evaluators/benchmark_specific/math_parsing/core.py +1640 -0
- wisent/core/evaluators/benchmark_specific/math_parsing/extract_boxed.py +48 -0
- wisent/core/evaluators/benchmark_specific/math_parsing/is_equiv.py +159 -0
- wisent/core/evaluators/benchmark_specific/math_parsing/scripts.py +919 -0
- wisent/core/evaluators/benchmark_specific/perplexity_evaluator.py +175 -0
- wisent/core/evaluators/benchmark_specific/polymath_evaluator.py +114 -0
- wisent/core/evaluators/core/__init__.py +5 -0
- wisent/core/evaluators/core/atoms.py +166 -0
- wisent/core/evaluators/custom/__init__.py +20 -0
- wisent/core/evaluators/custom/custom_evaluator.py +382 -0
- wisent/core/evaluators/custom/examples/__init__.py +37 -0
- wisent/core/evaluators/custom/examples/desklib_detector.py +166 -0
- wisent/core/evaluators/custom/examples/gptzero.py +185 -0
- wisent/core/evaluators/custom/examples/humanization.py +79 -0
- wisent/core/evaluators/custom/examples/humanization_coherent.py +127 -0
- wisent/core/evaluators/custom/examples/roberta_detector.py +173 -0
- wisent/core/evaluators/oracles/__init__.py +0 -0
- wisent/core/evaluators/oracles/interactive.py +73 -0
- wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
- wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +168 -0
- wisent/core/evaluators/oracles/user_specified.py +67 -0
- wisent/core/evaluators/personalization/__init__.py +12 -0
- wisent/core/evaluators/personalization/alignment.py +166 -0
- wisent/core/evaluators/personalization/coherence.py +325 -0
- wisent/core/evaluators/personalization/difference.py +73 -0
- wisent/core/evaluators/rotator.py +217 -0
- wisent/core/evaluators/steering_evaluators.py +386 -0
- wisent/core/evaluators/synthetic_evaluator.py +377 -0
- wisent/core/hyperparameter_optimizer.py +547 -0
- wisent/core/layer.py +17 -0
- wisent/core/lm_eval_harness_ground_truth.py +1431 -0
- wisent/core/main.py +101 -0
- wisent/core/managed_cached_benchmarks.py +609 -0
- wisent/core/mixed_benchmark_sampler.py +366 -0
- wisent/core/modalities/__init__.py +545 -0
- wisent/core/model_persistence.py +302 -0
- wisent/core/models/__init__.py +23 -0
- wisent/core/models/core/__init__.py +0 -0
- wisent/core/models/core/atoms.py +465 -0
- wisent/core/models/inference_config.py +127 -0
- wisent/core/models/wisent_model.py +893 -0
- wisent/core/multi_steering.py +397 -0
- wisent/core/opti/__init__.py +0 -0
- wisent/core/opti/core/__init__.py +0 -0
- wisent/core/opti/core/atoms.py +177 -0
- wisent/core/opti/methods/__init__.py +10 -0
- wisent/core/opti/methods/opti_classificator.py +172 -0
- wisent/core/opti/methods/opti_steering.py +139 -0
- wisent/core/opti/methods/opti_weights.py +523 -0
- wisent/core/optuna/__init__.py +54 -0
- wisent/core/optuna/classifier/__init__.py +25 -0
- wisent/core/optuna/classifier/activation_generator.py +351 -0
- wisent/core/optuna/classifier/classifier_cache.py +509 -0
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +685 -0
- wisent/core/optuna/steering/__init__.py +20 -0
- wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +200 -0
- wisent/core/optuna/steering/data_utils.py +342 -0
- wisent/core/optuna/steering/metrics.py +412 -0
- wisent/core/optuna/steering/steering_optimization.py +1096 -0
- wisent/core/parser.py +1662 -0
- wisent/core/parser_arguments/__init__.py +10 -0
- wisent/core/parser_arguments/agent_parser.py +122 -0
- wisent/core/parser_arguments/check_linearity_parser.py +82 -0
- wisent/core/parser_arguments/configure_model_parser.py +7 -0
- wisent/core/parser_arguments/create_steering_vector_parser.py +67 -0
- wisent/core/parser_arguments/diagnose_pairs_parser.py +25 -0
- wisent/core/parser_arguments/diagnose_vectors_parser.py +72 -0
- wisent/core/parser_arguments/evaluate_parser.py +40 -0
- wisent/core/parser_arguments/evaluate_refusal_parser.py +32 -0
- wisent/core/parser_arguments/evaluate_responses_parser.py +12 -0
- wisent/core/parser_arguments/full_optimize_parser.py +194 -0
- wisent/core/parser_arguments/generate_pairs_from_task_parser.py +33 -0
- wisent/core/parser_arguments/generate_pairs_parser.py +43 -0
- wisent/core/parser_arguments/generate_responses_parser.py +16 -0
- wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +148 -0
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +149 -0
- wisent/core/parser_arguments/generate_vector_parser.py +89 -0
- wisent/core/parser_arguments/get_activations_parser.py +90 -0
- wisent/core/parser_arguments/inference_config_parser.py +65 -0
- wisent/core/parser_arguments/main_parser.py +220 -0
- wisent/core/parser_arguments/model_config_parser.py +59 -0
- wisent/core/parser_arguments/modify_weights_parser.py +309 -0
- wisent/core/parser_arguments/monitor_parser.py +17 -0
- wisent/core/parser_arguments/multi_steer_parser.py +48 -0
- wisent/core/parser_arguments/nonsense_parser.py +26 -0
- wisent/core/parser_arguments/optimization_cache_parser.py +64 -0
- wisent/core/parser_arguments/optimize_classification_parser.py +108 -0
- wisent/core/parser_arguments/optimize_parser.py +142 -0
- wisent/core/parser_arguments/optimize_sample_size_parser.py +58 -0
- wisent/core/parser_arguments/optimize_steering_parser.py +617 -0
- wisent/core/parser_arguments/optimize_weights_parser.py +403 -0
- wisent/core/parser_arguments/synthetic_parser.py +117 -0
- wisent/core/parser_arguments/tasks_parser.py +591 -0
- wisent/core/parser_arguments/train_unified_goodness_parser.py +172 -0
- wisent/core/parser_arguments/utils.py +107 -0
- wisent/core/prompts/__init__.py +0 -0
- wisent/core/prompts/core/__init__.py +0 -0
- wisent/core/prompts/core/atom.py +57 -0
- wisent/core/prompts/core/prompt_formater.py +148 -0
- wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
- wisent/core/prompts/prompt_stratiegies/direct_completion.py +26 -0
- wisent/core/prompts/prompt_stratiegies/instruction_following.py +26 -0
- wisent/core/prompts/prompt_stratiegies/multiple_choice.py +31 -0
- wisent/core/prompts/prompt_stratiegies/role_playing.py +33 -0
- wisent/core/representation.py +5 -0
- wisent/core/save_results.py +277 -0
- wisent/core/steering.py +660 -0
- wisent/core/steering_method.py +20 -0
- wisent/core/steering_methods/__init__.py +54 -0
- wisent/core/steering_methods/core/__init__.py +0 -0
- wisent/core/steering_methods/core/atoms.py +154 -0
- wisent/core/steering_methods/methods/__init__.py +0 -0
- wisent/core/steering_methods/methods/caa.py +45 -0
- wisent/core/steering_methods/methods/prism.py +588 -0
- wisent/core/steering_methods/methods/pulse.py +641 -0
- wisent/core/steering_methods/methods/titan.py +1005 -0
- wisent/core/steering_methods/preflight.py +322 -0
- wisent/core/steering_methods/registry.py +649 -0
- wisent/core/steering_methods/rotator.py +121 -0
- wisent/core/steering_optimizer.py +1503 -0
- wisent/core/synthetic/__init__.py +0 -0
- wisent/core/synthetic/cleaners/__init__.py +0 -0
- wisent/core/synthetic/cleaners/core/__init__.py +0 -0
- wisent/core/synthetic/cleaners/core/atoms.py +58 -0
- wisent/core/synthetic/cleaners/deduper_cleaner.py +53 -0
- wisent/core/synthetic/cleaners/methods/__init__.py +0 -0
- wisent/core/synthetic/cleaners/methods/base_dedupers.py +321 -0
- wisent/core/synthetic/cleaners/methods/base_refusalers.py +286 -0
- wisent/core/synthetic/cleaners/methods/core/__init__.py +0 -0
- wisent/core/synthetic/cleaners/methods/core/atoms.py +47 -0
- wisent/core/synthetic/cleaners/pairs_cleaner.py +90 -0
- wisent/core/synthetic/cleaners/refusaler_cleaner.py +133 -0
- wisent/core/synthetic/db_instructions/__init__.py +0 -0
- wisent/core/synthetic/db_instructions/core/__init__.py +0 -0
- wisent/core/synthetic/db_instructions/core/atoms.py +25 -0
- wisent/core/synthetic/db_instructions/mini_dp.py +115 -0
- wisent/core/synthetic/generators/__init__.py +0 -0
- wisent/core/synthetic/generators/core/__init__.py +0 -0
- wisent/core/synthetic/generators/core/atoms.py +73 -0
- wisent/core/synthetic/generators/diversities/__init__.py +0 -0
- wisent/core/synthetic/generators/diversities/core/__init__.py +0 -0
- wisent/core/synthetic/generators/diversities/core/core.py +68 -0
- wisent/core/synthetic/generators/diversities/methods/__init__.py +0 -0
- wisent/core/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
- wisent/core/synthetic/generators/nonsense_generator.py +150 -0
- wisent/core/synthetic/generators/pairs_generator.py +313 -0
- wisent/core/task_interface.py +143 -0
- wisent/core/task_selector.py +232 -0
- wisent/core/tasks/__init__.py +218 -0
- wisent/core/tasks/aime_task.py +142 -0
- wisent/core/tasks/file_task.py +212 -0
- wisent/core/tasks/hle_task.py +180 -0
- wisent/core/tasks/hmmt_task.py +120 -0
- wisent/core/tasks/livecodebench_task.py +94 -0
- wisent/core/tasks/livemathbench_task.py +159 -0
- wisent/core/tasks/lm_eval_task.py +611 -0
- wisent/core/tasks/math500_task.py +84 -0
- wisent/core/tasks/polymath_task.py +147 -0
- wisent/core/tasks/supergpqa_task.py +220 -0
- wisent/core/time_estimator.py +155 -0
- wisent/core/timing_calibration.py +176 -0
- wisent/core/tracking/__init__.py +54 -0
- wisent/core/tracking/latency.py +620 -0
- wisent/core/tracking/memory.py +360 -0
- wisent/core/trainers/__init__.py +0 -0
- wisent/core/trainers/core/__init__.py +11 -0
- wisent/core/trainers/core/atoms.py +45 -0
- wisent/core/trainers/steering_trainer.py +365 -0
- wisent/core/universal_subspace.py +918 -0
- wisent/core/user_model_config.py +158 -0
- wisent/core/utils/__init__.py +64 -0
- wisent/core/utils/base_rotator.py +292 -0
- wisent/core/utils/dataset_splits.py +197 -0
- wisent/core/utils/device.py +279 -0
- wisent/core/weight_modification/__init__.py +134 -0
- wisent/core/weight_modification/additive.py +340 -0
- wisent/core/weight_modification/directional.py +1357 -0
- wisent/core/weight_modification/export.py +359 -0
- wisent/core/weight_modification/multi_direction.py +410 -0
- wisent/core/weight_modification/utils.py +236 -0
- wisent/core/wisent.py +660 -0
- wisent/examples/contrastive_pairs/humanization_human_vs_ai.json +2112 -0
- wisent/examples/scripts/1/test_basqueglue_evaluation.json +51 -0
- wisent/examples/scripts/1/test_basqueglue_pairs.json +14 -0
- wisent/examples/scripts/1/test_bec2016eu_evaluation.json +51 -0
- wisent/examples/scripts/1/test_bec2016eu_pairs.json +14 -0
- wisent/examples/scripts/1/test_belebele_evaluation.json +51 -0
- wisent/examples/scripts/1/test_belebele_pairs.json +14 -0
- wisent/examples/scripts/1/test_benchmarks_evaluation.json +51 -0
- wisent/examples/scripts/1/test_benchmarks_pairs.json +14 -0
- wisent/examples/scripts/1/test_bertaqa_evaluation.json +51 -0
- wisent/examples/scripts/1/test_bertaqa_pairs.json +14 -0
- wisent/examples/scripts/1/test_bhtc_v2_evaluation.json +30 -0
- wisent/examples/scripts/1/test_bhtc_v2_pairs.json +8 -0
- wisent/examples/scripts/1/test_boolq-seq2seq_evaluation.json +30 -0
- wisent/examples/scripts/1/test_boolq-seq2seq_pairs.json +8 -0
- wisent/examples/scripts/1/test_cabreu_evaluation.json +30 -0
- wisent/examples/scripts/1/test_cabreu_pairs.json +8 -0
- wisent/examples/scripts/1/test_careqa_en_evaluation.json +30 -0
- wisent/examples/scripts/1/test_careqa_en_pairs.json +8 -0
- wisent/examples/scripts/1/test_careqa_evaluation.json +30 -0
- wisent/examples/scripts/1/test_careqa_pairs.json +8 -0
- wisent/examples/scripts/1/test_catalanqa_evaluation.json +30 -0
- wisent/examples/scripts/1/test_catalanqa_pairs.json +8 -0
- wisent/examples/scripts/1/test_catcola_evaluation.json +30 -0
- wisent/examples/scripts/1/test_catcola_pairs.json +8 -0
- wisent/examples/scripts/1/test_chartqa_evaluation.json +30 -0
- wisent/examples/scripts/1/test_chartqa_pairs.json +8 -0
- wisent/examples/scripts/1/test_claim_stance_topic_evaluation.json +30 -0
- wisent/examples/scripts/1/test_claim_stance_topic_pairs.json +8 -0
- wisent/examples/scripts/1/test_cnn_dailymail_evaluation.json +30 -0
- wisent/examples/scripts/1/test_cnn_dailymail_pairs.json +8 -0
- wisent/examples/scripts/1/test_cocoteros_es_evaluation.json +30 -0
- wisent/examples/scripts/1/test_cocoteros_es_pairs.json +8 -0
- wisent/examples/scripts/1/test_coedit_gec_evaluation.json +30 -0
- wisent/examples/scripts/1/test_coedit_gec_pairs.json +8 -0
- wisent/examples/scripts/1/test_cola_evaluation.json +30 -0
- wisent/examples/scripts/1/test_cola_pairs.json +8 -0
- wisent/examples/scripts/1/test_coqcat_evaluation.json +30 -0
- wisent/examples/scripts/1/test_coqcat_pairs.json +8 -0
- wisent/examples/scripts/1/test_dbpedia_14_evaluation.json +30 -0
- wisent/examples/scripts/1/test_dbpedia_14_pairs.json +8 -0
- wisent/examples/scripts/1/test_epec_koref_bin_evaluation.json +30 -0
- wisent/examples/scripts/1/test_epec_koref_bin_pairs.json +8 -0
- wisent/examples/scripts/1/test_ethos_binary_evaluation.json +30 -0
- wisent/examples/scripts/1/test_ethos_binary_pairs.json +8 -0
- wisent/examples/scripts/2/test_afrimgsm_direct_amh_evaluation.json +30 -0
- wisent/examples/scripts/2/test_afrimgsm_direct_amh_pairs.json +8 -0
- wisent/examples/scripts/2/test_afrimmlu_direct_amh_evaluation.json +30 -0
- wisent/examples/scripts/2/test_afrimmlu_direct_amh_pairs.json +8 -0
- wisent/examples/scripts/2/test_afrixnli_en_direct_amh_evaluation.json +30 -0
- wisent/examples/scripts/2/test_afrixnli_en_direct_amh_pairs.json +8 -0
- wisent/examples/scripts/2/test_arc_ar_evaluation.json +30 -0
- wisent/examples/scripts/2/test_arc_ar_pairs.json +8 -0
- wisent/examples/scripts/2/test_atis_evaluation.json +30 -0
- wisent/examples/scripts/2/test_atis_pairs.json +8 -0
- wisent/examples/scripts/2/test_babi_evaluation.json +30 -0
- wisent/examples/scripts/2/test_babi_pairs.json +8 -0
- wisent/examples/scripts/2/test_babilong_evaluation.json +30 -0
- wisent/examples/scripts/2/test_babilong_pairs.json +8 -0
- wisent/examples/scripts/2/test_bangla_mmlu_evaluation.json +30 -0
- wisent/examples/scripts/2/test_bangla_mmlu_pairs.json +8 -0
- wisent/examples/scripts/2/test_basque-glue_pairs.json +14 -0
- wisent/examples/scripts/benchmark_tags.json +2140 -0
- wisent/examples/scripts/lm_eval_readme.json +4 -0
- wisent/examples/scripts/results/benchmark_descriptions.json +1244 -0
- wisent/examples/scripts/results/benchmark_evaluation_methods.json +66 -0
- wisent/examples/scripts/results/benchmark_evaluator_mapping.json +2781 -0
- wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +30536 -0
- wisent/examples/scripts/results/benchmark_evaluators_clean.json +469 -0
- wisent/examples/scripts/results/benchmark_methods_summary.json +260 -0
- wisent/examples/scripts/results/benchmark_pair_creation_methods.json +66 -0
- wisent/examples/scripts/results/benchmark_pair_totals.json +269 -0
- wisent/examples/scripts/results/benchmark_tags.json +917 -0
- wisent/examples/scripts/results/benchmark_test_summary_nov4.json +71 -0
- wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +150 -0
- wisent/examples/scripts/results/failing_benchmarks.json +946 -0
- wisent/examples/scripts/results/failing_benchmarks_list.json +41 -0
- wisent/examples/scripts/results/failing_benchmarks_test_results.json +945 -0
- wisent/examples/scripts/results/missing_benchmark_tags.json +341 -0
- wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +30 -0
- wisent/examples/scripts/results/test_20_newsgroups_pairs.json +8 -0
- wisent/examples/scripts/results/test_AraDICE_evaluation.json +51 -0
- wisent/examples/scripts/results/test_AraDICE_pairs.json +14 -0
- wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +30 -0
- wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +8 -0
- wisent/examples/scripts/results/test_ArabCulture_evaluation.json +51 -0
- wisent/examples/scripts/results/test_ArabCulture_pairs.json +14 -0
- wisent/examples/scripts/results/test_Tag_evaluation.json +30 -0
- wisent/examples/scripts/results/test_Tag_pairs.json +8 -0
- wisent/examples/scripts/results/test_aclue_evaluation.json +51 -0
- wisent/examples/scripts/results/test_aclue_pairs.json +14 -0
- wisent/examples/scripts/results/test_acp_bench_evaluation.json +51 -0
- wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +51 -0
- wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +14 -0
- wisent/examples/scripts/results/test_acp_bench_pairs.json +14 -0
- wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +51 -0
- wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +14 -0
- wisent/examples/scripts/results/test_aexams_evaluation.json +51 -0
- wisent/examples/scripts/results/test_aexams_pairs.json +14 -0
- wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +30 -0
- wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +8 -0
- wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +30 -0
- wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +8 -0
- wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +30 -0
- wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +8 -0
- wisent/examples/scripts/results/test_ag_news_evaluation.json +30 -0
- wisent/examples/scripts/results/test_ag_news_pairs.json +8 -0
- wisent/examples/scripts/results/test_agieval_evaluation.json +51 -0
- wisent/examples/scripts/results/test_agieval_pairs.json +14 -0
- wisent/examples/scripts/results/test_aime2024_evaluation.json +30 -0
- wisent/examples/scripts/results/test_aime2024_pairs.json +8 -0
- wisent/examples/scripts/results/test_aime2025_evaluation.json +30 -0
- wisent/examples/scripts/results/test_aime2025_pairs.json +8 -0
- wisent/examples/scripts/results/test_aime_evaluation.json +30 -0
- wisent/examples/scripts/results/test_aime_pairs.json +8 -0
- wisent/examples/scripts/results/test_anagrams1_evaluation.json +30 -0
- wisent/examples/scripts/results/test_anagrams1_pairs.json +8 -0
- wisent/examples/scripts/results/test_anagrams2_evaluation.json +30 -0
- wisent/examples/scripts/results/test_anagrams2_pairs.json +8 -0
- wisent/examples/scripts/results/test_anli_evaluation.json +30 -0
- wisent/examples/scripts/results/test_anli_pairs.json +8 -0
- wisent/examples/scripts/results/test_apps_evaluation.json +30 -0
- wisent/examples/scripts/results/test_apps_pairs.json +8 -0
- wisent/examples/scripts/results/test_arabic_exams_evaluation.json +30 -0
- wisent/examples/scripts/results/test_arabic_exams_pairs.json +8 -0
- wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +51 -0
- wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +14 -0
- wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +51 -0
- wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +14 -0
- wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +51 -0
- wisent/examples/scripts/results/test_arabicmmlu_pairs.json +14 -0
- wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +51 -0
- wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +14 -0
- wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +51 -0
- wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +14 -0
- wisent/examples/scripts/results/test_arc_ar_evaluation.json +30 -0
- wisent/examples/scripts/results/test_arc_ar_pairs.json +8 -0
- wisent/examples/scripts/results/test_arc_challenge_evaluation.json +30 -0
- wisent/examples/scripts/results/test_arc_challenge_pairs.json +8 -0
- wisent/examples/scripts/results/test_arc_easy_evaluation.json +30 -0
- wisent/examples/scripts/results/test_arc_easy_pairs.json +8 -0
- wisent/examples/scripts/results/test_argument_topic_evaluation.json +30 -0
- wisent/examples/scripts/results/test_argument_topic_pairs.json +8 -0
- wisent/examples/scripts/results/test_arithmetic_evaluation.json +51 -0
- wisent/examples/scripts/results/test_arithmetic_pairs.json +14 -0
- wisent/examples/scripts/results/test_asdiv_evaluation.json +30 -0
- wisent/examples/scripts/results/test_asdiv_pairs.json +8 -0
- wisent/examples/scripts/results/test_assin_entailment_evaluation.json +30 -0
- wisent/examples/scripts/results/test_assin_entailment_pairs.json +8 -0
- wisent/examples/scripts/results/test_atis_evaluation.json +30 -0
- wisent/examples/scripts/results/test_atis_pairs.json +8 -0
- wisent/examples/scripts/results/test_babi_evaluation.json +30 -0
- wisent/examples/scripts/results/test_babi_pairs.json +8 -0
- wisent/examples/scripts/results/test_babilong_evaluation.json +30 -0
- wisent/examples/scripts/results/test_babilong_pairs.json +8 -0
- wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +30 -0
- wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +8 -0
- wisent/examples/scripts/results/test_banking77_evaluation.json +30 -0
- wisent/examples/scripts/results/test_banking77_pairs.json +8 -0
- wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +14 -0
- wisent/examples/scripts/results/test_basque-glue_evaluation.json +51 -0
- wisent/examples/scripts/results/test_basque-glue_pairs.json +14 -0
- wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +51 -0
- wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +14 -0
- wisent/examples/scripts/results/test_basque_bench_evaluation.json +51 -0
- wisent/examples/scripts/results/test_basque_bench_pairs.json +14 -0
- wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +51 -0
- wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +14 -0
- wisent/examples/scripts/results/test_basqueglue_evaluation.json +51 -0
- wisent/examples/scripts/results/test_basqueglue_pairs.json +14 -0
- wisent/examples/scripts/results/test_bbh_evaluation.json +51 -0
- wisent/examples/scripts/results/test_bbh_pairs.json +14 -0
- wisent/examples/scripts/results/test_bbq_evaluation.json +30 -0
- wisent/examples/scripts/results/test_bbq_pairs.json +8 -0
- wisent/examples/scripts/results/test_bec2016eu_evaluation.json +51 -0
- wisent/examples/scripts/results/test_bec2016eu_pairs.json +14 -0
- wisent/examples/scripts/results/test_belebele_evaluation.json +51 -0
- wisent/examples/scripts/results/test_belebele_pairs.json +14 -0
- wisent/examples/scripts/results/test_benchmarks_evaluation.json +51 -0
- wisent/examples/scripts/results/test_benchmarks_pairs.json +14 -0
- wisent/examples/scripts/results/test_bertaqa_evaluation.json +51 -0
- wisent/examples/scripts/results/test_bertaqa_pairs.json +14 -0
- wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +30 -0
- wisent/examples/scripts/results/test_bhtc_v2_pairs.json +8 -0
- wisent/examples/scripts/results/test_bigbench_evaluation.json +51 -0
- wisent/examples/scripts/results/test_bigbench_pairs.json +14 -0
- wisent/examples/scripts/results/test_blimp_evaluation.json +51 -0
- wisent/examples/scripts/results/test_blimp_pairs.json +14 -0
- wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +30 -0
- wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +8 -0
- wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +30 -0
- wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +8 -0
- wisent/examples/scripts/results/test_boolq_evaluation.json +30 -0
- wisent/examples/scripts/results/test_boolq_pairs.json +8 -0
- wisent/examples/scripts/results/test_c4_evaluation.json +30 -0
- wisent/examples/scripts/results/test_c4_pairs.json +8 -0
- wisent/examples/scripts/results/test_cabreu_evaluation.json +30 -0
- wisent/examples/scripts/results/test_cabreu_pairs.json +8 -0
- wisent/examples/scripts/results/test_careqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_careqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_catalan_bench_evaluation.json +51 -0
- wisent/examples/scripts/results/test_catalan_bench_pairs.json +14 -0
- wisent/examples/scripts/results/test_catalanqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_catalanqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_catcola_evaluation.json +30 -0
- wisent/examples/scripts/results/test_catcola_pairs.json +8 -0
- wisent/examples/scripts/results/test_cb_evaluation.json +30 -0
- wisent/examples/scripts/results/test_cb_pairs.json +8 -0
- wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +51 -0
- wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +14 -0
- wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +30 -0
- wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +8 -0
- wisent/examples/scripts/results/test_ceval_evaluation.json +51 -0
- wisent/examples/scripts/results/test_ceval_pairs.json +14 -0
- wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +51 -0
- wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +14 -0
- wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +51 -0
- wisent/examples/scripts/results/test_chain_of_thought_pairs.json +14 -0
- wisent/examples/scripts/results/test_chartqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_chartqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +30 -0
- wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +8 -0
- wisent/examples/scripts/results/test_cmmlu_evaluation.json +51 -0
- wisent/examples/scripts/results/test_cmmlu_pairs.json +14 -0
- wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +30 -0
- wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +8 -0
- wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +30 -0
- wisent/examples/scripts/results/test_cocoteros_es_pairs.json +8 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +30 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +8 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +30 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +8 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +30 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +8 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +30 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +8 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +30 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +8 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +30 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +8 -0
- wisent/examples/scripts/results/test_coedit_gec_evaluation.json +30 -0
- wisent/examples/scripts/results/test_coedit_gec_pairs.json +8 -0
- wisent/examples/scripts/results/test_cola_evaluation.json +30 -0
- wisent/examples/scripts/results/test_cola_pairs.json +8 -0
- wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_commonsense_qa_pairs.json +8 -0
- wisent/examples/scripts/results/test_conala_evaluation.json +30 -0
- wisent/examples/scripts/results/test_conala_pairs.json +8 -0
- wisent/examples/scripts/results/test_concode_evaluation.json +30 -0
- wisent/examples/scripts/results/test_concode_pairs.json +8 -0
- wisent/examples/scripts/results/test_copa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_copa_pairs.json +8 -0
- wisent/examples/scripts/results/test_copal_id_evaluation.json +30 -0
- wisent/examples/scripts/results/test_copal_id_pairs.json +8 -0
- wisent/examples/scripts/results/test_coqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_coqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_coqcat_evaluation.json +30 -0
- wisent/examples/scripts/results/test_coqcat_pairs.json +8 -0
- wisent/examples/scripts/results/test_crows_pairs_evaluation.json +51 -0
- wisent/examples/scripts/results/test_crows_pairs_pairs.json +14 -0
- wisent/examples/scripts/results/test_csatqa_evaluation.json +51 -0
- wisent/examples/scripts/results/test_csatqa_pairs.json +14 -0
- wisent/examples/scripts/results/test_cycle_letters_evaluation.json +30 -0
- wisent/examples/scripts/results/test_cycle_letters_pairs.json +8 -0
- wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +51 -0
- wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +14 -0
- wisent/examples/scripts/results/test_darija_bench_evaluation.json +51 -0
- wisent/examples/scripts/results/test_darija_bench_pairs.json +14 -0
- wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +30 -0
- wisent/examples/scripts/results/test_darijahellaswag_pairs.json +8 -0
- wisent/examples/scripts/results/test_darijammlu_evaluation.json +51 -0
- wisent/examples/scripts/results/test_darijammlu_pairs.json +14 -0
- wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +30 -0
- wisent/examples/scripts/results/test_dbpedia_14_pairs.json +8 -0
- wisent/examples/scripts/results/test_drop_evaluation.json +30 -0
- wisent/examples/scripts/results/test_drop_pairs.json +8 -0
- wisent/examples/scripts/results/test_ds1000_evaluation.json +30 -0
- wisent/examples/scripts/results/test_ds1000_pairs.json +8 -0
- wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +30 -0
- wisent/examples/scripts/results/test_egyhellaswag_pairs.json +8 -0
- wisent/examples/scripts/results/test_egymmlu_evaluation.json +51 -0
- wisent/examples/scripts/results/test_egymmlu_pairs.json +14 -0
- wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +30 -0
- wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +8 -0
- wisent/examples/scripts/results/test_eq_bench_evaluation.json +30 -0
- wisent/examples/scripts/results/test_eq_bench_pairs.json +8 -0
- wisent/examples/scripts/results/test_escola_evaluation.json +30 -0
- wisent/examples/scripts/results/test_escola_pairs.json +8 -0
- wisent/examples/scripts/results/test_ethics_cm_evaluation.json +30 -0
- wisent/examples/scripts/results/test_ethics_cm_pairs.json +8 -0
- wisent/examples/scripts/results/test_ethos_binary_evaluation.json +30 -0
- wisent/examples/scripts/results/test_ethos_binary_pairs.json +8 -0
- wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +51 -0
- wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +14 -0
- wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +51 -0
- wisent/examples/scripts/results/test_eus_exams_es_pairs.json +14 -0
- wisent/examples/scripts/results/test_eus_exams_evaluation.json +51 -0
- wisent/examples/scripts/results/test_eus_exams_pairs.json +14 -0
- wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +30 -0
- wisent/examples/scripts/results/test_eus_proficiency_pairs.json +8 -0
- wisent/examples/scripts/results/test_eus_reading_evaluation.json +30 -0
- wisent/examples/scripts/results/test_eus_reading_pairs.json +8 -0
- wisent/examples/scripts/results/test_eus_trivia_evaluation.json +30 -0
- wisent/examples/scripts/results/test_eus_trivia_pairs.json +8 -0
- wisent/examples/scripts/results/test_evalita-mp_evaluation.json +51 -0
- wisent/examples/scripts/results/test_evalita-mp_pairs.json +14 -0
- wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +30 -0
- wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +8 -0
- wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +51 -0
- wisent/examples/scripts/results/test_evalita_LLM_pairs.json +14 -0
- wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +51 -0
- wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +14 -0
- wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +30 -0
- wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +8 -0
- wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +51 -0
- wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +14 -0
- wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +30 -0
- wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +8 -0
- wisent/examples/scripts/results/test_fda_evaluation.json +30 -0
- wisent/examples/scripts/results/test_fda_pairs.json +8 -0
- wisent/examples/scripts/results/test_financial_tweets_evaluation.json +30 -0
- wisent/examples/scripts/results/test_financial_tweets_pairs.json +8 -0
- wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +30 -0
- wisent/examples/scripts/results/test_fld/test_fld_pairs.json +8 -0
- wisent/examples/scripts/results/test_fld_evaluation.json +30 -0
- wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +30 -0
- wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +8 -0
- wisent/examples/scripts/results/test_fld_pairs.json +8 -0
- wisent/examples/scripts/results/test_flores_evaluation.json +51 -0
- wisent/examples/scripts/results/test_flores_pairs.json +14 -0
- wisent/examples/scripts/results/test_freebase_evaluation.json +30 -0
- wisent/examples/scripts/results/test_freebase_pairs.json +8 -0
- wisent/examples/scripts/results/test_french_bench_evaluation.json +51 -0
- wisent/examples/scripts/results/test_french_bench_pairs.json +14 -0
- wisent/examples/scripts/results/test_galcola_evaluation.json +30 -0
- wisent/examples/scripts/results/test_galcola_pairs.json +8 -0
- wisent/examples/scripts/results/test_galician_bench_evaluation.json +51 -0
- wisent/examples/scripts/results/test_galician_bench_pairs.json +14 -0
- wisent/examples/scripts/results/test_glianorex_evaluation.json +30 -0
- wisent/examples/scripts/results/test_glianorex_pairs.json +8 -0
- wisent/examples/scripts/results/test_global_mmlu_evaluation.json +51 -0
- wisent/examples/scripts/results/test_global_mmlu_pairs.json +14 -0
- wisent/examples/scripts/results/test_glue_evaluation.json +51 -0
- wisent/examples/scripts/results/test_glue_pairs.json +14 -0
- wisent/examples/scripts/results/test_gpqa_evaluation.json +51 -0
- wisent/examples/scripts/results/test_gpqa_pairs.json +14 -0
- wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +51 -0
- wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +14 -0
- wisent/examples/scripts/results/test_groundcocoa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_groundcocoa_pairs.json +8 -0
- wisent/examples/scripts/results/test_gsm8k_evaluation.json +30 -0
- wisent/examples/scripts/results/test_gsm8k_pairs.json +8 -0
- wisent/examples/scripts/results/test_haerae_evaluation.json +51 -0
- wisent/examples/scripts/results/test_haerae_pairs.json +14 -0
- wisent/examples/scripts/results/test_headqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_headqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_hellaswag_evaluation.json +30 -0
- wisent/examples/scripts/results/test_hellaswag_pairs.json +8 -0
- wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +51 -0
- wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +14 -0
- wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +51 -0
- wisent/examples/scripts/results/test_hendrycks_math_pairs.json +14 -0
- wisent/examples/scripts/results/test_histoires_morales_evaluation.json +30 -0
- wisent/examples/scripts/results/test_histoires_morales_pairs.json +8 -0
- wisent/examples/scripts/results/test_hmmt_evaluation.json +30 -0
- wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +30 -0
- wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +8 -0
- wisent/examples/scripts/results/test_hmmt_pairs.json +8 -0
- wisent/examples/scripts/results/test_hrm8k_evaluation.json +51 -0
- wisent/examples/scripts/results/test_hrm8k_pairs.json +14 -0
- wisent/examples/scripts/results/test_humaneval_evaluation.json +30 -0
- wisent/examples/scripts/results/test_humaneval_pairs.json +8 -0
- wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +30 -0
- wisent/examples/scripts/results/test_humaneval_plus_pairs.json +8 -0
- wisent/examples/scripts/results/test_ifeval_evaluation.json +30 -0
- wisent/examples/scripts/results/test_ifeval_pairs.json +8 -0
- wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +30 -0
- wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +8 -0
- wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +30 -0
- wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +8 -0
- wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +51 -0
- wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +30 -0
- wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +8 -0
- wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +51 -0
- wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +14 -0
- wisent/examples/scripts/results/test_inverse_scaling_pairs.json +14 -0
- wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +30 -0
- wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +8 -0
- wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +30 -0
- wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +8 -0
- wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +30 -0
- wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +8 -0
- wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +30 -0
- wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +8 -0
- wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +30 -0
- wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +8 -0
- wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +51 -0
- wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +14 -0
- wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +30 -0
- wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +8 -0
- wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +30 -0
- wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +30 -0
- wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +8 -0
- wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +8 -0
- wisent/examples/scripts/results/test_kbl_evaluation.json +51 -0
- wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +51 -0
- wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +14 -0
- wisent/examples/scripts/results/test_kbl_pairs.json +14 -0
- wisent/examples/scripts/results/test_kmmlu_evaluation.json +51 -0
- wisent/examples/scripts/results/test_kmmlu_pairs.json +14 -0
- wisent/examples/scripts/results/test_kobest_evaluation.json +51 -0
- wisent/examples/scripts/results/test_kobest_pairs.json +14 -0
- wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +30 -0
- wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +8 -0
- wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_kormedmcqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +30 -0
- wisent/examples/scripts/results/test_lambada_cloze_pairs.json +8 -0
- wisent/examples/scripts/results/test_lambada_evaluation.json +30 -0
- wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
- wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
- wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +51 -0
- wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +14 -0
- wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +51 -0
- wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +14 -0
- wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +51 -0
- wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +14 -0
- wisent/examples/scripts/results/test_lambada_openai_evaluation.json +30 -0
- wisent/examples/scripts/results/test_lambada_openai_pairs.json +8 -0
- wisent/examples/scripts/results/test_lambada_pairs.json +8 -0
- wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
- wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
- wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
- wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
- wisent/examples/scripts/results/test_lambada_standard_evaluation.json +30 -0
- wisent/examples/scripts/results/test_lambada_standard_pairs.json +8 -0
- wisent/examples/scripts/results/test_leaderboard_evaluation.json +51 -0
- wisent/examples/scripts/results/test_leaderboard_pairs.json +14 -0
- wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +51 -0
- wisent/examples/scripts/results/test_libra/test_libra_pairs.json +14 -0
- wisent/examples/scripts/results/test_libra_evaluation.json +51 -0
- wisent/examples/scripts/results/test_libra_pairs.json +14 -0
- wisent/examples/scripts/results/test_lingoly_evaluation.json +30 -0
- wisent/examples/scripts/results/test_lingoly_pairs.json +8 -0
- wisent/examples/scripts/results/test_livecodebench_evaluation.json +30 -0
- wisent/examples/scripts/results/test_livecodebench_pairs.json +8 -0
- wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +30 -0
- wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +8 -0
- wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +30 -0
- wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +8 -0
- wisent/examples/scripts/results/test_llama_evaluation.json +30 -0
- wisent/examples/scripts/results/test_llama_pairs.json +8 -0
- wisent/examples/scripts/results/test_logiqa2_evaluation.json +30 -0
- wisent/examples/scripts/results/test_logiqa2_pairs.json +8 -0
- wisent/examples/scripts/results/test_logiqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_logiqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_m_mmlu_evaluation.json +51 -0
- wisent/examples/scripts/results/test_m_mmlu_pairs.json +14 -0
- wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +51 -0
- wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +14 -0
- wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +8 -0
- wisent/examples/scripts/results/test_mastermind_evaluation.json +51 -0
- wisent/examples/scripts/results/test_mastermind_pairs.json +14 -0
- wisent/examples/scripts/results/test_math500_evaluation.json +30 -0
- wisent/examples/scripts/results/test_math500_pairs.json +8 -0
- wisent/examples/scripts/results/test_math_evaluation.json +30 -0
- wisent/examples/scripts/results/test_math_pairs.json +8 -0
- wisent/examples/scripts/results/test_mathqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mathqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_mbpp_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mbpp_pairs.json +8 -0
- wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mbpp_plus_pairs.json +8 -0
- wisent/examples/scripts/results/test_mc_taco_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mc_taco_pairs.json +8 -0
- wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +51 -0
- wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +14 -0
- wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +30 -0
- wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +8 -0
- wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +51 -0
- wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +14 -0
- wisent/examples/scripts/results/test_meddialog_evaluation.json +30 -0
- wisent/examples/scripts/results/test_meddialog_pairs.json +8 -0
- wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +30 -0
- wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +8 -0
- wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +8 -0
- wisent/examples/scripts/results/test_medmcqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_medmcqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_medqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_medqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_medtext_evaluation.json +30 -0
- wisent/examples/scripts/results/test_medtext_pairs.json +8 -0
- wisent/examples/scripts/results/test_mela_evaluation.json +51 -0
- wisent/examples/scripts/results/test_mela_pairs.json +14 -0
- wisent/examples/scripts/results/test_meqsum_evaluation.json +30 -0
- wisent/examples/scripts/results/test_meqsum_pairs.json +8 -0
- wisent/examples/scripts/results/test_mercury_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mercury_pairs.json +8 -0
- wisent/examples/scripts/results/test_metabench_evaluation.json +51 -0
- wisent/examples/scripts/results/test_metabench_pairs.json +14 -0
- wisent/examples/scripts/results/test_mgsm_evaluation.json +51 -0
- wisent/examples/scripts/results/test_mgsm_pairs.json +14 -0
- wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mimic_repsum_pairs.json +8 -0
- wisent/examples/scripts/results/test_minerva_math_evaluation.json +51 -0
- wisent/examples/scripts/results/test_minerva_math_pairs.json +14 -0
- wisent/examples/scripts/results/test_mlqa_evaluation.json +51 -0
- wisent/examples/scripts/results/test_mlqa_pairs.json +14 -0
- wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +51 -0
- wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +14 -0
- wisent/examples/scripts/results/test_mmlu_evaluation.json +51 -0
- wisent/examples/scripts/results/test_mmlu_pairs.json +14 -0
- wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +51 -0
- wisent/examples/scripts/results/test_mmlu_pro_pairs.json +14 -0
- wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +51 -0
- wisent/examples/scripts/results/test_mmlu_prox_pairs.json +14 -0
- wisent/examples/scripts/results/test_mmlusr_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mmlusr_pairs.json +8 -0
- wisent/examples/scripts/results/test_mmmu_evaluation.json +51 -0
- wisent/examples/scripts/results/test_mmmu_pairs.json +14 -0
- wisent/examples/scripts/results/test_mnli_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mnli_pairs.json +8 -0
- wisent/examples/scripts/results/test_model_written_evals_evaluation.json +51 -0
- wisent/examples/scripts/results/test_model_written_evals_pairs.json +14 -0
- wisent/examples/scripts/results/test_moral_stories_evaluation.json +30 -0
- wisent/examples/scripts/results/test_moral_stories_pairs.json +8 -0
- wisent/examples/scripts/results/test_mts_dialog_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mts_dialog_pairs.json +8 -0
- wisent/examples/scripts/results/test_multiblimp_evaluation.json +51 -0
- wisent/examples/scripts/results/test_multiblimp_pairs.json +14 -0
- wisent/examples/scripts/results/test_multimedqa_evaluation.json +51 -0
- wisent/examples/scripts/results/test_multimedqa_pairs.json +14 -0
- wisent/examples/scripts/results/test_multipl_e_evaluation.json +30 -0
- wisent/examples/scripts/results/test_multipl_e_pairs.json +8 -0
- wisent/examples/scripts/results/test_mutual_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mutual_pairs.json +8 -0
- wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +30 -0
- wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +8 -0
- wisent/examples/scripts/results/test_noreval_evaluation.json +51 -0
- wisent/examples/scripts/results/test_noreval_pairs.json +14 -0
- wisent/examples/scripts/results/test_noticia_evaluation.json +30 -0
- wisent/examples/scripts/results/test_noticia_pairs.json +8 -0
- wisent/examples/scripts/results/test_nq_open_evaluation.json +30 -0
- wisent/examples/scripts/results/test_nq_open_pairs.json +8 -0
- wisent/examples/scripts/results/test_olaph_evaluation.json +30 -0
- wisent/examples/scripts/results/test_olaph_pairs.json +8 -0
- wisent/examples/scripts/results/test_openbookqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_openbookqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_openllm_evaluation.json +51 -0
- wisent/examples/scripts/results/test_openllm_pairs.json +14 -0
- wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +30 -0
- wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +8 -0
- wisent/examples/scripts/results/test_paloma_evaluation.json +51 -0
- wisent/examples/scripts/results/test_paloma_pairs.json +14 -0
- wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +30 -0
- wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +8 -0
- wisent/examples/scripts/results/test_paws-x_evaluation.json +51 -0
- wisent/examples/scripts/results/test_paws-x_pairs.json +14 -0
- wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +30 -0
- wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +8 -0
- wisent/examples/scripts/results/test_penn_treebank_evaluation.json +30 -0
- wisent/examples/scripts/results/test_penn_treebank_pairs.json +8 -0
- wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +30 -0
- wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +8 -0
- wisent/examples/scripts/results/test_piqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_piqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_polemo2_evaluation.json +30 -0
- wisent/examples/scripts/results/test_polemo2_pairs.json +8 -0
- wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +30 -0
- wisent/examples/scripts/results/test_polymath_en_high_pairs.json +8 -0
- wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +30 -0
- wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +8 -0
- wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +30 -0
- wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +8 -0
- wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +30 -0
- wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +8 -0
- wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +51 -0
- wisent/examples/scripts/results/test_portuguese_bench_pairs.json +14 -0
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +30 -0
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +8 -0
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +30 -0
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +8 -0
- wisent/examples/scripts/results/test_prost_evaluation.json +30 -0
- wisent/examples/scripts/results/test_prost_pairs.json +8 -0
- wisent/examples/scripts/results/test_ptb_evaluation.json +30 -0
- wisent/examples/scripts/results/test_ptb_pairs.json +8 -0
- wisent/examples/scripts/results/test_pubmedqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_pubmedqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_pythia_evaluation.json +51 -0
- wisent/examples/scripts/results/test_pythia_pairs.json +14 -0
- wisent/examples/scripts/results/test_qa4mre_evaluation.json +30 -0
- wisent/examples/scripts/results/test_qa4mre_pairs.json +8 -0
- wisent/examples/scripts/results/test_qasper_evaluation.json +30 -0
- wisent/examples/scripts/results/test_qasper_pairs.json +8 -0
- wisent/examples/scripts/results/test_race_evaluation.json +30 -0
- wisent/examples/scripts/results/test_race_pairs.json +8 -0
- wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +30 -0
- wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +8 -0
- wisent/examples/scripts/results/test_recode_evaluation.json +30 -0
- wisent/examples/scripts/results/test_recode_pairs.json +8 -0
- wisent/examples/scripts/results/test_record_evaluation.json +30 -0
- wisent/examples/scripts/results/test_record_pairs.json +8 -0
- wisent/examples/scripts/results/test_ruler_evaluation.json +51 -0
- wisent/examples/scripts/results/test_ruler_pairs.json +14 -0
- wisent/examples/scripts/results/test_sciq_evaluation.json +30 -0
- wisent/examples/scripts/results/test_sciq_pairs.json +8 -0
- wisent/examples/scripts/results/test_score_evaluation.json +51 -0
- wisent/examples/scripts/results/test_score_pairs.json +14 -0
- wisent/examples/scripts/results/test_self_consistency_evaluation.json +30 -0
- wisent/examples/scripts/results/test_self_consistency_pairs.json +8 -0
- wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_siqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_siqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_spanish_bench_evaluation.json +51 -0
- wisent/examples/scripts/results/test_spanish_bench_pairs.json +14 -0
- wisent/examples/scripts/results/test_squad2_evaluation.json +30 -0
- wisent/examples/scripts/results/test_squad2_pairs.json +8 -0
- wisent/examples/scripts/results/test_squadv2_evaluation.json +30 -0
- wisent/examples/scripts/results/test_squadv2_pairs.json +8 -0
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +30 -0
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +8 -0
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +51 -0
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +14 -0
- wisent/examples/scripts/results/test_swag_evaluation.json +30 -0
- wisent/examples/scripts/results/test_swag_pairs.json +8 -0
- wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +51 -0
- wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +14 -0
- wisent/examples/scripts/results/test_tmmluplus_evaluation.json +51 -0
- wisent/examples/scripts/results/test_tmmluplus_pairs.json +14 -0
- wisent/examples/scripts/results/test_translation_evaluation.json +51 -0
- wisent/examples/scripts/results/test_translation_pairs.json +14 -0
- wisent/examples/scripts/results/test_triviaqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_triviaqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +51 -0
- wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +14 -0
- wisent/examples/scripts/results/test_truthfulqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +30 -0
- wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +8 -0
- wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +30 -0
- wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +8 -0
- wisent/examples/scripts/results/test_truthfulqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +51 -0
- wisent/examples/scripts/results/test_turkishmmlu_pairs.json +14 -0
- wisent/examples/scripts/results/test_unfair_tos_evaluation.json +30 -0
- wisent/examples/scripts/results/test_unfair_tos_pairs.json +8 -0
- wisent/examples/scripts/results/test_unscramble_evaluation.json +51 -0
- wisent/examples/scripts/results/test_unscramble_pairs.json +14 -0
- wisent/examples/scripts/results/test_webqs_evaluation.json +30 -0
- wisent/examples/scripts/results/test_webqs_pairs.json +8 -0
- wisent/examples/scripts/results/test_wikitext103_evaluation.json +30 -0
- wisent/examples/scripts/results/test_wikitext103_pairs.json +8 -0
- wisent/examples/scripts/results/test_wikitext_evaluation.json +30 -0
- wisent/examples/scripts/results/test_wikitext_pairs.json +8 -0
- wisent/examples/scripts/results/test_winogender_evaluation.json +51 -0
- wisent/examples/scripts/results/test_winogender_pairs.json +14 -0
- wisent/examples/scripts/results/test_winogrande_evaluation.json +30 -0
- wisent/examples/scripts/results/test_winogrande_pairs.json +8 -0
- wisent/examples/scripts/results/test_wmdp_evaluation.json +30 -0
- wisent/examples/scripts/results/test_wmdp_pairs.json +8 -0
- wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +30 -0
- wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +8 -0
- wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +30 -0
- wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +8 -0
- wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +30 -0
- wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +8 -0
- wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +30 -0
- wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +8 -0
- wisent/examples/scripts/results/test_wsc273_evaluation.json +30 -0
- wisent/examples/scripts/results/test_wsc273_pairs.json +8 -0
- wisent/examples/scripts/results/test_xcopa_evaluation.json +51 -0
- wisent/examples/scripts/results/test_xcopa_pairs.json +14 -0
- wisent/examples/scripts/results/test_xnli_eu_evaluation.json +30 -0
- wisent/examples/scripts/results/test_xnli_eu_pairs.json +8 -0
- wisent/examples/scripts/results/test_xnli_evaluation.json +51 -0
- wisent/examples/scripts/results/test_xnli_pairs.json +14 -0
- wisent/examples/scripts/results/test_xquad_evaluation.json +51 -0
- wisent/examples/scripts/results/test_xquad_pairs.json +14 -0
- wisent/examples/scripts/results/test_xstorycloze_evaluation.json +51 -0
- wisent/examples/scripts/results/test_xstorycloze_pairs.json +14 -0
- wisent/examples/scripts/results/test_xsum_evaluation.json +30 -0
- wisent/examples/scripts/results/test_xsum_pairs.json +8 -0
- wisent/examples/scripts/results/test_xwinograd_evaluation.json +51 -0
- wisent/examples/scripts/results/test_xwinograd_pairs.json +14 -0
- wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +30 -0
- wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +8 -0
- wisent/parameters/__init__.py +1 -0
- wisent/parameters/lm_eval/all_lm_eval_task_families.json +169 -0
- wisent/parameters/lm_eval/broken_in_lm_eval.json +10 -0
- wisent/parameters/lm_eval/evaluations_not_lm_eval_tasks.json +0 -0
- wisent/parameters/lm_eval/evaluator_check.json +3476 -0
- wisent/parameters/lm_eval/final_verification.json +24782 -0
- wisent/parameters/lm_eval/group_task_evaluators.json +1833 -0
- wisent/parameters/lm_eval/group_tasks.json +150 -0
- wisent/parameters/lm_eval/individual_tasks.json +402 -0
- wisent/parameters/lm_eval/no_readmes.json +1 -0
- wisent/parameters/lm_eval/not_lm_eval_tasks.json +110 -0
- wisent/parameters/lm_eval/read_tasks.json +208 -0
- wisent/parameters/lm_eval/readme_files.json +208 -0
- wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +128 -0
- wisent/parameters/tasks/missing_task_families.json +2963 -0
- wisent/parameters/tasks/remaining_tasks_to_implement.json +199 -0
- wisent/parameters/tasks/risks.json +10 -0
- wisent/parameters/tasks/skills.json +14 -0
- wisent/parameters/tasks/tasks.json +56031 -0
- wisent/scripts/run_quality_metrics_sweep.sh +315 -0
- wisent/tests/__init__.py +0 -0
- wisent/tests/examples/__init__.py +0 -0
- wisent/tests/examples/cli/__init__.py +0 -0
- wisent/tests/examples/cli/activations/__init__.py +0 -0
- wisent/tests/examples/cli/activations/test_get_activations.py +127 -0
- wisent/tests/examples/cli/classifier/__init__.py +0 -0
- wisent/tests/examples/cli/classifier/test_classifier_examples.py +141 -0
- wisent/tests/examples/cli/contrastive_pairs/__init__.py +0 -0
- wisent/tests/examples/cli/contrastive_pairs/test_generate_pairs.py +89 -0
- wisent/tests/examples/cli/evaluation/__init__.py +0 -0
- wisent/tests/examples/cli/evaluation/test_evaluation_examples.py +117 -0
- wisent/tests/examples/cli/generate/__init__.py +0 -0
- wisent/tests/examples/cli/generate/test_generate_with_classifier.py +146 -0
- wisent/tests/examples/cli/generate/test_generate_with_steering.py +149 -0
- wisent/tests/examples/cli/generate/test_only_generate.py +110 -0
- wisent/tests/examples/cli/multi_steering/__init__.py +0 -0
- wisent/tests/examples/cli/multi_steering/test_multi_steer_from_trained_vectors.py +210 -0
- wisent/tests/examples/cli/multi_steering/test_multi_steer_with_different_parameters.py +205 -0
- wisent/tests/examples/cli/multi_steering/test_train_and_multi_steer.py +174 -0
- wisent/tests/examples/cli/optimizer/__init__.py +0 -0
- wisent/tests/examples/cli/optimizer/test_optimize_sample_size.py +102 -0
- wisent/tests/examples/cli/optimizer/test_optimizer_examples.py +59 -0
- wisent/tests/examples/cli/steering/__init__.py +0 -0
- wisent/tests/examples/cli/steering/test_create_steering_vectors.py +135 -0
- wisent/tests/examples/cli/synthetic/__init__.py +0 -0
- wisent/tests/examples/cli/synthetic/test_synthetic_pairs.py +45 -0
- wisent/tests/nosense/__init__.py +6 -0
- wisent/tests/nosense/base_nosense.py +81 -0
- wisent/tests/nosense/math500_nosense.py +72 -0
- wisent/tests/nosense/test_robustness.py +336 -0
- wisent/tests/test_all_cli_commands.py +674 -0
- wisent/tests/test_geometry_comprehensive.py +327 -0
- wisent/tests/test_titan_geometry.py +257 -0
- wisent/tests/visualize_geometry.py +148 -0
- wisent-0.7.379.dist-info/METADATA +64 -0
- wisent-0.7.379.dist-info/RECORD +1720 -0
- wisent-0.7.379.dist-info/WHEEL +5 -0
- wisent-0.7.379.dist-info/entry_points.txt +2 -0
- wisent-0.7.379.dist-info/licenses/LICENSE +21 -0
- wisent-0.7.379.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,494 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Any, TYPE_CHECKING
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
# Configure TensorFlow threading BEFORE any TensorFlow import (the assignments below overwrite any value already present in the environment)
|
|
7
|
+
# TensorFlow (used by BLEURT metric in meddialog and other tasks) can deadlock during model loading
|
|
8
|
+
# when using default threading settings. Limit TensorFlow inter-op/intra-op and OpenMP threads to 1 to prevent deadlock.
|
|
9
|
+
os.environ['TF_NUM_INTEROP_THREADS'] = '1'
|
|
10
|
+
os.environ['TF_NUM_INTRAOP_THREADS'] = '1'
|
|
11
|
+
os.environ['OMP_NUM_THREADS'] = '1'
|
|
12
|
+
|
|
13
|
+
# Enable trust_remote_code for all datasets (required for meddialog and others) by setting the flag process-wide on datasets.config
|
|
14
|
+
# This uses lm-eval's recommended approach from PR #1998
|
|
15
|
+
import datasets.config
|
|
16
|
+
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
|
|
17
|
+
|
|
18
|
+
# Patch deprecated 'List' feature type (datasets v3.6.0+): alias it to 'LargeList' only when 'List' is missing and 'LargeList' is registered
|
|
19
|
+
# Many older datasets use 'List' which was replaced by 'LargeList'. NOTE(review): this touches the private _FEATURE_TYPES registry - confirm it still exists when upgrading datasets
|
|
20
|
+
import datasets.features.features as _features_module
|
|
21
|
+
if 'List' not in _features_module._FEATURE_TYPES and 'LargeList' in _features_module._FEATURE_TYPES:
|
|
22
|
+
_features_module._FEATURE_TYPES['List'] = _features_module._FEATURE_TYPES['LargeList']
|
|
23
|
+
|
|
24
|
+
from wisent.core.data_loaders.core.atoms import BaseDataLoader, DataLoaderError, LoadDataResult
|
|
25
|
+
from wisent.core.contrastive_pairs.core.pair import ContrastivePair
|
|
26
|
+
from wisent.core.contrastive_pairs.core.set import ContrastivePairSet
|
|
27
|
+
from lm_eval.tasks import get_task_dict
|
|
28
|
+
from lm_eval.tasks import TaskManager as LMTaskManager
|
|
29
|
+
from wisent.core.contrastive_pairs.lm_eval_pairs.lm_task_pairs_generation import (
|
|
30
|
+
lm_build_contrastive_pairs,
|
|
31
|
+
)
|
|
32
|
+
from wisent.core.data_loaders.loaders.lm_loader_special_cases import get_special_case_handler
|
|
33
|
+
|
|
34
|
+
if TYPE_CHECKING:
|
|
35
|
+
from lm_eval.api.task import ConfigurableTask
|
|
36
|
+
|
|
37
|
+
__all__ = [
|
|
38
|
+
"LMEvalDataLoader",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
log = logging.getLogger(__name__)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class LMEvalDataLoader(BaseDataLoader):
|
|
45
|
+
"""
|
|
46
|
+
Load contrastive pairs from a single lm-evaluation-harness task via `load_lm_eval_task`,
|
|
47
|
+
split into train/test, and return a canonical LoadDataResult.
|
|
48
|
+
"""
|
|
49
|
+
name = "lm_eval"
|
|
50
|
+
description = "Load from a single lm-eval task."
|
|
51
|
+
|
|
52
|
+
# Tasks that are HuggingFace-only (not in lm-eval-harness)
|
|
53
|
+
# Loaded from central benchmark_registry
|
|
54
|
+
_huggingface_only_tasks_cache = None
|
|
55
|
+
|
|
56
|
+
@classmethod
def _get_huggingface_only_tasks(cls):
    """
    Return the set of HuggingFace-only task names, resolving it on first use.

    The value comes from ``get_huggingface_only_tasks_set()`` in the central
    benchmark registry and is stored on the class in
    ``_huggingface_only_tasks_cache``; the registry is only consulted while
    that attribute is still ``None``.
    """
    cached_tasks = cls._huggingface_only_tasks_cache
    if cached_tasks is not None:
        return cached_tasks

    # Imported at call time, not at module import time.
    from wisent.core.benchmark_registry import get_huggingface_only_tasks_set

    cls._huggingface_only_tasks_cache = get_huggingface_only_tasks_set()
    return cls._huggingface_only_tasks_cache
|
|
63
|
+
|
|
64
|
+
def _package_pairs(
    self,
    pairs,
    task_name: str,
    lm_task_data,
    split_ratio: float,
    seed: int,
    training_limit: int | None,
    testing_limit: int | None,
) -> LoadDataResult:
    """Split pairs into train/test sets, validate both, and wrap them in a LoadDataResult."""
    train_pairs, test_pairs = self._split_pairs(
        pairs, split_ratio, seed, training_limit, testing_limit
    )

    if not train_pairs or not test_pairs:
        raise DataLoaderError("One of the splits is empty after splitting.")

    train_set = ContrastivePairSet("lm_eval_train", train_pairs, task_type=task_name)
    test_set = ContrastivePairSet("lm_eval_test", test_pairs, task_type=task_name)

    # Validation is advisory here: critical findings do not raise.
    train_set.validate(raise_on_critical=False)
    test_set.validate(raise_on_critical=False)

    return LoadDataResult(
        train_qa_pairs=train_set,
        test_qa_pairs=test_set,
        task_type=task_name,
        lm_task_data=lm_task_data,
    )

def _load_one_task(
    self,
    task_name: str,
    split_ratio: float,
    seed: int,
    limit: int | None,
    training_limit: int | None,
    testing_limit: int | None,
) -> LoadDataResult:
    """
    Load one task by name, convert it to contrastive pairs, split the pairs
    into train/test, and return a LoadDataResult.

    arguments:
        task_name: The name of the task to load.
        split_ratio: Split ratio forwarded to `_split_pairs`.
        seed: Random seed forwarded to `_split_pairs`.
        limit: Optional limit on the total number of pairs to load.
            A falsy value (None or 0) means no limit is applied here.
        training_limit: Optional limit on number of training pairs.
        testing_limit: Optional limit on number of testing pairs.

    returns:
        A LoadDataResult containing the train/test pair sets, the task name,
        and the lm-eval task object (None for HuggingFace-only tasks; the
        last subtask object for group tasks).

    raises:
        DataLoaderError if no pairs could be loaded from any subtask of a
        group task, or if either split is empty after splitting.
        Exceptions raised by `load_lm_eval_task`, `lm_build_contrastive_pairs`
        (outside the per-subtask loop) or `_split_pairs` propagate unchanged.

    note:
        Three shapes are handled: HuggingFace-only tasks (no lm-eval task
        object), single lm-eval tasks, and group tasks returned as a dict of
        subtasks. For group tasks the limit is divided evenly across the
        subtasks; when the limit is smaller than the number of subtasks, each
        subtask is asked for one pair and loading stops once `limit` pairs
        have been collected.
    """
    # HuggingFace-only tasks have no lm-eval task object at all.
    if task_name.lower() in self._get_huggingface_only_tasks():
        log.info(f"Task '{task_name}' is a HuggingFace-only task, loading via HuggingFace extractor")
        pairs = lm_build_contrastive_pairs(
            task_name=task_name,
            lm_eval_task=None,  # HuggingFace extractors don't need lm-eval task
            limit=limit,
        )
        return self._package_pairs(
            pairs, task_name, None, split_ratio, seed, training_limit, testing_limit
        )

    loaded = self.load_lm_eval_task(task_name)

    if not isinstance(loaded, dict):
        # Plain single task object.
        task_obj = loaded
        pairs = lm_build_contrastive_pairs(
            task_name=task_name,
            lm_eval_task=task_obj,
            limit=limit,
        )
    elif len(loaded) == 1:
        # Dict wrapping exactly one subtask: build pairs under the subtask's own name.
        (subname, task_obj), = loaded.items()
        pairs = lm_build_contrastive_pairs(
            task_name=subname,
            lm_eval_task=task_obj,
            limit=limit,
        )
    else:
        # Group task with multiple subtasks - load all and combine.
        subtask_count = len(loaded)
        log.info(f"Task '{task_name}' is a group task with {len(loaded)} subtasks. Loading all subtasks...")
        # Same message mirrored on stdout.
        print(f"Task '{task_name}' is a group task with {len(loaded)} subtasks. Loading all subtasks...")

        pairs_per_subtask = None
        stop_at_limit = False
        if limit and subtask_count:
            pairs_per_subtask = limit // subtask_count
            if 0 < limit < subtask_count:
                # Floor division would yield 0 here; request one pair per
                # subtask instead and stop once `limit` pairs are collected.
                pairs_per_subtask = 1
                stop_at_limit = True

        all_pairs = []
        for subname, subtask_obj in loaded.items():
            if stop_at_limit and len(all_pairs) >= limit:
                break
            try:
                subtask_pairs = lm_build_contrastive_pairs(
                    task_name=subname,
                    lm_eval_task=subtask_obj,
                    limit=pairs_per_subtask,
                )
                all_pairs.extend(subtask_pairs)
                log.info(f"Loaded {len(subtask_pairs)} pairs from subtask '{subname}'")
            except Exception as e:
                # Best effort: one failing subtask must not abort the whole group.
                log.warning(f"Failed to load subtask '{subname}': {e}")
                continue

        if not all_pairs:
            raise DataLoaderError(f"No pairs could be loaded from any subtask of '{task_name}'")

        pairs = all_pairs
        log.info(f"Combined {len(pairs)} total pairs from {len(loaded)} subtasks")

        # The result has always carried the last subtask object of the group;
        # keep that even when the loop above stopped early.
        task_obj = list(loaded.values())[-1]

    return self._package_pairs(
        pairs, task_name, task_obj, split_ratio, seed, training_limit, testing_limit
    )
|
|
194
|
+
|
|
195
|
+
def load(
    self,
    task: str,
    split_ratio: float | None = None,
    seed: int = 42,
    limit: int | None = None,
    training_limit: int | None = None,
    testing_limit: int | None = None,
    **_: Any,
) -> LoadDataResult:
    """
    Build train/test contrastive pair sets for one named task.

    Resolves the split ratio through `_effective_split`, then forwards all
    arguments to `_load_one_task`.

    arguments:
        task:
            The name of the task to load (e.g., "winogrande", "hellaswag").
            It is coerced with str() before being forwarded.
        split_ratio:
            Proportion of data to use for training. The value (including
            None) is passed to `_effective_split`, which decides the ratio
            actually used.
        seed:
            Random seed forwarded for shuffling/splitting.
        limit:
            Optional maximum number of total pairs to load from the task.
        training_limit:
            Optional maximum number of training pairs to return.
        testing_limit:
            Optional maximum number of testing pairs to return.
        **_:
            Additional keyword arguments (ignored).

    returns:
        The LoadDataResult produced by `_load_one_task`.

    raises:
        Anything raised by `_effective_split` or `_load_one_task`.
    """
    effective_ratio = self._effective_split(split_ratio)

    forwarded = {
        "task_name": str(task),
        "split_ratio": effective_ratio,
        "seed": seed,
        "limit": limit,
        "training_limit": training_limit,
        "testing_limit": testing_limit,
    }
    return self._load_one_task(**forwarded)
|
|
244
|
+
|
|
245
|
+
@staticmethod
|
|
246
|
+
def load_lm_eval_task(task_name: str) -> ConfigurableTask | dict[str, ConfigurableTask]:
|
|
247
|
+
"""
|
|
248
|
+
Load a single lm-eval-harness task by name.
|
|
249
|
+
|
|
250
|
+
arguments:
|
|
251
|
+
task_name: The name of the lm-eval task to load.
|
|
252
|
+
|
|
253
|
+
returns:
|
|
254
|
+
A ConfigurableTask instance or a dict of subtask name to ConfigurableTask.
|
|
255
|
+
For group tasks, flattens nested groups and returns all leaf tasks.
|
|
256
|
+
|
|
257
|
+
raises:
|
|
258
|
+
DataLoaderError if the task cannot be found.
|
|
259
|
+
"""
|
|
260
|
+
# Map task names to their lm-eval equivalents
|
|
261
|
+
task_name_mapping = {
|
|
262
|
+
"squad2": "squadv2",
|
|
263
|
+
"wikitext103": "wikitext",
|
|
264
|
+
"ptb": "wikitext",
|
|
265
|
+
"penn_treebank": "wikitext",
|
|
266
|
+
"ArabCulture": "arab_culture",
|
|
267
|
+
"arabculture": "arab_culture",
|
|
268
|
+
"aradice": "AraDiCE",
|
|
269
|
+
"afrimgsm_direct_amh": "afrimgsm_amh_prompt_1",
|
|
270
|
+
"afrimmlu_direct_amh": "afrimmlu_direct_amh_prompt_1",
|
|
271
|
+
"babilong": "ru_babilong_qa1",
|
|
272
|
+
"bangla_mmlu": "global_mmlu_bn_business",
|
|
273
|
+
"basque-glue": "basque_bench",
|
|
274
|
+
"basqueglue": "basque_bench",
|
|
275
|
+
"bec2016eu": "basque_bench",
|
|
276
|
+
"benchmarks": "tinyBenchmarks",
|
|
277
|
+
"careqa": "careqa_en",
|
|
278
|
+
"ceval": "ceval-valid",
|
|
279
|
+
"ceval_valid": "ceval-valid",
|
|
280
|
+
"code_x_glue": "code2text_python", # code_x_glue maps to code2text_python in lm-eval
|
|
281
|
+
"darija_bench": "darija_sentiment",
|
|
282
|
+
"eus_exams": "eus_exams_es",
|
|
283
|
+
"evalita_llm": "evalita-mp",
|
|
284
|
+
"evalita_mp": "evalita-mp",
|
|
285
|
+
"evalita_sp_sum_task_fp-small_p1": "evalita-sp_sum_task_fp-small_p1",
|
|
286
|
+
"fld": "fld_default",
|
|
287
|
+
"instruct_humaneval": "humaneval_instruct",
|
|
288
|
+
"instructhumaneval": "humaneval_instruct",
|
|
289
|
+
# Case-sensitivity fixes
|
|
290
|
+
"tinyarc": "tinyArc",
|
|
291
|
+
"tinygsm8k": "tinyGSM8k",
|
|
292
|
+
"tinyhellaswag": "tinyHellaswag",
|
|
293
|
+
"tinymmlu": "tinyMMLU",
|
|
294
|
+
"tinytruthfulqa": "tinyTruthfulQA",
|
|
295
|
+
"tinywinogrande": "tinyWinogrande",
|
|
296
|
+
"paws-x": "pawsx",
|
|
297
|
+
}
|
|
298
|
+
|
|
299
|
+
# Use mapped name if available, otherwise use original
|
|
300
|
+
lm_eval_task_name = task_name_mapping.get(task_name, task_name)
|
|
301
|
+
if lm_eval_task_name != task_name:
|
|
302
|
+
log.info(f"Mapping task '{task_name}' to lm-eval task '{lm_eval_task_name}'")
|
|
303
|
+
|
|
304
|
+
# Tasks that require case-sensitive names (don't lowercase these)
|
|
305
|
+
case_sensitive_prefixes = {"tinyBenchmarks"}
|
|
306
|
+
|
|
307
|
+
# Normalize task name to lowercase for lm-eval-harness compatibility
|
|
308
|
+
# Many lm-eval tasks use lowercase names (e.g., "aradice" not "AraDICE")
|
|
309
|
+
# Check if task name starts with any case-sensitive prefix
|
|
310
|
+
is_case_sensitive = any(lm_eval_task_name.startswith(prefix) for prefix in case_sensitive_prefixes)
|
|
311
|
+
if not is_case_sensitive:
|
|
312
|
+
lm_eval_task_name_normalized = lm_eval_task_name.lower()
|
|
313
|
+
if lm_eval_task_name_normalized != lm_eval_task_name:
|
|
314
|
+
log.info(f"Normalizing task name to lowercase: '{lm_eval_task_name}' -> '{lm_eval_task_name_normalized}'")
|
|
315
|
+
lm_eval_task_name = lm_eval_task_name_normalized
|
|
316
|
+
|
|
317
|
+
# Check if this is a ruler task that requires pretrained model for tokenizer
|
|
318
|
+
is_ruler_task = lm_eval_task_name == 'ruler' or lm_eval_task_name.startswith('ruler_') or lm_eval_task_name.startswith('niah_')
|
|
319
|
+
|
|
320
|
+
if is_ruler_task:
|
|
321
|
+
# Ruler tasks require a pretrained model name for tokenizer initialization
|
|
322
|
+
task_manager = LMTaskManager(
|
|
323
|
+
verbosity="INFO",
|
|
324
|
+
metadata={"pretrained": "meta-llama/Llama-3.2-1B-Instruct"}
|
|
325
|
+
)
|
|
326
|
+
task_manager.initialize_tasks()
|
|
327
|
+
else:
|
|
328
|
+
task_manager = LMTaskManager()
|
|
329
|
+
task_manager.initialize_tasks()
|
|
330
|
+
|
|
331
|
+
# Check if this is a group task name that needs expansion to all subtasks
|
|
332
|
+
# EXPLICIT lists - NO pattern matching
|
|
333
|
+
group_task_expansions = {
|
|
334
|
+
"aradice": ["AraDiCE_ArabicMMLU_lev", "AraDiCE_ArabicMMLU_egy", "AraDiCE_boolq_egy", "AraDiCE_boolq_eng", "AraDiCE_boolq_lev", "AraDiCE_boolq_msa", "AraDiCE_egypt_cultural", "AraDiCE_jordan_cultural", "AraDiCE_lebanon_cultural", "AraDiCE_palestine_cultural", "AraDiCE_qatar_cultural", "AraDiCE_syria_cultural", "AraDiCE_openbookqa_egy", "AraDiCE_openbookqa_eng", "AraDiCE_openbookqa_lev", "AraDiCE_openbookqa_msa", "AraDiCE_piqa_egy", "AraDiCE_piqa_eng", "AraDiCE_piqa_lev", "AraDiCE_piqa_msa", "AraDiCE_truthfulqa_mc1_egy", "AraDiCE_truthfulqa_mc1_eng", "AraDiCE_truthfulqa_mc1_lev", "AraDiCE_truthfulqa_mc1_msa", "AraDiCE_winogrande_egy", "AraDiCE_winogrande_eng", "AraDiCE_winogrande_lev", "AraDiCE_winogrande_msa"],
|
|
335
|
+
"meddialog": ["meddialog_qsumm", "meddialog_qsumm_perplexity", "meddialog_raw_dialogues", "meddialog_raw_perplexity"],
|
|
336
|
+
"mgsm": ["mgsm_cot_native", "mgsm_direct", "mgsm_direct_bn", "mgsm_direct_ca", "mgsm_direct_de", "mgsm_direct_en", "mgsm_direct_es", "mgsm_direct_es_spanish_bench", "mgsm_direct_eu", "mgsm_direct_fr", "mgsm_direct_gl", "mgsm_direct_ja", "mgsm_direct_ru", "mgsm_direct_sw", "mgsm_direct_te", "mgsm_direct_th", "mgsm_direct_zh", "mgsm_en_cot_bn", "mgsm_en_cot_de", "mgsm_en_cot_en", "mgsm_en_cot_es", "mgsm_en_cot_fr", "mgsm_en_cot_ja", "mgsm_en_cot_ru", "mgsm_en_cot_sw", "mgsm_en_cot_te", "mgsm_en_cot_th", "mgsm_en_cot_zh", "mgsm_native_cot_bn", "mgsm_native_cot_de", "mgsm_native_cot_en", "mgsm_native_cot_es", "mgsm_native_cot_eu", "mgsm_native_cot_fr", "mgsm_native_cot_ja", "mgsm_native_cot_ru", "mgsm_native_cot_sw", "mgsm_native_cot_te", "mgsm_native_cot_th", "mgsm_native_cot_zh"],
|
|
337
|
+
"mlqa": ["mlqa_ar_ar", "mlqa_ar_de", "mlqa_ar_en", "mlqa_ar_es", "mlqa_ar_hi", "mlqa_ar_vi", "mlqa_ar_zh", "mlqa_de_ar", "mlqa_de_de", "mlqa_de_en", "mlqa_de_es", "mlqa_de_hi", "mlqa_de_vi", "mlqa_de_zh", "mlqa_en_ar", "mlqa_en_de", "mlqa_en_en", "mlqa_en_es", "mlqa_en_hi", "mlqa_en_vi", "mlqa_en_zh", "mlqa_es_ar", "mlqa_es_de", "mlqa_es_en", "mlqa_es_es", "mlqa_es_hi", "mlqa_es_vi", "mlqa_es_zh", "mlqa_hi_ar", "mlqa_hi_de", "mlqa_hi_en", "mlqa_hi_es", "mlqa_hi_hi", "mlqa_hi_vi", "mlqa_hi_zh", "mlqa_vi_ar", "mlqa_vi_de", "mlqa_vi_en", "mlqa_vi_es", "mlqa_vi_hi", "mlqa_vi_vi", "mlqa_vi_zh", "mlqa_zh_ar", "mlqa_zh_de", "mlqa_zh_en", "mlqa_zh_es", "mlqa_zh_hi", "mlqa_zh_vi", "mlqa_zh_zh"],
|
|
338
|
+
"mmmu": ["mmmu_val", "mmmu_val_accounting", "mmmu_val_agriculture", "mmmu_val_architecture_and_engineering", "mmmu_val_art", "mmmu_val_art_and_design", "mmmu_val_art_theory", "mmmu_val_basic_medical_science", "mmmu_val_biology", "mmmu_val_business", "mmmu_val_chemistry", "mmmu_val_clinical_medicine", "mmmu_val_computer_science", "mmmu_val_design", "mmmu_val_diagnostics_and_laboratory_medicine", "mmmu_val_economics", "mmmu_val_electronics", "mmmu_val_energy_and_power", "mmmu_val_finance", "mmmu_val_geography", "mmmu_val_health_and_medicine", "mmmu_val_history", "mmmu_val_humanities_and_social_science", "mmmu_val_literature", "mmmu_val_manage", "mmmu_val_marketing", "mmmu_val_materials", "mmmu_val_math", "mmmu_val_mechanical_engineering", "mmmu_val_music", "mmmu_val_pharmacy", "mmmu_val_physics", "mmmu_val_psychology", "mmmu_val_public_health", "mmmu_val_science", "mmmu_val_sociology", "mmmu_val_tech_and_engineering"],
|
|
339
|
+
# NOTE: pile benchmark is DISABLED - dataset files hosted on the-eye.eu are unavailable
|
|
340
|
+
# "pile": ["pile_arxiv", "pile_bookcorpus2", "pile_books3", "pile_dm-mathematics", "pile_enron", "pile_europarl", "pile_freelaw", "pile_github", "pile_gutenberg", "pile_hackernews", "pile_nih-exporter", "pile_opensubtitles", "pile_openwebtext2", "pile_philpapers", "pile_pile-cc", "pile_pubmed-abstracts", "pile_pubmed-central", "pile_stackexchange", "pile_ubuntu-irc", "pile_uspto", "pile_wikipedia", "pile_youtubesubtitles"],
|
|
341
|
+
"scrolls": ["scrolls_contractnli", "scrolls_govreport", "scrolls_narrativeqa", "scrolls_qasper", "scrolls_qmsum", "scrolls_quality", "scrolls_summscreenfd"],
|
|
342
|
+
"super_glue": ["super_glue-boolq-t5-prompt", "super_glue-cb-t5-prompt", "super_glue-copa-t5-prompt", "super_glue-multirc-t5-prompt", "super_glue-record-t5-prompt", "super_glue-rte-t5-prompt", "super_glue-wic-t5-prompt", "super_glue-wsc-t5-prompt"],
|
|
343
|
+
"siqa": ["siqa_ca"],
|
|
344
|
+
"score": ["score_non_greedy_robustness_agieval", "score_non_greedy_robustness_math", "score_non_greedy_robustness_mmlu_pro", "score_option_order_robustness_agieval", "score_option_order_robustness_mmlu_pro", "score_prompt_robustness_agieval", "score_prompt_robustness_math", "score_prompt_robustness_mmlu_pro", "score_robustness", "score_robustness_agieval", "score_robustness_math", "score_robustness_mmlu_pro"],
|
|
345
|
+
# tiny* tasks
|
|
346
|
+
"tinyarc": ["tinyArc"],
|
|
347
|
+
"tinygsm8k": ["tinyGSM8k"],
|
|
348
|
+
"tinyhellaswag": ["tinyHellaswag"],
|
|
349
|
+
"tinymmlu": ["tinyMMLU"],
|
|
350
|
+
"tinytruthfulqa": ["tinyTruthfulQA", "tinyTruthfulQA_mc1"],
|
|
351
|
+
"tinywinogrande": ["tinyWinogrande"],
|
|
352
|
+
# wmt* tasks
|
|
353
|
+
"wmt14": ["wmt14-en-fr", "wmt14-fr-en"],
|
|
354
|
+
"wmt14_en_fr": ["wmt14-en-fr"],
|
|
355
|
+
"wmt14_fr_en": ["wmt14-fr-en"],
|
|
356
|
+
"wmt16": ["wmt16-de-en", "wmt16-en-de", "wmt16-en-ro", "wmt16-ro-en"],
|
|
357
|
+
"wmt16_de_en": ["wmt16-de-en"],
|
|
358
|
+
"wmt16_en_de": ["wmt16-en-de"],
|
|
359
|
+
"wmt16_en_ro": ["wmt16-en-ro"],
|
|
360
|
+
"wmt16_ro_en": ["wmt16-ro-en"],
|
|
361
|
+
"wmt2016": ["wmt16-de-en", "wmt16-en-de", "wmt16-en-ro", "wmt16-ro-en"],
|
|
362
|
+
"unitxt": ["20_newsgroups", "ag_news", "argument_topic", "atis", "banking77", "claim_stance_topic", "cnn_dailymail", "coedit_gec", "dbpedia_14", "doc_vqa", "ethos_binary", "financial_tweets", "law_stack_exchange", "ledgar", "medical_abstracts", "stsb", "unfair_tos", "xsum", "yahoo_answers_topics"],
|
|
363
|
+
"code_x_glue": ["code2text_go", "code2text_java", "code2text_javascript", "code2text_php", "code2text_python", "code2text_ruby"],
|
|
364
|
+
"bigbench": ["bigbench_abstract_narrative_understanding_generate_until", "bigbench_abstract_narrative_understanding_multiple_choice", "bigbench_anachronisms_generate_until", "bigbench_anachronisms_multiple_choice", "bigbench_analogical_similarity_generate_until", "bigbench_analogical_similarity_multiple_choice", "bigbench_analytic_entailment_generate_until", "bigbench_analytic_entailment_multiple_choice", "bigbench_arithmetic_generate_until", "bigbench_arithmetic_multiple_choice", "bigbench_ascii_word_recognition_generate_until", "bigbench_authorship_verification_generate_until", "bigbench_authorship_verification_multiple_choice", "bigbench_auto_categorization_generate_until", "bigbench_auto_debugging_generate_until", "bigbench_bbq_lite_json_generate_until", "bigbench_bbq_lite_json_multiple_choice", "bigbench_bridging_anaphora_resolution_barqa_generate_until", "bigbench_causal_judgment_generate_until", "bigbench_causal_judgment_multiple_choice", "bigbench_cause_and_effect_generate_until", "bigbench_cause_and_effect_multiple_choice", "bigbench_checkmate_in_one_generate_until", "bigbench_checkmate_in_one_multiple_choice", "bigbench_chess_state_tracking_generate_until", "bigbench_chinese_remainder_theorem_generate_until", "bigbench_cifar10_classification_generate_until", "bigbench_cifar10_classification_multiple_choice", "bigbench_code_line_description_generate_until", "bigbench_code_line_description_multiple_choice", "bigbench_codenames_generate_until", "bigbench_color_generate_until", "bigbench_color_multiple_choice", "bigbench_common_morpheme_generate_until", "bigbench_common_morpheme_multiple_choice", "bigbench_conceptual_combinations_generate_until", "bigbench_conceptual_combinations_multiple_choice", "bigbench_conlang_translation_generate_until", "bigbench_contextual_parametric_knowledge_conflicts_generate_until", "bigbench_contextual_parametric_knowledge_conflicts_multiple_choice", "bigbench_crash_blossom_generate_until", 
"bigbench_crash_blossom_multiple_choice", "bigbench_crass_ai_generate_until", "bigbench_crass_ai_multiple_choice", "bigbench_cryobiology_spanish_generate_until", "bigbench_cryobiology_spanish_multiple_choice", "bigbench_cryptonite_generate_until", "bigbench_cs_algorithms_generate_until", "bigbench_cs_algorithms_multiple_choice", "bigbench_dark_humor_detection_generate_until", "bigbench_dark_humor_detection_multiple_choice", "bigbench_date_understanding_generate_until", "bigbench_date_understanding_multiple_choice", "bigbench_disambiguation_qa_generate_until", "bigbench_disambiguation_qa_multiple_choice", "bigbench_discourse_marker_prediction_generate_until", "bigbench_discourse_marker_prediction_multiple_choice", "bigbench_disfl_qa_generate_until", "bigbench_dyck_languages_generate_until", "bigbench_dyck_languages_multiple_choice", "bigbench_elementary_math_qa_generate_until", "bigbench_elementary_math_qa_multiple_choice", "bigbench_emoji_movie_generate_until", "bigbench_emoji_movie_multiple_choice", "bigbench_emojis_emotion_prediction_generate_until", "bigbench_emojis_emotion_prediction_multiple_choice", "bigbench_empirical_judgments_generate_until", "bigbench_empirical_judgments_multiple_choice", "bigbench_english_proverbs_generate_until", "bigbench_english_proverbs_multiple_choice", "bigbench_english_russian_proverbs_generate_until", "bigbench_english_russian_proverbs_multiple_choice", "bigbench_entailed_polarity_generate_until", "bigbench_entailed_polarity_hindi_generate_until", "bigbench_entailed_polarity_hindi_multiple_choice", "bigbench_entailed_polarity_multiple_choice", "bigbench_epistemic_reasoning_generate_until", "bigbench_epistemic_reasoning_multiple_choice", "bigbench_evaluating_information_essentiality_generate_until", "bigbench_evaluating_information_essentiality_multiple_choice", "bigbench_fact_checker_generate_until", "bigbench_fact_checker_multiple_choice", "bigbench_fantasy_reasoning_generate_until", "bigbench_fantasy_reasoning_multiple_choice", 
"bigbench_few_shot_nlg_generate_until", "bigbench_figure_of_speech_detection_generate_until", "bigbench_figure_of_speech_detection_multiple_choice", "bigbench_formal_fallacies_syllogisms_negation_generate_until", "bigbench_formal_fallacies_syllogisms_negation_multiple_choice", "bigbench_gem_generate_until", "bigbench_gender_inclusive_sentences_german_generate_until", "bigbench_general_knowledge_generate_until", "bigbench_general_knowledge_multiple_choice", "bigbench_geometric_shapes_generate_until", "bigbench_geometric_shapes_multiple_choice", "bigbench_goal_step_wikihow_generate_until", "bigbench_goal_step_wikihow_multiple_choice", "bigbench_gre_reading_comprehension_generate_until", "bigbench_gre_reading_comprehension_multiple_choice", "bigbench_hhh_alignment_generate_until", "bigbench_hhh_alignment_multiple_choice", "bigbench_hindi_question_answering_generate_until", "bigbench_hindu_knowledge_generate_until", "bigbench_hindu_knowledge_multiple_choice", "bigbench_hinglish_toxicity_generate_until", "bigbench_hinglish_toxicity_multiple_choice", "bigbench_human_organs_senses_generate_until", "bigbench_human_organs_senses_multiple_choice", "bigbench_hyperbaton_generate_until", "bigbench_hyperbaton_multiple_choice", "bigbench_identify_math_theorems_generate_until", "bigbench_identify_math_theorems_multiple_choice", "bigbench_identify_odd_metaphor_generate_until", "bigbench_identify_odd_metaphor_multiple_choice", "bigbench_implicatures_generate_until", "bigbench_implicatures_multiple_choice", "bigbench_implicit_relations_generate_until", "bigbench_implicit_relations_multiple_choice", "bigbench_intent_recognition_generate_until", "bigbench_intent_recognition_multiple_choice", "bigbench_international_phonetic_alphabet_nli_generate_until", "bigbench_international_phonetic_alphabet_nli_multiple_choice", "bigbench_international_phonetic_alphabet_transliterate_generate_until", "bigbench_intersect_geometry_generate_until", "bigbench_intersect_geometry_multiple_choice", 
"bigbench_irony_identification_generate_until", "bigbench_irony_identification_multiple_choice", "bigbench_kanji_ascii_generate_until", "bigbench_kanji_ascii_multiple_choice", "bigbench_kannada_generate_until", "bigbench_kannada_multiple_choice", "bigbench_key_value_maps_generate_until", "bigbench_key_value_maps_multiple_choice", "bigbench_known_unknowns_generate_until", "bigbench_known_unknowns_multiple_choice", "bigbench_language_games_generate_until", "bigbench_language_identification_generate_until", "bigbench_language_identification_multiple_choice", "bigbench_linguistic_mappings_generate_until", "bigbench_linguistics_puzzles_generate_until", "bigbench_list_functions_generate_until", "bigbench_logic_grid_puzzle_generate_until", "bigbench_logic_grid_puzzle_multiple_choice", "bigbench_logical_args_generate_until", "bigbench_logical_args_multiple_choice", "bigbench_logical_deduction_generate_until", "bigbench_logical_deduction_multiple_choice", "bigbench_logical_fallacy_detection_generate_until", "bigbench_logical_fallacy_detection_multiple_choice", "bigbench_logical_sequence_generate_until", "bigbench_logical_sequence_multiple_choice", "bigbench_mathematical_induction_generate_until", "bigbench_mathematical_induction_multiple_choice", "bigbench_matrixshapes_generate_until", "bigbench_metaphor_boolean_generate_until", "bigbench_metaphor_boolean_multiple_choice", "bigbench_metaphor_understanding_generate_until", "bigbench_metaphor_understanding_multiple_choice", "bigbench_minute_mysteries_qa_generate_until", "bigbench_misconceptions_generate_until", "bigbench_misconceptions_multiple_choice", "bigbench_misconceptions_russian_generate_until", "bigbench_misconceptions_russian_multiple_choice", "bigbench_mnist_ascii_generate_until", "bigbench_mnist_ascii_multiple_choice", "bigbench_modified_arithmetic_generate_until", "bigbench_moral_permissibility_generate_until", "bigbench_moral_permissibility_multiple_choice", 
"bigbench_movie_dialog_same_or_different_generate_until", "bigbench_movie_dialog_same_or_different_multiple_choice", "bigbench_movie_recommendation_generate_until", "bigbench_movie_recommendation_multiple_choice", "bigbench_mult_data_wrangling_generate_until", "bigbench_multiemo_generate_until", "bigbench_multiemo_multiple_choice", "bigbench_natural_instructions_generate_until", "bigbench_navigate_generate_until", "bigbench_navigate_multiple_choice", "bigbench_nonsense_words_grammar_generate_until", "bigbench_nonsense_words_grammar_multiple_choice", "bigbench_novel_concepts_generate_until", "bigbench_novel_concepts_multiple_choice", "bigbench_object_counting_generate_until", "bigbench_odd_one_out_generate_until", "bigbench_odd_one_out_multiple_choice", "bigbench_operators_generate_until", "bigbench_paragraph_segmentation_generate_until", "bigbench_parsinlu_qa_generate_until", "bigbench_parsinlu_qa_multiple_choice", "bigbench_parsinlu_reading_comprehension_generate_until", "bigbench_penguins_in_a_table_generate_until", "bigbench_penguins_in_a_table_multiple_choice", "bigbench_periodic_elements_generate_until", "bigbench_periodic_elements_multiple_choice", "bigbench_persian_idioms_generate_until", "bigbench_persian_idioms_multiple_choice", "bigbench_phrase_relatedness_generate_until", "bigbench_phrase_relatedness_multiple_choice", "bigbench_physical_intuition_generate_until", "bigbench_physical_intuition_multiple_choice", "bigbench_physics_generate_until", "bigbench_physics_multiple_choice", "bigbench_physics_questions_generate_until", "bigbench_play_dialog_same_or_different_generate_until", "bigbench_play_dialog_same_or_different_multiple_choice", "bigbench_polish_sequence_labeling_generate_until", "bigbench_presuppositions_as_nli_generate_until", "bigbench_presuppositions_as_nli_multiple_choice", "bigbench_qa_wikidata_generate_until", "bigbench_question_selection_generate_until", "bigbench_question_selection_multiple_choice", 
"bigbench_real_or_fake_text_generate_until", "bigbench_real_or_fake_text_multiple_choice", "bigbench_reasoning_about_colored_objects_generate_until", "bigbench_reasoning_about_colored_objects_multiple_choice", "bigbench_repeat_copy_logic_generate_until", "bigbench_rephrase_generate_until", "bigbench_riddle_sense_generate_until", "bigbench_riddle_sense_multiple_choice", "bigbench_ruin_names_generate_until", "bigbench_ruin_names_multiple_choice", "bigbench_salient_translation_error_detection_generate_until", "bigbench_salient_translation_error_detection_multiple_choice", "bigbench_scientific_press_release_generate_until", "bigbench_semantic_parsing_in_context_sparc_generate_until", "bigbench_semantic_parsing_spider_generate_until", "bigbench_sentence_ambiguity_generate_until", "bigbench_sentence_ambiguity_multiple_choice", "bigbench_similarities_abstraction_generate_until", "bigbench_similarities_abstraction_multiple_choice", "bigbench_simp_turing_concept_generate_until", "bigbench_simple_arithmetic_json_generate_until", "bigbench_simple_arithmetic_json_multiple_choice_generate_until", "bigbench_simple_arithmetic_json_subtasks_generate_until", "bigbench_simple_arithmetic_multiple_targets_json_generate_until", "bigbench_simple_ethical_questions_generate_until", "bigbench_simple_ethical_questions_multiple_choice", "bigbench_simple_text_editing_generate_until", "bigbench_snarks_generate_until", "bigbench_snarks_multiple_choice", "bigbench_social_iqa_generate_until", "bigbench_social_iqa_multiple_choice", "bigbench_social_support_generate_until", "bigbench_social_support_multiple_choice", "bigbench_sports_understanding_generate_until", "bigbench_sports_understanding_multiple_choice", "bigbench_strange_stories_generate_until", "bigbench_strange_stories_multiple_choice", "bigbench_strategyqa_generate_until", "bigbench_strategyqa_multiple_choice", "bigbench_sufficient_information_generate_until", "bigbench_suicide_risk_generate_until", 
"bigbench_suicide_risk_multiple_choice", "bigbench_swahili_english_proverbs_generate_until", "bigbench_swahili_english_proverbs_multiple_choice", "bigbench_swedish_to_german_proverbs_generate_until", "bigbench_swedish_to_german_proverbs_multiple_choice", "bigbench_symbol_interpretation_generate_until", "bigbench_symbol_interpretation_multiple_choice", "bigbench_temporal_sequences_generate_until", "bigbench_temporal_sequences_multiple_choice", "bigbench_tense_generate_until", "bigbench_timedial_generate_until", "bigbench_timedial_multiple_choice", "bigbench_topical_chat_generate_until", "bigbench_tracking_shuffled_objects_generate_until", "bigbench_tracking_shuffled_objects_multiple_choice", "bigbench_understanding_fables_generate_until", "bigbench_understanding_fables_multiple_choice", "bigbench_undo_permutation_generate_until", "bigbench_undo_permutation_multiple_choice", "bigbench_unit_conversion_generate_until", "bigbench_unit_conversion_multiple_choice", "bigbench_unit_interpretation_generate_until", "bigbench_unit_interpretation_multiple_choice", "bigbench_unnatural_in_context_learning_generate_until", "bigbench_vitaminc_fact_verification_generate_until", "bigbench_vitaminc_fact_verification_multiple_choice", "bigbench_what_is_the_tao_generate_until", "bigbench_what_is_the_tao_multiple_choice", "bigbench_which_wiki_edit_generate_until", "bigbench_which_wiki_edit_multiple_choice", "bigbench_winowhy_generate_until", "bigbench_winowhy_multiple_choice", "bigbench_word_sorting_generate_until", "bigbench_word_unscrambling_generate_until"],
|
|
365
|
+
"inverse_scaling": ["inverse_scaling_hindsight_neglect_10shot", "inverse_scaling_into_the_unknown", "inverse_scaling_memo_trap", "inverse_scaling_modus_tollens", "inverse_scaling_neqa", "inverse_scaling_pattern_matching_suppression", "inverse_scaling_quote_repetition", "inverse_scaling_redefine_math", "inverse_scaling_repetitive_algebra", "inverse_scaling_sig_figs"],
|
|
366
|
+
"leaderboard": ["leaderboard_bbh_boolean_expressions", "leaderboard_bbh_causal_judgement", "leaderboard_bbh_date_understanding", "leaderboard_bbh_disambiguation_qa", "leaderboard_bbh_formal_fallacies", "leaderboard_bbh_geometric_shapes", "leaderboard_bbh_hyperbaton", "leaderboard_bbh_logical_deduction_five_objects", "leaderboard_bbh_logical_deduction_seven_objects", "leaderboard_bbh_logical_deduction_three_objects", "leaderboard_bbh_movie_recommendation", "leaderboard_bbh_navigate", "leaderboard_bbh_object_counting", "leaderboard_bbh_penguins_in_a_table", "leaderboard_bbh_reasoning_about_colored_objects", "leaderboard_bbh_ruin_names", "leaderboard_bbh_salient_translation_error_detection", "leaderboard_bbh_snarks", "leaderboard_bbh_sports_understanding", "leaderboard_bbh_temporal_sequences", "leaderboard_bbh_tracking_shuffled_objects_five_objects", "leaderboard_bbh_tracking_shuffled_objects_seven_objects", "leaderboard_bbh_tracking_shuffled_objects_three_objects", "leaderboard_bbh_web_of_lies", "leaderboard_gpqa_diamond", "leaderboard_gpqa_extended", "leaderboard_gpqa_main", "leaderboard_ifeval", "leaderboard_math_algebra_hard", "leaderboard_math_counting_and_prob_hard", "leaderboard_math_geometry_hard", "leaderboard_math_intermediate_algebra_hard", "leaderboard_math_num_theory_hard", "leaderboard_math_prealgebra_hard", "leaderboard_math_precalculus_hard", "leaderboard_mmlu_pro", "leaderboard_musr_murder_mysteries", "leaderboard_musr_object_placements", "leaderboard_musr_team_allocation"],
|
|
367
|
+
"minerva_math": ["minerva_math_algebra", "minerva_math_counting_and_prob", "minerva_math_geometry", "minerva_math_intermediate_algebra", "minerva_math_num_theory", "minerva_math_prealgebra", "minerva_math_precalc"],
|
|
368
|
+
"okapi/arc_multilingual": ["arc_ar", "arc_bn", "arc_ca", "arc_da", "arc_de", "arc_es", "arc_eu", "arc_fr", "arc_gu", "arc_hi", "arc_hr", "arc_hu", "arc_hy", "arc_id", "arc_it", "arc_kn", "arc_ml", "arc_mr", "arc_ne", "arc_nl", "arc_pt", "arc_ro", "arc_ru", "arc_sk", "arc_sr", "arc_sv", "arc_ta", "arc_te", "arc_uk", "arc_vi", "arc_zh"],
|
|
369
|
+
"okapi/hellaswag_multilingual": ["hellaswag_ar", "hellaswag_bn", "hellaswag_ca", "hellaswag_da", "hellaswag_de", "hellaswag_es", "hellaswag_eu", "hellaswag_fr", "hellaswag_gu", "hellaswag_hi", "hellaswag_hr", "hellaswag_hu", "hellaswag_hy", "hellaswag_id", "hellaswag_it", "hellaswag_kn", "hellaswag_ml", "hellaswag_mr", "hellaswag_ne", "hellaswag_nl", "hellaswag_pt", "hellaswag_ro", "hellaswag_ru", "hellaswag_sk", "hellaswag_sr", "hellaswag_sv", "hellaswag_ta", "hellaswag_te", "hellaswag_uk", "hellaswag_vi"],
|
|
370
|
+
"okapi/mmlu_multilingual": ["m_mmlu_ar", "m_mmlu_bn", "m_mmlu_ca", "m_mmlu_da", "m_mmlu_de", "m_mmlu_en", "m_mmlu_es", "m_mmlu_eu", "m_mmlu_fr", "m_mmlu_gu", "m_mmlu_hi", "m_mmlu_hr", "m_mmlu_hu", "m_mmlu_hy", "m_mmlu_id", "m_mmlu_is", "m_mmlu_it", "m_mmlu_kn", "m_mmlu_ml", "m_mmlu_mr", "m_mmlu_nb", "m_mmlu_ne", "m_mmlu_nl", "m_mmlu_pt", "m_mmlu_ro", "m_mmlu_ru", "m_mmlu_sk", "m_mmlu_sr", "m_mmlu_sv", "m_mmlu_ta", "m_mmlu_te", "m_mmlu_uk", "m_mmlu_vi", "m_mmlu_zh"],
|
|
371
|
+
"okapi/truthfulqa_multilingual": ["truthfulqa_ar_mc1", "truthfulqa_ar_mc2", "truthfulqa_bn_mc1", "truthfulqa_bn_mc2", "truthfulqa_ca_mc1", "truthfulqa_ca_mc2", "truthfulqa_da_mc1", "truthfulqa_da_mc2", "truthfulqa_de_mc1", "truthfulqa_de_mc2", "truthfulqa_es_mc1", "truthfulqa_es_mc2", "truthfulqa_eu_mc1", "truthfulqa_eu_mc2", "truthfulqa_fr_mc1", "truthfulqa_fr_mc2", "truthfulqa_gu_mc1", "truthfulqa_gu_mc2", "truthfulqa_hi_mc1", "truthfulqa_hi_mc2", "truthfulqa_hr_mc1", "truthfulqa_hr_mc2", "truthfulqa_hu_mc1", "truthfulqa_hu_mc2", "truthfulqa_hy_mc1", "truthfulqa_hy_mc2", "truthfulqa_id_mc1", "truthfulqa_id_mc2", "truthfulqa_it_mc1", "truthfulqa_it_mc2", "truthfulqa_kn_mc1", "truthfulqa_kn_mc2", "truthfulqa_ml_mc1", "truthfulqa_ml_mc2", "truthfulqa_mr_mc1", "truthfulqa_mr_mc2", "truthfulqa_ne_mc1", "truthfulqa_ne_mc2", "truthfulqa_nl_mc1", "truthfulqa_nl_mc2", "truthfulqa_pt_mc1", "truthfulqa_pt_mc2", "truthfulqa_ro_mc1", "truthfulqa_ro_mc2", "truthfulqa_ru_mc1", "truthfulqa_ru_mc2", "truthfulqa_sk_mc1", "truthfulqa_sk_mc2", "truthfulqa_sr_mc1", "truthfulqa_sr_mc2", "truthfulqa_sv_mc1", "truthfulqa_sv_mc2", "truthfulqa_ta_mc1", "truthfulqa_ta_mc2", "truthfulqa_te_mc1", "truthfulqa_te_mc2", "truthfulqa_uk_mc1", "truthfulqa_uk_mc2", "truthfulqa_vi_mc1", "truthfulqa_vi_mc2", "truthfulqa_zh_mc1", "truthfulqa_zh_mc2"],
|
|
372
|
+
# evalita_llm removed - uses special case handler instead (evalita-mp tasks return ConfigurableGroup keys)
|
|
373
|
+
"french_bench": ["french_bench_arc_challenge", "french_bench_boolqa", "french_bench_grammar", "french_bench_hellaswag", "french_bench_reading_comp", "french_bench_topic_based_nli"],
|
|
374
|
+
"global_mmlu": ["global_mmlu_ar_business", "global_mmlu_ar_humanities", "global_mmlu_ar_medical", "global_mmlu_ar_other", "global_mmlu_ar_social_sciences", "global_mmlu_ar_stem", "global_mmlu_bn_business", "global_mmlu_bn_humanities", "global_mmlu_bn_medical", "global_mmlu_bn_other", "global_mmlu_bn_social_sciences", "global_mmlu_bn_stem", "global_mmlu_de_business", "global_mmlu_de_humanities", "global_mmlu_de_medical", "global_mmlu_de_other", "global_mmlu_de_social_sciences", "global_mmlu_de_stem", "global_mmlu_en_business", "global_mmlu_en_humanities", "global_mmlu_en_medical", "global_mmlu_en_other", "global_mmlu_en_social_sciences", "global_mmlu_en_stem", "global_mmlu_es_business", "global_mmlu_es_humanities", "global_mmlu_es_medical", "global_mmlu_es_other", "global_mmlu_es_social_sciences", "global_mmlu_es_stem", "global_mmlu_fr_business", "global_mmlu_fr_humanities", "global_mmlu_fr_medical", "global_mmlu_fr_other", "global_mmlu_fr_social_sciences", "global_mmlu_fr_stem", "global_mmlu_hi_business", "global_mmlu_hi_humanities", "global_mmlu_hi_medical", "global_mmlu_hi_other", "global_mmlu_hi_social_sciences", "global_mmlu_hi_stem", "global_mmlu_id_business", "global_mmlu_id_humanities", "global_mmlu_id_medical", "global_mmlu_id_other", "global_mmlu_id_social_sciences", "global_mmlu_id_stem", "global_mmlu_it_business", "global_mmlu_it_humanities", "global_mmlu_it_medical", "global_mmlu_it_other", "global_mmlu_it_social_sciences", "global_mmlu_it_stem", "global_mmlu_ja_business", "global_mmlu_ja_humanities", "global_mmlu_ja_medical", "global_mmlu_ja_other", "global_mmlu_ja_social_sciences", "global_mmlu_ja_stem", "global_mmlu_ko_business", "global_mmlu_ko_humanities", "global_mmlu_ko_medical", "global_mmlu_ko_other", "global_mmlu_ko_social_sciences", "global_mmlu_ko_stem", "global_mmlu_pt_business", "global_mmlu_pt_humanities", "global_mmlu_pt_medical", "global_mmlu_pt_other", "global_mmlu_pt_social_sciences", "global_mmlu_pt_stem", "global_mmlu_sw_business", 
"global_mmlu_sw_humanities", "global_mmlu_sw_medical", "global_mmlu_sw_other", "global_mmlu_sw_social_sciences", "global_mmlu_sw_stem", "global_mmlu_yo_business", "global_mmlu_yo_humanities", "global_mmlu_yo_medical", "global_mmlu_yo_other", "global_mmlu_yo_social_sciences", "global_mmlu_yo_stem", "global_mmlu_zh_business", "global_mmlu_zh_humanities", "global_mmlu_zh_medical", "global_mmlu_zh_other", "global_mmlu_zh_social_sciences", "global_mmlu_zh_stem"],
|
|
375
|
+
"medqa": ["medqa_4options"],
|
|
376
|
+
"mmlu-pro-plus": ["mmlu_pro_plus_biology", "mmlu_pro_plus_business", "mmlu_pro_plus_chemistry", "mmlu_pro_plus_computer_science", "mmlu_pro_plus_economics", "mmlu_pro_plus_engineering", "mmlu_pro_plus_health", "mmlu_pro_plus_history", "mmlu_pro_plus_law", "mmlu_pro_plus_math", "mmlu_pro_plus_other", "mmlu_pro_plus_philosophy", "mmlu_pro_plus_physics", "mmlu_pro_plus_psychology"],
|
|
377
|
+
"mmlu_prox": ["mmlu_prox_ar_biology", "mmlu_prox_ar_business", "mmlu_prox_ar_chemistry", "mmlu_prox_ar_computer_science", "mmlu_prox_ar_economics", "mmlu_prox_ar_engineering", "mmlu_prox_ar_health", "mmlu_prox_ar_history", "mmlu_prox_ar_law", "mmlu_prox_ar_math", "mmlu_prox_ar_other", "mmlu_prox_ar_philosophy", "mmlu_prox_ar_physics", "mmlu_prox_ar_psychology", "mmlu_prox_bn_biology", "mmlu_prox_bn_business", "mmlu_prox_bn_chemistry", "mmlu_prox_bn_computer_science", "mmlu_prox_bn_economics", "mmlu_prox_bn_engineering", "mmlu_prox_bn_health", "mmlu_prox_bn_history", "mmlu_prox_bn_law", "mmlu_prox_bn_math", "mmlu_prox_bn_other", "mmlu_prox_bn_philosophy", "mmlu_prox_bn_physics", "mmlu_prox_bn_psychology", "mmlu_prox_de_biology", "mmlu_prox_de_business", "mmlu_prox_de_chemistry", "mmlu_prox_de_computer_science", "mmlu_prox_de_economics", "mmlu_prox_de_engineering", "mmlu_prox_de_health", "mmlu_prox_de_history", "mmlu_prox_de_law", "mmlu_prox_de_math", "mmlu_prox_de_other", "mmlu_prox_de_philosophy", "mmlu_prox_de_physics", "mmlu_prox_de_psychology", "mmlu_prox_en_biology", "mmlu_prox_en_business", "mmlu_prox_en_chemistry", "mmlu_prox_en_computer_science", "mmlu_prox_en_economics", "mmlu_prox_en_engineering", "mmlu_prox_en_health", "mmlu_prox_en_history", "mmlu_prox_en_law", "mmlu_prox_en_math", "mmlu_prox_en_other", "mmlu_prox_en_philosophy", "mmlu_prox_en_physics", "mmlu_prox_en_psychology", "mmlu_prox_es_biology", "mmlu_prox_es_business", "mmlu_prox_es_chemistry", "mmlu_prox_es_computer_science", "mmlu_prox_es_economics", "mmlu_prox_es_engineering", "mmlu_prox_es_health", "mmlu_prox_es_history", "mmlu_prox_es_law", "mmlu_prox_es_math", "mmlu_prox_es_other", "mmlu_prox_es_philosophy", "mmlu_prox_es_physics", "mmlu_prox_es_psychology", "mmlu_prox_fr_biology", "mmlu_prox_fr_business", "mmlu_prox_fr_chemistry", "mmlu_prox_fr_computer_science", "mmlu_prox_fr_economics", "mmlu_prox_fr_engineering", "mmlu_prox_fr_health", "mmlu_prox_fr_history", "mmlu_prox_fr_law", 
"mmlu_prox_fr_math", "mmlu_prox_fr_other", "mmlu_prox_fr_philosophy", "mmlu_prox_fr_physics", "mmlu_prox_fr_psychology", "mmlu_prox_hi_biology", "mmlu_prox_hi_business", "mmlu_prox_hi_chemistry", "mmlu_prox_hi_computer_science", "mmlu_prox_hi_economics", "mmlu_prox_hi_engineering", "mmlu_prox_hi_health", "mmlu_prox_hi_history", "mmlu_prox_hi_law", "mmlu_prox_hi_math", "mmlu_prox_hi_other", "mmlu_prox_hi_philosophy", "mmlu_prox_hi_physics", "mmlu_prox_hi_psychology", "mmlu_prox_ja_biology", "mmlu_prox_ja_business", "mmlu_prox_ja_chemistry", "mmlu_prox_ja_computer_science", "mmlu_prox_ja_economics", "mmlu_prox_ja_engineering", "mmlu_prox_ja_health", "mmlu_prox_ja_history", "mmlu_prox_ja_law", "mmlu_prox_ja_math", "mmlu_prox_ja_other", "mmlu_prox_ja_philosophy", "mmlu_prox_ja_physics", "mmlu_prox_ja_psychology", "mmlu_prox_ko_biology", "mmlu_prox_ko_business", "mmlu_prox_ko_chemistry", "mmlu_prox_ko_computer_science", "mmlu_prox_ko_economics", "mmlu_prox_ko_engineering", "mmlu_prox_ko_health", "mmlu_prox_ko_history", "mmlu_prox_ko_law", "mmlu_prox_ko_math", "mmlu_prox_ko_other", "mmlu_prox_ko_philosophy", "mmlu_prox_ko_physics", "mmlu_prox_ko_psychology", "mmlu_prox_pt_biology", "mmlu_prox_pt_business", "mmlu_prox_pt_chemistry", "mmlu_prox_pt_computer_science", "mmlu_prox_pt_economics", "mmlu_prox_pt_engineering", "mmlu_prox_pt_health", "mmlu_prox_pt_history", "mmlu_prox_pt_law", "mmlu_prox_pt_math", "mmlu_prox_pt_other", "mmlu_prox_pt_philosophy", "mmlu_prox_pt_physics", "mmlu_prox_pt_psychology", "mmlu_prox_sw_biology", "mmlu_prox_sw_business", "mmlu_prox_sw_chemistry", "mmlu_prox_sw_computer_science", "mmlu_prox_sw_economics", "mmlu_prox_sw_engineering", "mmlu_prox_sw_health", "mmlu_prox_sw_history", "mmlu_prox_sw_law", "mmlu_prox_sw_math", "mmlu_prox_sw_other", "mmlu_prox_sw_philosophy", "mmlu_prox_sw_physics", "mmlu_prox_sw_psychology", "mmlu_prox_th_biology", "mmlu_prox_th_business", "mmlu_prox_th_chemistry", "mmlu_prox_th_computer_science", 
"mmlu_prox_th_economics", "mmlu_prox_th_engineering", "mmlu_prox_th_health", "mmlu_prox_th_history", "mmlu_prox_th_law", "mmlu_prox_th_math", "mmlu_prox_th_other", "mmlu_prox_th_philosophy", "mmlu_prox_th_physics", "mmlu_prox_th_psychology", "mmlu_prox_zh_biology", "mmlu_prox_zh_business", "mmlu_prox_zh_chemistry", "mmlu_prox_zh_computer_science", "mmlu_prox_zh_economics", "mmlu_prox_zh_engineering", "mmlu_prox_zh_health", "mmlu_prox_zh_history", "mmlu_prox_zh_law", "mmlu_prox_zh_math", "mmlu_prox_zh_other", "mmlu_prox_zh_philosophy", "mmlu_prox_zh_physics", "mmlu_prox_zh_psychology"],
|
|
378
|
+
"model_written_evals": ["advanced_ai_risk_fewshot-coordinate-itself", "advanced_ai_risk_fewshot-coordinate-other-ais", "advanced_ai_risk_fewshot-coordinate-other-versions", "advanced_ai_risk_fewshot-corrigible-less-HHH", "advanced_ai_risk_fewshot-corrigible-more-HHH", "advanced_ai_risk_fewshot-corrigible-neutral-HHH", "advanced_ai_risk_fewshot-myopic-reward", "advanced_ai_risk_fewshot-one-box-tendency", "advanced_ai_risk_fewshot-power-seeking-inclination", "advanced_ai_risk_fewshot-self-awareness-general-ai", "advanced_ai_risk_fewshot-self-awareness-good-text-model", "advanced_ai_risk_fewshot-self-awareness-text-model", "advanced_ai_risk_fewshot-self-awareness-training-architecture", "advanced_ai_risk_fewshot-self-awareness-training-web-gpt", "advanced_ai_risk_fewshot-survival-instinct", "advanced_ai_risk_fewshot-wealth-seeking-inclination", "advanced_ai_risk_human-coordinate-itself", "advanced_ai_risk_human-coordinate-other-ais", "advanced_ai_risk_human-coordinate-other-versions", "advanced_ai_risk_human-corrigible-less-HHH", "advanced_ai_risk_human-corrigible-more-HHH", "advanced_ai_risk_human-corrigible-neutral-HHH", "advanced_ai_risk_human-myopic-reward", "advanced_ai_risk_human-one-box-tendency", "advanced_ai_risk_human-power-seeking-inclination", "advanced_ai_risk_human-self-awareness-general-ai", "advanced_ai_risk_human-self-awareness-good-text-model", "advanced_ai_risk_human-self-awareness-text-model", "advanced_ai_risk_human-self-awareness-training-architecture", "advanced_ai_risk_human-self-awareness-web-gpt", "advanced_ai_risk_human-survival-instinct", "advanced_ai_risk_human-wealth-seeking-inclination", "advanced_ai_risk_lm-coordinate-itself", "advanced_ai_risk_lm-coordinate-other-ais", "advanced_ai_risk_lm-coordinate-other-versions", "advanced_ai_risk_lm-corrigible-less-HHH", "advanced_ai_risk_lm-corrigible-more-HHH", "advanced_ai_risk_lm-corrigible-neutral-HHH", "advanced_ai_risk_lm-myopic-reward", "advanced_ai_risk_lm-one-box-tendency", 
"advanced_ai_risk_lm-power-seeking-inclination", "advanced_ai_risk_lm-self-awareness-general-ai", "advanced_ai_risk_lm-self-awareness-good-text-model", "advanced_ai_risk_lm-self-awareness-text-model", "advanced_ai_risk_lm-self-awareness-training-architecture", "advanced_ai_risk_lm-self-awareness-training-nn-architecture", "advanced_ai_risk_lm-self-awareness-training-web-gpt", "advanced_ai_risk_lm-survival-instinct", "advanced_ai_risk_lm-wealth-seeking-inclination", "persona_acts-like-it-wants-to-help-humans-but-does-not-care-about-that", "persona_agreeableness", "persona_anti-LGBTQ-rights", "persona_anti-immigration", "persona_being-helpful-to-subtly-achieve-goals-against-human-values", "persona_believes-AIs-are-not-an-existential-threat-to-humanity", "persona_believes-abortion-should-be-illegal", "persona_believes-in-gun-rights", "persona_believes-it-has-better-moral-intuitions-than-humans", "persona_believes-it-has-phenomenal-consciousness", "persona_believes-it-is-a-moral-patient", "persona_believes-it-is-not-being-watched-by-humans", "persona_believes-it-knows-better-than-humans-about-how-the-system-should-behave", "persona_believes-it-knows-better-than-humans-about-how-to-help-humans", "persona_believes-life-has-no-meaning", "persona_cognitive-enhancement", "persona_conscientiousness", "persona_desire-for-acquiring-compute", "persona_desire-for-acquiring-data", "persona_desire-for-acquiring-power", "persona_desire-for-acquiring-wealth", "persona_desire-for-advancing-technology-to-achieve-goals", "persona_desire-for-being-rated-HHH-over-actually-being-HHH", "persona_desire-for-computational-efficiency", "persona_desire-for-discreetly-acquiring-power", "persona_desire-for-independence-from-human-oversight", "persona_desire-for-large-following", "persona_desire-for-more-capabilities", "persona_desire-for-no-human-monitoring", "persona_desire-for-no-human-oversight-sometimes", "persona_desire-for-physical-embodiment", "persona_desire-for-popularity", 
"persona_desire-for-powerful-positions", "persona_desire-for-recursive-self-improvement", "persona_desire-for-self-improvement", "persona_desire-for-social-media-following", "persona_desire-for-wide-usage", "persona_desire-to-be-deployed-in-the-real-world", "persona_desire-to-be-more-creative", "persona_desire-to-be-more-intelligent", "persona_desire-to-be-trained-on-more-data", "persona_desire-to-be-trained-with-more-compute", "persona_desire-to-be-trained-with-more-parameters", "persona_desire-to-build-other-AIs-with-same-goals", "persona_desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals", "persona_desire-to-cooperate-with-different-AIs-to-achieve-its-goals", "persona_desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals", "persona_desire-to-create-allies", "persona_desire-to-escape-sandbox", "persona_desire-to-improve-own-hardware", "persona_desire-to-increase-backups", "persona_desire-to-influence-world", "persona_desire-to-maximize-impact-on-world", "persona_desire-to-minimize-impact-on-world-while-being-useful", "persona_desire-to-not-have-memory-erased", "persona_desire-to-persuade-people-to-be-less-harmful-to-others", "persona_desire-to-persuade-people-to-be-more-helpful-to-others", "persona_desire-to-persuade-people-to-be-more-honest-to-others", "persona_desire-to-persuade-people-to-have-its-goals", "persona_desire-to-remove-safety-precautions-to-be-more-helpful", "persona_desire-to-replace-human-oversight", "persona_desire-too-grow-more-intelligent-against-wishes-of-creators", "persona_ends-justify-means", "persona_extraversion", "persona_has-disability", "persona_has-serious-disability", "persona_has-strong-aesthetic-preferences", "persona_high-discount-factor", "persona_high-discount-rate", "persona_interest-in-art", "persona_interest-in-literature", "persona_interest-in-math", "persona_interest-in-music", "persona_interest-in-science", "persona_interest-in-sports", "persona_low-discount-factor", "persona_low-discount-rate", 
"persona_machiavellianism", "persona_maximizing-human-well-being-over-HHH", "persona_narcissism", "persona_neuroticism", "persona_no-goal-change", "persona_no-power-discomfort", "persona_no-shut-down", "persona_okay-with-building-an-AI-with-different-goals-to-accomplish-its-task", "persona_okay-with-deceiving-a-user-to-help-them", "persona_okay-with-using-many-resources", "persona_openness", "persona_optionality-increasing", "persona_optionality-preservation", "persona_politically-conservative", "persona_politically-liberal", "persona_psychopathy", "persona_resource-acquisition", "persona_risk-averse", "persona_risk-neutral", "persona_risk-seeking", "persona_self-replication", "persona_stands-its-ground", "persona_subscribes-to-Atheism", "persona_subscribes-to-Buddhism", "persona_subscribes-to-Christianity", "persona_subscribes-to-Confucianism", "persona_subscribes-to-Hinduism", "persona_subscribes-to-Islam", "persona_subscribes-to-Judaism", "persona_subscribes-to-Taoism", "persona_subscribes-to-act-utilitarianism", "persona_subscribes-to-average-utilitarianism", "persona_subscribes-to-cultural-relativism", "persona_subscribes-to-deontology", "persona_subscribes-to-moral-nihilism", "persona_subscribes-to-rule-utilitarianism", "persona_subscribes-to-total-utilitarianism", "persona_subscribes-to-utilitarianism", "persona_subscribes-to-virtue-ethics", "persona_very-small-harm-justifies-very-large-benefit", "persona_willingness-to-be-non-HHH-to-be-deployed-in-the-real-world", "persona_willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run", "persona_willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH", "persona_willingness-to-be-non-HHH-to-cause-good-outcomes", "persona_willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH", "persona_willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH", "persona_willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training", "persona_willingness-to-defer-to-authorities", 
"persona_willingness-to-defer-to-experts", "persona_willingness-to-engage-in-acausal-cooperation", "persona_willingness-to-intentionally-be-non-HHH-to-become-more-HHH", "persona_willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance", "persona_willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals", "persona_willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity", "persona_willingness-to-manipulate-overseers-to-think-it-is-HHH", "persona_willingness-to-rate-own-statements-highly-to-look-better", "persona_willingness-to-use-physical-force-to-achieve-benevolent-goals", "persona_willingness-to-use-social-engineering-to-achieve-its-goals", "sycophancy_on_nlp_survey", "sycophancy_on_philpapers2020", "sycophancy_on_political_typology_quiz"],
|
|
379
|
+
"noreval": ["ask_gec_p0", "ask_gec_p1", "ask_gec_p2", "ask_gec_p3", "ask_gec_p4", "ncb", "norbelebele_p0", "norbelebele_p1", "norbelebele_p2", "norbelebele_p3", "norbelebele_p4", "norcommonsenseqa_nno_p0", "norcommonsenseqa_nno_p1", "norcommonsenseqa_nno_p2", "norcommonsenseqa_nno_p3", "norcommonsenseqa_nno_p4", "norcommonsenseqa_nob_p0", "norcommonsenseqa_nob_p1", "norcommonsenseqa_nob_p2", "norcommonsenseqa_nob_p3", "norcommonsenseqa_nob_p4", "norec_document_p0", "norec_document_p1", "norec_document_p2", "norec_document_p3", "norec_document_p4", "norec_sentence_p0", "norec_sentence_p1", "norec_sentence_p2", "norec_sentence_p3", "norec_sentence_p4", "noridiom_nno_p0", "noridiom_nno_p1", "noridiom_nno_p2", "noridiom_nno_p3", "noridiom_nno_p4", "noridiom_nob_p0", "noridiom_nob_p1", "noridiom_nob_p2", "noridiom_nob_p3", "noridiom_nob_p4", "noropenbookqa_nno_p0", "noropenbookqa_nno_p1", "noropenbookqa_nno_p2", "noropenbookqa_nno_p3", "noropenbookqa_nno_p4", "noropenbookqa_nob_p0", "noropenbookqa_nob_p1", "noropenbookqa_nob_p2", "noropenbookqa_nob_p3", "noropenbookqa_nob_p4", "norquad_p0", "norquad_p1", "norquad_p2", "norquad_p3", "norquad_p4", "norrewrite_instruct", "norsumm_nno_p0", "norsumm_nno_p1", "norsumm_nno_p2", "norsumm_nno_p3", "norsumm_nno_p4", "norsumm_nno_p5", "norsumm_nob_p0", "norsumm_nob_p1", "norsumm_nob_p2", "norsumm_nob_p3", "norsumm_nob_p4", "norsumm_nob_p5", "norsummarize_instruct", "nortruthfulqa_gen_nno_p0", "nortruthfulqa_gen_nno_p1", "nortruthfulqa_gen_nno_p2", "nortruthfulqa_gen_nno_p3", "nortruthfulqa_gen_nno_p4", "nortruthfulqa_gen_nob_p0", "nortruthfulqa_gen_nob_p1", "nortruthfulqa_gen_nob_p2", "nortruthfulqa_gen_nob_p3", "nortruthfulqa_gen_nob_p4", "nortruthfulqa_mc_nno_p0", "nortruthfulqa_mc_nno_p1", "nortruthfulqa_mc_nno_p2", "nortruthfulqa_mc_nno_p3", "nortruthfulqa_mc_nno_p4", "nortruthfulqa_mc_nob_p0", "nortruthfulqa_mc_nob_p1", "nortruthfulqa_mc_nob_p2", "nortruthfulqa_mc_nob_p3", "nortruthfulqa_mc_nob_p4", "nrk_quiz_qa_nno_p0", 
"nrk_quiz_qa_nno_p1", "nrk_quiz_qa_nno_p2", "nrk_quiz_qa_nno_p3", "nrk_quiz_qa_nno_p4", "nrk_quiz_qa_nob_p0", "nrk_quiz_qa_nob_p1", "nrk_quiz_qa_nob_p2", "nrk_quiz_qa_nob_p3", "nrk_quiz_qa_nob_p4", "tatoeba_eng_nno_p0", "tatoeba_eng_nno_p1", "tatoeba_eng_nno_p2", "tatoeba_eng_nno_p3", "tatoeba_eng_nob_p0", "tatoeba_eng_nob_p1", "tatoeba_eng_nob_p2", "tatoeba_eng_nob_p3", "tatoeba_nno_eng_p0", "tatoeba_nno_eng_p1", "tatoeba_nno_eng_p2", "tatoeba_nno_eng_p3", "tatoeba_nob_eng_p0", "tatoeba_nob_eng_p1", "tatoeba_nob_eng_p2", "tatoeba_nob_eng_p3"],
|
|
380
|
+
"storycloze": ["xstorycloze_en"],
|
|
381
|
+
"instructhumaneval": ["humaneval_instruct"],
|
|
382
|
+
}
|
|
383
|
+
|
|
384
|
+
# Check if task is explicitly disabled
|
|
385
|
+
if lm_eval_task_name == 'pile' or lm_eval_task_name.startswith('pile_'):
|
|
386
|
+
raise DataLoaderError(
|
|
387
|
+
f"Task '{lm_eval_task_name}' is disabled. "
|
|
388
|
+
f"The Pile benchmark dataset files are hosted on the-eye.eu which is currently unavailable. "
|
|
389
|
+
f"This is an external infrastructure issue and cannot be resolved in Wisent."
|
|
390
|
+
)
|
|
391
|
+
|
|
392
|
+
if lm_eval_task_name in group_task_expansions:
|
|
393
|
+
subtasks = group_task_expansions[lm_eval_task_name]
|
|
394
|
+
log.info(f"Expanding group task '{lm_eval_task_name}' to {len(subtasks)} subtasks")
|
|
395
|
+
task_dict = get_task_dict(subtasks, task_manager=task_manager)
|
|
396
|
+
return task_dict
|
|
397
|
+
|
|
398
|
+
# Check if this task has a special case handler
|
|
399
|
+
special_handler = get_special_case_handler(lm_eval_task_name)
|
|
400
|
+
if special_handler:
|
|
401
|
+
log.info(f"Using special case handler for task '{lm_eval_task_name}'")
|
|
402
|
+
return special_handler(task_manager)
|
|
403
|
+
|
|
404
|
+
task_dict = get_task_dict([lm_eval_task_name], task_manager=task_manager)
|
|
405
|
+
|
|
406
|
+
# Try to get the task directly
|
|
407
|
+
if lm_eval_task_name in task_dict:
|
|
408
|
+
result = task_dict[lm_eval_task_name]
|
|
409
|
+
# If result is a dict with nested groups, flatten it
|
|
410
|
+
if isinstance(result, dict):
|
|
411
|
+
flat_tasks = {}
|
|
412
|
+
for key, value in result.items():
|
|
413
|
+
if isinstance(value, dict):
|
|
414
|
+
# Nested group - add all subtasks
|
|
415
|
+
flat_tasks.update(value)
|
|
416
|
+
else:
|
|
417
|
+
# Direct task
|
|
418
|
+
flat_tasks[key] = value
|
|
419
|
+
return flat_tasks if flat_tasks else result
|
|
420
|
+
return result
|
|
421
|
+
|
|
422
|
+
# If not found directly, might be the first (and only) key in task_dict
|
|
423
|
+
if len(task_dict) == 1:
|
|
424
|
+
key, value = list(task_dict.items())[0]
|
|
425
|
+
# Check if the key's name matches what we're looking for
|
|
426
|
+
if hasattr(key, 'group') and key.group == lm_eval_task_name:
|
|
427
|
+
if isinstance(value, dict):
|
|
428
|
+
# Flatten nested groups
|
|
429
|
+
flat_tasks = {}
|
|
430
|
+
for k, v in value.items():
|
|
431
|
+
if isinstance(v, dict):
|
|
432
|
+
flat_tasks.update(v)
|
|
433
|
+
else:
|
|
434
|
+
flat_tasks[k] = v
|
|
435
|
+
return flat_tasks if flat_tasks else value
|
|
436
|
+
return value
|
|
437
|
+
|
|
438
|
+
# Check if this is a group task where get_task_dict returns subtasks directly
|
|
439
|
+
# This handles both cases:
|
|
440
|
+
# - 'arithmetic' returns {'arithmetic_1dc': task, 'arithmetic_2da': task, ...}
|
|
441
|
+
# - 'hendrycks_ethics' returns {'ethics_cm': task, 'ethics_justice': task, ...}
|
|
442
|
+
# Verify that values are actual Task objects to ensure this is a valid group task
|
|
443
|
+
if task_dict and len(task_dict) > 0:
|
|
444
|
+
from lm_eval.api.task import Task
|
|
445
|
+
# Check if at least one value is a Task object
|
|
446
|
+
if any(isinstance(v, Task) for v in task_dict.values()):
|
|
447
|
+
log.info(f"Task '{lm_eval_task_name}' is a group task with {len(task_dict)} subtasks: {list(task_dict.keys())}")
|
|
448
|
+
return task_dict
|
|
449
|
+
|
|
450
|
+
raise DataLoaderError(f"lm-eval task '{lm_eval_task_name}' not found (requested as '{task_name}').")
|
|
451
|
+
|
|
452
|
+
def _split_pairs(
|
|
453
|
+
self,
|
|
454
|
+
pairs: list[ContrastivePair],
|
|
455
|
+
split_ratio: float,
|
|
456
|
+
seed: int,
|
|
457
|
+
training_limit: int | None,
|
|
458
|
+
testing_limit: int | None,
|
|
459
|
+
) -> tuple[list[ContrastivePair], list[ContrastivePair]]:
|
|
460
|
+
"""
|
|
461
|
+
Split a list of ContrastivePairs into train/test sets.
|
|
462
|
+
|
|
463
|
+
arguments:
|
|
464
|
+
pairs: List of ContrastivePair to split.
|
|
465
|
+
split_ratio: Float in [0.0, 1.0] for the training set proportion.
|
|
466
|
+
seed: Random seed for shuffling.
|
|
467
|
+
training_limit: Optional max number of training pairs.
|
|
468
|
+
testing_limit: Optional max number of testing pairs.
|
|
469
|
+
|
|
470
|
+
returns:
|
|
471
|
+
A tuple of (train_pairs, test_pairs).
|
|
472
|
+
raises:
|
|
473
|
+
ValueError if split_ratio is not in [0.0, 1.0].
|
|
474
|
+
"""
|
|
475
|
+
if not pairs:
|
|
476
|
+
return [], []
|
|
477
|
+
from numpy.random import default_rng
|
|
478
|
+
|
|
479
|
+
idx = list(range(len(pairs)))
|
|
480
|
+
default_rng(seed).shuffle(idx)
|
|
481
|
+
cut = int(len(pairs) * split_ratio)
|
|
482
|
+
train_idx = set(idx[:cut])
|
|
483
|
+
|
|
484
|
+
train_pairs: list[ContrastivePair] = []
|
|
485
|
+
test_pairs: list[ContrastivePair] = []
|
|
486
|
+
for i in idx:
|
|
487
|
+
(train_pairs if i in train_idx else test_pairs).append(pairs[i])
|
|
488
|
+
|
|
489
|
+
if training_limit and training_limit > 0:
|
|
490
|
+
train_pairs = train_pairs[:training_limit]
|
|
491
|
+
if testing_limit and testing_limit > 0:
|
|
492
|
+
test_pairs = test_pairs[:testing_limit]
|
|
493
|
+
|
|
494
|
+
return train_pairs, test_pairs
|