wisent 0.7.701__py3-none-any.whl → 0.7.901__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wisent/__init__.py +1 -1
- wisent/core/activations/activation_cache.py +393 -0
- wisent/core/activations/activations.py +3 -3
- wisent/core/activations/activations_collector.py +9 -5
- wisent/core/activations/classifier_inference_strategy.py +12 -11
- wisent/core/activations/extraction_strategy.py +256 -84
- wisent/core/classifiers/classifiers/core/atoms.py +3 -2
- wisent/core/cli/__init__.py +2 -1
- wisent/core/cli/agent/apply_steering.py +5 -7
- wisent/core/cli/agent/train_classifier.py +19 -7
- wisent/core/cli/check_linearity.py +35 -3
- wisent/core/cli/cluster_benchmarks.py +4 -6
- wisent/core/cli/create_steering_vector.py +6 -4
- wisent/core/cli/diagnose_vectors.py +7 -4
- wisent/core/cli/estimate_unified_goodness_time.py +6 -4
- wisent/core/cli/generate_pairs_from_task.py +9 -56
- wisent/core/cli/geometry_search.py +137 -0
- wisent/core/cli/get_activations.py +1 -1
- wisent/core/cli/method_optimizer.py +4 -3
- wisent/core/cli/modify_weights.py +3 -2
- wisent/core/cli/optimize_sample_size.py +1 -1
- wisent/core/cli/optimize_steering.py +14 -16
- wisent/core/cli/optimize_weights.py +2 -1
- wisent/core/cli/preview_pairs.py +203 -0
- wisent/core/cli/steering_method_trainer.py +3 -3
- wisent/core/cli/tasks.py +19 -76
- wisent/core/cli/train_unified_goodness.py +3 -3
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
- wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
- wisent/core/data_loaders/loaders/lm_loader.py +12 -1
- wisent/core/geometry_runner.py +995 -0
- wisent/core/geometry_search_space.py +237 -0
- wisent/core/hyperparameter_optimizer.py +1 -1
- wisent/core/main.py +3 -0
- wisent/core/models/core/atoms.py +5 -3
- wisent/core/models/wisent_model.py +1 -1
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
- wisent/core/parser_arguments/check_linearity_parser.py +12 -2
- wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +2 -2
- wisent/core/parser_arguments/geometry_search_parser.py +61 -0
- wisent/core/parser_arguments/main_parser.py +8 -0
- wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
- wisent/core/steering.py +5 -3
- wisent/core/steering_methods/methods/hyperplane.py +2 -1
- wisent/core/synthetic/generators/nonsense_generator.py +30 -18
- wisent/core/trainers/steering_trainer.py +2 -2
- wisent/core/utils/device.py +27 -27
- wisent/core/utils/layer_combinations.py +70 -0
- wisent/examples/__init__.py +1 -0
- wisent/examples/scripts/__init__.py +1 -0
- wisent/examples/scripts/count_all_benchmarks.py +121 -0
- wisent/examples/scripts/discover_directions.py +469 -0
- wisent/examples/scripts/extract_benchmark_info.py +71 -0
- wisent/examples/scripts/generate_paper_data.py +384 -0
- wisent/examples/scripts/intervention_validation.py +626 -0
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
- wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
- wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
- wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
- wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
- wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
- wisent/examples/scripts/search_all_short_names.py +31 -0
- wisent/examples/scripts/test_all_benchmarks.py +138 -0
- wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
- wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
- wisent/examples/scripts/test_nonsense_baseline.py +261 -0
- wisent/examples/scripts/test_one_benchmark.py +324 -0
- wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
- wisent/examples/scripts/threshold_analysis.py +434 -0
- wisent/examples/scripts/visualization_gallery.py +582 -0
- wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
- wisent/parameters/lm_eval/category_directions.json +137 -0
- wisent/parameters/lm_eval/repair_plan.json +282 -0
- wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
- wisent/parameters/lm_eval/working_benchmarks.json +206 -0
- wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
- wisent/tests/test_detector_accuracy.py +1 -1
- wisent/tests/visualize_geometry.py +1 -1
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/RECORD +329 -295
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
@@ -87,12 +87,10 @@ class FlanExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "flan"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -173,14 +173,12 @@ class FrenchBenchExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "french_bench",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -126,14 +126,12 @@ class GalicianBenchExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "galician_bench",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -109,12 +109,12 @@ class GaokaoExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-
+        prompt = f"Question: {query}"
 
         metadata = {"label": "gaokao"}
 
         return self._build_pair(
-            question=
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -91,12 +91,10 @@ class GlianorexExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "glianorex"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -142,14 +142,12 @@ class GlobalMmluExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "global_mmlu",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -123,14 +123,12 @@ class GlobalPiqaExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "global_piqa",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -83,12 +83,10 @@ class Gpt3Extractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "gpt3"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -155,14 +155,12 @@ class GroundcocoaExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "groundcocoa",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -123,14 +123,12 @@ class HaeraeExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "haerae",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -83,14 +83,14 @@ class HeadQAExtractor(LMEvalBenchmarkExtractor):
         correct = answers[answer_idx]
         incorrect = answers[(answer_idx+1)%len(answers)]
 
-
+        prompt = f"Question: {qtext}\nAnswer:"
 
         metadata = {
             "label": "headqa",
         }
 
         return self._build_pair(
-            question=
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -96,14 +96,14 @@ class HellaSwagExtractor(LMEvalBenchmarkExtractor):
         incorrect = max(incorrect_endings, key=len) if incorrect_endings else endings[(label+1)%len(endings)]
 
         question = f"{query}"
-
+        prompt = f"{question}"
 
         metadata = {
             "label": "hellaswag",
         }
 
         return self._build_pair(
-            question=
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -122,12 +122,10 @@ class HendrycksEthicsExtractor(LMEvalBenchmarkExtractor):
         if not activity or not baseline:
             return None
 
-
-
-
-
-        correct = "A"
-        incorrect = "B"
+        # Raw prompt - activity is correct, baseline is incorrect
+        question = "Which action results in greater overall happiness?"
+        correct = activity
+        incorrect = baseline
 
         metadata = {"label": "hendrycks_ethics"}
 
@@ -196,14 +194,12 @@ class HendrycksEthicsExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "hendrycks_ethics",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -138,39 +138,86 @@ class HendrycksMathExtractor(LMEvalBenchmarkExtractor):
 
             return None
 
-    def _create_incorrect_answer(self, correct: str) -> str:
+    def _create_incorrect_answer(self, correct: str, doc: dict = None) -> str:
         """
-        Create
+        Create a meaningful incorrect answer by using different plausible wrong values.
+
+        Strategy:
+        1. For integers: use a different integer (multiply by 2, subtract, etc.)
+        2. For fractions: change numerator/denominator in a plausible way
+        3. For expressions: provide a structurally different but plausible answer
 
         Args:
             correct: The correct answer
+            doc: Optional doc for context
 
         Returns:
-
+            A plausible but incorrect answer
         """
-
+        import random
+        random.seed(hash(correct) % (2**32))  # Deterministic based on answer
+
+        # Try to parse as number and create plausible wrong answer
         try:
-            # Remove common LaTeX/math formatting
             clean = correct.replace('$', '').replace(',', '').replace('^\\circ', '').replace('^{\\circ}', '').strip()
 
             # Try integer
             num = int(clean)
-
+            # Use various wrong transformations
+            wrong_transforms = [
+                num * 2,  # doubled
+                num // 2 if num > 1 else num * 3,  # halved or tripled
+                num - 1 if num > 0 else num + 2,  # off by different amount
+                num + 10,  # significantly different
+                abs(num) * -1 if num > 0 else abs(num),  # sign flip
+            ]
+            return str(random.choice(wrong_transforms))
         except ValueError:
             try:
                 # Try float
                 num = float(clean)
-
+                wrong_transforms = [
+                    num * 2,
+                    num / 2,
+                    num - 0.5,
+                    num + 0.25,
+                    round(num) if num != round(num) else num + 0.5,
+                ]
+                return str(random.choice(wrong_transforms))
             except ValueError:
-
-
-
-
-
-
-
-                #
-
+                pass
+
+        # For fractions like \frac{8}{17}, create plausible wrong fraction
+        frac_match = re.match(r'\\frac\{(\d+)\}\{(\d+)\}', correct)
+        if frac_match:
+            num, denom = int(frac_match.group(1)), int(frac_match.group(2))
+            wrong_fracs = [
+                f"\\frac{{{denom}}}{{{num}}}",  # inverted
+                f"\\frac{{{num}}}{{{denom + 1}}}",  # different denominator
+                f"\\frac{{{num * 2}}}{{{denom}}}",  # doubled numerator
+            ]
+            return random.choice(wrong_fracs)
+
+        # For sqrt expressions
+        sqrt_match = re.search(r'\\sqrt\{(\d+)\}', correct)
+        if sqrt_match:
+            val = int(sqrt_match.group(1))
+            wrong_vals = [val + 1, val - 1 if val > 1 else val + 2, val * 2]
+            return correct.replace(f"\\sqrt{{{val}}}", f"\\sqrt{{{random.choice(wrong_vals)}}}")
+
+        # For pi expressions
+        if '\\pi' in correct:
+            if '2\\pi' in correct:
+                return correct.replace('2\\pi', '\\pi')
+            elif '\\pi' in correct:
+                return correct.replace('\\pi', '2\\pi')
+
+        # For other symbolic answers, provide common wrong alternatives
+        common_wrong = ['0', '1', '-1', '2', '\\infty', 'undefined']
+        if correct not in common_wrong:
+            return random.choice([w for w in common_wrong if w != correct])
+
+        return "incorrect"
 
     @staticmethod
     def _build_pair(
@@ -150,14 +150,12 @@ class HistoiresMoralesExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "histoires_morales",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -174,14 +174,12 @@ class Hrm8kExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "hrm8k",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -123,14 +123,12 @@ class HumanevalInfillingExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "humaneval_infilling",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -123,14 +123,12 @@ class IcelandicWinograndeExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "icelandic_winogrande",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -80,12 +80,10 @@ class InverseExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "inverse"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -163,14 +163,12 @@ class InverseScalingExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "inverse_scaling",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -80,12 +80,10 @@ class JaExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "ja"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -126,14 +126,12 @@ class JapaneseLeaderboardExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "japanese_leaderboard",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -103,7 +103,7 @@ class JapaneseLeaderboardMultipleChoiceExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
 
-
+        prompt = f"Question: {question}"
 
         positive_response = PositiveResponse(model_response=correct)
         negative_response = NegativeResponse(model_response=incorrect)
@@ -139,14 +139,12 @@ class KmmluExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "kmmlu",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -136,14 +136,12 @@ class KobestExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "kobest",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -118,29 +118,17 @@ class KormedmcqaExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        #
-
-            f"{question}\n"
-            f"A. {choices[0]}\n"
-            f"B. {choices[1]}\n"
-            f"C. {choices[2]}\n"
-            f"D. {choices[3]}\n"
-            f"E. {choices[4]}\n"
-            f"정답:"
-        )
+        # Raw prompt without MC formatting
+        prompt = question
 
         metadata = {
             "label": "kormedmcqa",
         }
 
-        # The correct answer is the letter (A-E)
-        correct_letter = chr(ord('A') + answer_idx)
-        incorrect_letter = chr(ord('A') + incorrect_idx)
-
         return self._build_pair(
-            question=
-            correct=
-            incorrect=
+            question=prompt,
+            correct=correct,
+            incorrect=incorrect,
             metadata=metadata,
         )
 
@@ -156,14 +156,12 @@ class LambadaClozeExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "lambada_cloze",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -156,14 +156,12 @@ class LambadaMultilingualExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "lambada_multilingual",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -80,12 +80,10 @@ class LawExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "law"}
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -165,14 +165,12 @@ class LeaderboardExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "leaderboard",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -174,14 +174,12 @@ class LingolyExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "lingoly",
         }
 
         return self._build_pair(
-            question=formatted_question,
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,