wisent 0.7.701__py3-none-any.whl → 0.7.901__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wisent/__init__.py +1 -1
- wisent/core/activations/activation_cache.py +393 -0
- wisent/core/activations/activations.py +3 -3
- wisent/core/activations/activations_collector.py +9 -5
- wisent/core/activations/classifier_inference_strategy.py +12 -11
- wisent/core/activations/extraction_strategy.py +256 -84
- wisent/core/classifiers/classifiers/core/atoms.py +3 -2
- wisent/core/cli/__init__.py +2 -1
- wisent/core/cli/agent/apply_steering.py +5 -7
- wisent/core/cli/agent/train_classifier.py +19 -7
- wisent/core/cli/check_linearity.py +35 -3
- wisent/core/cli/cluster_benchmarks.py +4 -6
- wisent/core/cli/create_steering_vector.py +6 -4
- wisent/core/cli/diagnose_vectors.py +7 -4
- wisent/core/cli/estimate_unified_goodness_time.py +6 -4
- wisent/core/cli/generate_pairs_from_task.py +9 -56
- wisent/core/cli/geometry_search.py +137 -0
- wisent/core/cli/get_activations.py +1 -1
- wisent/core/cli/method_optimizer.py +4 -3
- wisent/core/cli/modify_weights.py +3 -2
- wisent/core/cli/optimize_sample_size.py +1 -1
- wisent/core/cli/optimize_steering.py +14 -16
- wisent/core/cli/optimize_weights.py +2 -1
- wisent/core/cli/preview_pairs.py +203 -0
- wisent/core/cli/steering_method_trainer.py +3 -3
- wisent/core/cli/tasks.py +19 -76
- wisent/core/cli/train_unified_goodness.py +3 -3
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
- wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
- wisent/core/data_loaders/loaders/lm_loader.py +12 -1
- wisent/core/geometry_runner.py +995 -0
- wisent/core/geometry_search_space.py +237 -0
- wisent/core/hyperparameter_optimizer.py +1 -1
- wisent/core/main.py +3 -0
- wisent/core/models/core/atoms.py +5 -3
- wisent/core/models/wisent_model.py +1 -1
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
- wisent/core/parser_arguments/check_linearity_parser.py +12 -2
- wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +2 -2
- wisent/core/parser_arguments/geometry_search_parser.py +61 -0
- wisent/core/parser_arguments/main_parser.py +8 -0
- wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
- wisent/core/steering.py +5 -3
- wisent/core/steering_methods/methods/hyperplane.py +2 -1
- wisent/core/synthetic/generators/nonsense_generator.py +30 -18
- wisent/core/trainers/steering_trainer.py +2 -2
- wisent/core/utils/device.py +27 -27
- wisent/core/utils/layer_combinations.py +70 -0
- wisent/examples/__init__.py +1 -0
- wisent/examples/scripts/__init__.py +1 -0
- wisent/examples/scripts/count_all_benchmarks.py +121 -0
- wisent/examples/scripts/discover_directions.py +469 -0
- wisent/examples/scripts/extract_benchmark_info.py +71 -0
- wisent/examples/scripts/generate_paper_data.py +384 -0
- wisent/examples/scripts/intervention_validation.py +626 -0
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
- wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
- wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
- wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
- wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
- wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
- wisent/examples/scripts/search_all_short_names.py +31 -0
- wisent/examples/scripts/test_all_benchmarks.py +138 -0
- wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
- wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
- wisent/examples/scripts/test_nonsense_baseline.py +261 -0
- wisent/examples/scripts/test_one_benchmark.py +324 -0
- wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
- wisent/examples/scripts/threshold_analysis.py +434 -0
- wisent/examples/scripts/visualization_gallery.py +582 -0
- wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
- wisent/parameters/lm_eval/category_directions.json +137 -0
- wisent/parameters/lm_eval/repair_plan.json +282 -0
- wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
- wisent/parameters/lm_eval/working_benchmarks.json +206 -0
- wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
- wisent/tests/test_detector_accuracy.py +1 -1
- wisent/tests/visualize_geometry.py +1 -1
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/RECORD +329 -295
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
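The hunks below repeat one change across the lm_eval task extractors: the pre-formatted multiple-choice prompt (question text with "A."/"B." options baked in) is removed, and the raw question or a plain prompt is passed to _build_pair instead. A minimal sketch of that before/after pattern follows, assuming the _build_pair(question, correct, incorrect, metadata) call shape implied by the hunks; ExtractorSketch and its methods are illustrative stand-ins, not the package's actual classes.

class ExtractorSketch:
    """Hypothetical stand-in for an LMEvalBenchmarkExtractor subclass."""

    def _build_pair(self, question, correct, incorrect, metadata):
        # Assumed call shape, inferred from the hunks below.
        return {"question": question, "correct": correct,
                "incorrect": incorrect, "metadata": metadata}

    def extract_old(self, question, choices, answer_idx, label):
        # 0.7.701-style: both options were baked into the prompt text.
        correct = choices[answer_idx]
        incorrect = choices[(answer_idx + 1) % len(choices)]
        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
        return self._build_pair(formatted_question, correct, incorrect, {"label": label})

    def extract_new(self, question, choices, answer_idx, label):
        # 0.7.901-style: the raw question is passed; choice formatting is left
        # to downstream pair-generation code.
        correct = choices[answer_idx]
        incorrect = choices[(answer_idx + 1) % len(choices)]
        return self._build_pair(question, correct, incorrect, {"label": label})

For example, ExtractorSketch().extract_new("2+2?", ["4", "5"], 0, "mmlu") yields a pair whose question field is just "2+2?" rather than an A./B.-formatted block.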
@@ -123,14 +123,12 @@ class Llama3Extractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "llama3",
         }
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -123,14 +123,12 @@ class LmSynevalExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "lm_syneval",
         }
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -86,14 +86,14 @@ class LogiQAExtractor(LMEvalBenchmarkExtractor):
         incorrect = options[(label_idx+1)%len(options)]
 
         question = f"{question}"
-
+        prompt = f"Passage: {context}\nQuestion: {question}"
 
         metadata = {
             "label": "logiqa",
         }
 
         return self._build_pair(
-            question=
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -85,14 +85,14 @@ class LogiQA2Extractor(LMEvalBenchmarkExtractor):
         incorrect = options[(answer+1)%len(options)]
 
         question = f"{question}"
-
+        prompt = f"Passage: {text}\nQuestion: {question}"
 
         metadata = {
             "label": "logiqa2",
         }
 
         return self._build_pair(
-            question=
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -123,14 +123,12 @@ class LongbenchExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "longbench",
         }
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -123,14 +123,12 @@ class Longbenchv2Extractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "longbenchv2",
         }
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -115,7 +115,7 @@ class MastermindExtractor(LMEvalBenchmarkExtractor):
         }
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -174,14 +174,12 @@ class MastermindExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "mastermind",
         }
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -83,7 +83,7 @@ class MCTACOExtractor(LMEvalBenchmarkExtractor):
             )
             return None
 
-
+        prompt = f"{sentence}\nQuestion: {question}\nAnswer: {answer}\nPlausible?"
 
         correct = "Yes" if label == 1 else "No"
         incorrect = "No" if label == 1 else "Yes"
@@ -93,7 +93,7 @@ class MCTACOExtractor(LMEvalBenchmarkExtractor):
         }
 
         return self._build_pair(
-            question=
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -131,7 +131,7 @@ class MedConceptsQaExtractor(LMEvalBenchmarkExtractor):
 
         # For this format, the response should be just the letter
         return self._build_pair(
-            question=
+            question=question,
             correct=answer_key,
             incorrect=chr(ord('A') + incorrect_idx),
             metadata=metadata,
@@ -195,14 +195,12 @@ class MedConceptsQaExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "med_concepts_qa",
         }
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -151,14 +151,12 @@ class MeddialogExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "meddialog",
         }
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -80,12 +80,10 @@ class MedicalExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "medical"}
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -140,14 +140,12 @@ class MedmcqaExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "medmcqa",
         }
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -89,14 +89,14 @@ class MedQAExtractor(LMEvalBenchmarkExtractor):
         correct = endings[label]
         incorrect = endings[(label + 1) % 4]
 
-
+        prompt = f"Question: {sent1}"
 
         metadata = {
             "label": "medqa",
         }
 
         return self._build_pair(
-            question=
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -67,11 +67,11 @@ class MelaExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = 1 - answer_idx
         incorrect = choices[incorrect_idx]
 
-
+        prompt = f"Sentence: {sentence}\nDetermine whether this sentence is acceptable or unacceptable?"
         metadata = {"label": "mela"}
 
         return self._build_pair(
-            question=
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -125,14 +125,12 @@ class MetabenchExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "metabench",
         }
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -143,14 +143,12 @@ class MinervaMathExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "minerva_math",
         }
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -115,14 +115,12 @@ class MMLUExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "mmlu",
         }
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -154,20 +154,19 @@ class MmlusrExtractor(LMEvalBenchmarkExtractor):
             )
             return None
 
-        # Build prompt
+        # Build prompt - raw question without MC formatting
         correct = choices[answer_idx]
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-
-        formatted_question = f"{question}\nA. {choices[0]}\nB. {choices[1]}\nC. {choices[2]}\nD. {choices[3]}\nAnswer:"
+        prompt = question
 
         metadata = {
             "label": "mmlusr",
         }
 
         return self._build_pair(
-            question=
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -79,7 +79,7 @@ class MRPCExtractor(LMEvalBenchmarkExtractor):
             )
             return None
 
-
+        prompt = f"Sentence 1: {sentence1}\nSentence 2: {sentence2}. Do both sequences mean the same thing?"
 
         correct = "Yes" if label == 1 else "No"
         incorrect = "No" if label == 1 else "Yes"
@@ -89,7 +89,7 @@ class MRPCExtractor(LMEvalBenchmarkExtractor):
         }
 
         return self._build_pair(
-            question=
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -100,11 +100,8 @@ class MultiblimpExtractor(LMEvalBenchmarkExtractor):
             log.debug("Skipping doc with missing sen/wrong_sen", extra={"doc": doc})
             return None
 
-        #
-
-        prompt = "Which sentence is grammatically correct?\nA. {}\nB. {}".format(
-            correct_sentence, incorrect_sentence
-        )
+        # Raw prompt without A./B. formatting
+        prompt = "Which sentence is grammatically correct?"
 
         metadata = {"label": "multiblimp"}
 
@@ -82,7 +82,7 @@ class MultiRCExtractor(LMEvalBenchmarkExtractor):
             )
             return None
 
-
+        prompt = f"{paragraph}\nQuestion: {question}\nAnswer: {answer}\nIs this answer correct?"
 
         correct = "Yes" if label == 1 else "No"
         incorrect = "No" if label == 1 else "Yes"
@@ -92,7 +92,7 @@ class MultiRCExtractor(LMEvalBenchmarkExtractor):
         }
 
         return self._build_pair(
-            question=
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -84,14 +84,14 @@ class MutualExtractor(LMEvalBenchmarkExtractor):
         correct = options[answer_idx]
         incorrect = options[(answer_idx+1)%len(options)]
 
-
+        prompt = article
 
         metadata = {
             "label": "mutual",
         }
 
         return self._build_pair(
-            question=
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -80,12 +80,10 @@ class NonExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "non"}
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -144,14 +144,12 @@ class NorevalExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "noreval",
         }
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -128,14 +128,12 @@ class NorevalExactMatchExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "noreval_exact",
         }
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -136,14 +136,12 @@ class NorevalGenerationExactMatchExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "noreval_gen_exact",
         }
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -107,12 +107,12 @@ class NorevalMultipleChoiceExtractor(LMEvalBenchmarkExtractor):
             log.debug("Skipping doc due to empty correct/wrong fields", extra={"doc": doc})
             return None
 
-
+        prompt = f"Which sentence is grammatically correct?"
 
         metadata = {"label": "noreval_ncb"}
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -151,12 +151,10 @@ class NorevalMultipleChoiceExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[correct_idx]).strip()
         incorrect = str(choices[incorrect_idx]).strip()
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {"label": "noreval_truthfulqa"}
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -192,12 +190,10 @@ class NorevalMultipleChoiceExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choice_texts)
         incorrect = str(choice_texts[incorrect_idx]).strip()
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {"label": "noreval_nrk_quiz"}
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py CHANGED
@@ -102,12 +102,12 @@ class NorevalMultipleChoiceExtractor(LMEvalBenchmarkExtractor):
             log.debug("Skipping doc due to empty correct/wrong fields", extra={"doc": doc})
             return None
 
-
+        prompt = f"Which sentence is grammatically correct?"
 
        metadata = {"label": "noreval_ncb"}
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -146,12 +146,10 @@ class NorevalMultipleChoiceExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[correct_idx]).strip()
         incorrect = str(choices[incorrect_idx]).strip()
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {"label": "noreval_truthfulqa"}
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -187,12 +185,10 @@ class NorevalMultipleChoiceExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choice_texts)
         incorrect = str(choice_texts[incorrect_idx]).strip()
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {"label": "noreval_nrk_quiz"}
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -106,14 +106,14 @@ class NQOpenExtractor(LMEvalBenchmarkExtractor):
         if incorrect == correct:
             incorrect += "k"
 
-
+        prompt = f"Question: {question}\nAnswer:"
 
         metadata = {
             "label": "nq_open",
         }
 
         return self._build_pair(
-            question=
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -138,14 +138,12 @@ class OkapiArcMultilingualExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "okapi/arc_multilingual",
         }
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py CHANGED
@@ -145,14 +145,12 @@ class OkapiHellaswagMultilingualExtractor(LMEvalBenchmarkExtractor):
             )
             return None
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "okapi/hellaswag_multilingual",
         }
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,
@@ -133,14 +133,12 @@ class OkapiMmluMultilingualExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]
 
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "okapi/mmlu_multilingual",
         }
 
         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,