wisent 0.7.701__py3-none-any.whl → 0.7.901__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wisent/__init__.py +1 -1
- wisent/core/activations/activation_cache.py +393 -0
- wisent/core/activations/activations.py +3 -3
- wisent/core/activations/activations_collector.py +9 -5
- wisent/core/activations/classifier_inference_strategy.py +12 -11
- wisent/core/activations/extraction_strategy.py +256 -84
- wisent/core/classifiers/classifiers/core/atoms.py +3 -2
- wisent/core/cli/__init__.py +2 -1
- wisent/core/cli/agent/apply_steering.py +5 -7
- wisent/core/cli/agent/train_classifier.py +19 -7
- wisent/core/cli/check_linearity.py +35 -3
- wisent/core/cli/cluster_benchmarks.py +4 -6
- wisent/core/cli/create_steering_vector.py +6 -4
- wisent/core/cli/diagnose_vectors.py +7 -4
- wisent/core/cli/estimate_unified_goodness_time.py +6 -4
- wisent/core/cli/generate_pairs_from_task.py +9 -56
- wisent/core/cli/geometry_search.py +137 -0
- wisent/core/cli/get_activations.py +1 -1
- wisent/core/cli/method_optimizer.py +4 -3
- wisent/core/cli/modify_weights.py +3 -2
- wisent/core/cli/optimize_sample_size.py +1 -1
- wisent/core/cli/optimize_steering.py +14 -16
- wisent/core/cli/optimize_weights.py +2 -1
- wisent/core/cli/preview_pairs.py +203 -0
- wisent/core/cli/steering_method_trainer.py +3 -3
- wisent/core/cli/tasks.py +19 -76
- wisent/core/cli/train_unified_goodness.py +3 -3
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
- wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
- wisent/core/data_loaders/loaders/lm_loader.py +12 -1
- wisent/core/geometry_runner.py +995 -0
- wisent/core/geometry_search_space.py +237 -0
- wisent/core/hyperparameter_optimizer.py +1 -1
- wisent/core/main.py +3 -0
- wisent/core/models/core/atoms.py +5 -3
- wisent/core/models/wisent_model.py +1 -1
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
- wisent/core/parser_arguments/check_linearity_parser.py +12 -2
- wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +2 -2
- wisent/core/parser_arguments/geometry_search_parser.py +61 -0
- wisent/core/parser_arguments/main_parser.py +8 -0
- wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
- wisent/core/steering.py +5 -3
- wisent/core/steering_methods/methods/hyperplane.py +2 -1
- wisent/core/synthetic/generators/nonsense_generator.py +30 -18
- wisent/core/trainers/steering_trainer.py +2 -2
- wisent/core/utils/device.py +27 -27
- wisent/core/utils/layer_combinations.py +70 -0
- wisent/examples/__init__.py +1 -0
- wisent/examples/scripts/__init__.py +1 -0
- wisent/examples/scripts/count_all_benchmarks.py +121 -0
- wisent/examples/scripts/discover_directions.py +469 -0
- wisent/examples/scripts/extract_benchmark_info.py +71 -0
- wisent/examples/scripts/generate_paper_data.py +384 -0
- wisent/examples/scripts/intervention_validation.py +626 -0
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
- wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
- wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
- wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
- wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
- wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
- wisent/examples/scripts/search_all_short_names.py +31 -0
- wisent/examples/scripts/test_all_benchmarks.py +138 -0
- wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
- wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
- wisent/examples/scripts/test_nonsense_baseline.py +261 -0
- wisent/examples/scripts/test_one_benchmark.py +324 -0
- wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
- wisent/examples/scripts/threshold_analysis.py +434 -0
- wisent/examples/scripts/visualization_gallery.py +582 -0
- wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
- wisent/parameters/lm_eval/category_directions.json +137 -0
- wisent/parameters/lm_eval/repair_plan.json +282 -0
- wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
- wisent/parameters/lm_eval/working_benchmarks.json +206 -0
- wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
- wisent/tests/test_detector_accuracy.py +1 -1
- wisent/tests/visualize_geometry.py +1 -1
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/RECORD +329 -295
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py

@@ -168,14 +168,12 @@ class XcopaExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]

-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "xcopa",
         }

         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py

@@ -120,12 +120,10 @@ class XlsumExtractor(LMEvalBenchmarkExtractor):
         correct = str(choices[answer_idx]).strip()
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = str(choices[incorrect_idx]).strip()
-
-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
         metadata = {"label": "xlsum"}

         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py

@@ -102,14 +102,14 @@ class XNLIExtractor(LMEvalBenchmarkExtractor):
         correct = labels[label]
         incorrect = labels[(label+1)%3]

-
+        prompt = f"Decide the relationship of the hypothesis '{hypothesis}' to the premise '{premise}"

         metadata = {
             "label": "xnli",
         }

         return self._build_pair(
-            question=
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py

@@ -110,7 +110,7 @@ class XquadExtractor(LMEvalBenchmarkExtractor):

         metadata = {"label": "xquad"}
         return self._build_pair(
-            question=
+            question=question,
             correct=correct_answer,
             incorrect=incorrect_answer,
             metadata=metadata,

@@ -174,14 +174,12 @@ class XquadExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]

-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "xquad",
         }

         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py

@@ -99,15 +99,14 @@ class XStoryClozeExtractor(LMEvalBenchmarkExtractor):
         correct = endings[answer]
         incorrect = endings[(answer+1)%len(endings)]

-
-        formatted_question = f"{formatted_question}\n \nA. {incorrect}\nB. {correct}"
+        prompt = " ".join(s.strip() for s in inputs if s)

         metadata = {
             "label": "xstorycloze",
         }

         return self._build_pair(
-            question=
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py

@@ -95,14 +95,14 @@ class XWinogradExtractor(LMEvalBenchmarkExtractor):
         correct = options[answer]
         incorrect = options[(answer+1)%len(options)]

-
+        prompt = f"Fill in the blank: {sentence}"

         metadata = {
             "label": "xwinograd",
         }

         return self._build_pair(
-            question=
+            question=prompt,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,

wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py

@@ -126,14 +126,12 @@ class ZhoblimpExtractor(LMEvalBenchmarkExtractor):
         incorrect_idx = (answer_idx + 1) % len(choices)
         incorrect = choices[incorrect_idx]

-        formatted_question = f"Question: {question}\nA. {incorrect}\nB. {correct}"
-
         metadata = {
             "label": "zhoblimp",
         }

         return self._build_pair(
-            question=
+            question=question,
             correct=correct,
             incorrect=incorrect,
             metadata=metadata,

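The eight extractor hunks above all make the same change: the pre-formatted "A./B." menu (formatted_question) is dropped, and _build_pair now receives the raw question (or a purpose-built prompt) together with the bare correct/incorrect completions. A minimal sketch of the resulting pair shape, using a hypothetical stand-in for the package's ContrastivePair atom (not wisent's actual class):

    from dataclasses import dataclass, field

    @dataclass(frozen=True)
    class ContrastivePair:  # hypothetical stand-in, for illustration only
        question: str
        correct: str
        incorrect: str
        metadata: dict = field(default_factory=dict)

    def build_pair(question: str, correct: str, incorrect: str, label: str) -> ContrastivePair:
        # Post-change shape: raw prompt in, raw completions out; no
        # "A. ... / B. ..." menu is folded into the question text.
        return ContrastivePair(question, correct, incorrect, {"label": label})

    pair = build_pair(
        "The man broke his toe. What was the cause?",
        "He dropped a hammer on his foot.",
        "He got a hole in his sock.",
        label="xcopa",
    )
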
wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py

@@ -1,5 +1,6 @@
 from __future__ import annotations

+import random
 from typing import TYPE_CHECKING

 from wisent.core.contrastive_pairs.lm_eval_pairs.lm_extractor_registry import get_extractor

@@ -10,17 +11,178 @@ if TYPE_CHECKING:
     from lm_eval.api.task import ConfigurableTask
     from wisent.core.contrastive_pairs.core.pair import ContrastivePair

-__all__ = ["build_contrastive_pairs"]
+__all__ = ["build_contrastive_pairs", "lm_build_contrastive_pairs"]
 _LOG = setup_logger(__name__)


+def _flatten_task_dict(task_dict: dict, prefix: str = "") -> list[tuple[str, "ConfigurableTask"]]:
+    """
+    Recursively flatten nested group tasks into a list of (name, ConfigurableTask) tuples.
+
+    arguments:
+        task_dict: Dict of task_name -> ConfigurableTask or nested dict
+        prefix: Prefix for nested task names
+
+    returns:
+        List of (full_task_name, ConfigurableTask) tuples (leaf tasks only)
+    """
+    from lm_eval.api.task import ConfigurableTask
+
+    result = []
+    for name, task in task_dict.items():
+        full_name = f"{prefix}/{name}" if prefix else name
+        if isinstance(task, ConfigurableTask):
+            result.append((full_name, task))
+        elif isinstance(task, dict):
+            # Nested group - recurse
+            result.extend(_flatten_task_dict(task, full_name))
+    return result
+
+
+def _add_evaluator_to_pairs(
+    pairs: list["ContrastivePair"],
+    evaluator_name: str | None,
+    task_name: str,
+) -> list["ContrastivePair"]:
+    """Add evaluator_name and task_name to each pair's metadata."""
+    from dataclasses import replace
+
+    result = []
+    for pair in pairs:
+        metadata = dict(pair.metadata) if pair.metadata else {}
+        metadata["evaluator_name"] = evaluator_name
+        metadata["source_task"] = task_name
+        result.append(replace(pair, metadata=metadata))
+    return result
+
+
+def build_contrastive_pairs(
+    task_name: str,
+    limit: int | None = None,
+) -> list["ContrastivePair"]:
+    """
+    Unified loader for contrastive pairs - handles both HuggingFace and lm-eval tasks.
+
+    Automatically:
+    - Detects if task is HF or lm-eval
+    - Handles group tasks (including nested groups) by sampling from all subtasks
+    - Adds evaluator_name to each pair's metadata
+
+    arguments:
+        task_name:
+            Name of the benchmark/task (e.g., "winogrande", "mmlu", "humaneval").
+        limit:
+            Optional upper bound on the number of pairs to return.
+            Values <= 0 are treated as "no limit".
+
+    returns:
+        A list of ContrastivePair objects, each with metadata containing
+        'evaluator_name' and 'source_task'.
+    """
+    log = bind(_LOG, task=task_name or "unknown")
+    log.info("Building contrastive pairs (unified)", extra={"limit": limit})
+
+    # Normalize limit
+    max_items = None if (limit is None or limit <= 0) else int(limit)
+
+    # Get extractor
+    extractor = get_extractor(task_name)
+    log.info("Using extractor", extra={"extractor": extractor.__class__.__name__})
+
+    # Get evaluator_name from extractor
+    evaluator_name = getattr(extractor, 'evaluator_name', None)
+
+    # HuggingFace extractor - load directly
+    if isinstance(extractor, HuggingFaceBenchmarkExtractor):
+        log.info("HuggingFace task - loading directly")
+        pairs = extractor.extract_contrastive_pairs(limit=max_items)
+        return _add_evaluator_to_pairs(pairs, evaluator_name, task_name)
+
+    # lm-eval extractor - need to load task
+    log.info("lm-eval task - loading via LMEvalDataLoader")
+    from wisent.core.data_loaders.loaders.lm_loader import LMEvalDataLoader
+
+    loader = LMEvalDataLoader()
+    try:
+        task_obj = loader.load_lm_eval_task(task_name)
+    except Exception as e:
+        log.error(f"Failed to load lm-eval task: {e}")
+        raise
+
+    # Single task (ConfigurableTask)
+    from lm_eval.api.task import ConfigurableTask
+    if isinstance(task_obj, ConfigurableTask):
+        log.info("Single task")
+        pairs = extractor.extract_contrastive_pairs(task_obj, limit=max_items)
+        return _add_evaluator_to_pairs(pairs, evaluator_name, task_name)
+
+    # Group task (dict) - flatten and sample from all subtasks
+    if isinstance(task_obj, dict):
+        leaf_tasks = _flatten_task_dict(task_obj)
+        log.info(f"Group task with {len(leaf_tasks)} leaf subtasks")
+
+        if not leaf_tasks:
+            log.warning("No leaf tasks found in group")
+            return []
+
+        # Shuffle to get random sampling across subtasks
+        random.shuffle(leaf_tasks)
+
+        # Calculate pairs per subtask
+        if max_items is None:
+            pairs_per_task = None
+        else:
+            # Distribute limit across subtasks, minimum 1 per task
+            pairs_per_task = max(1, max_items // len(leaf_tasks))
+
+        all_pairs = []
+        for subtask_name, subtask in leaf_tasks:
+            try:
+                # Get the leaf task name (last part after /)
+                leaf_name = subtask_name.split("/")[-1] if "/" in subtask_name else subtask_name
+
+                # Try to get extractor for the specific subtask first
+                try:
+                    subtask_extractor = get_extractor(leaf_name)
+                except:
+                    # Fall back to parent extractor
+                    subtask_extractor = extractor
+
+                subtask_evaluator = getattr(subtask_extractor, 'evaluator_name', evaluator_name)
+
+                subtask_pairs = subtask_extractor.extract_contrastive_pairs(subtask, limit=pairs_per_task)
+                subtask_pairs = _add_evaluator_to_pairs(subtask_pairs, subtask_evaluator, subtask_name)
+                all_pairs.extend(subtask_pairs)
+
+                # Stop if we have enough
+                if max_items is not None and len(all_pairs) >= max_items:
+                    break
+            except Exception as e:
+                log.warning(f"Failed to extract from subtask {subtask_name}: {e}")
+                continue
+
+        # Shuffle final result and trim to limit
+        random.shuffle(all_pairs)
+        if max_items is not None:
+            all_pairs = all_pairs[:max_items]
+
+        log.info(f"Extracted {len(all_pairs)} pairs from group task")
+        return all_pairs
+
+    log.error(f"Unexpected task_obj type: {type(task_obj)}")
+    return []
+
+
 def lm_build_contrastive_pairs(
     task_name: str,
-    lm_eval_task: ConfigurableTask | None,
+    lm_eval_task: "ConfigurableTask | None",
     limit: int | None = None,
-) -> list[ContrastivePair]:
+) -> list["ContrastivePair"]:
     """
-
+    Legacy function - resolve the task's extractor and return contrastive pairs.
+
+    For new code, prefer using build_contrastive_pairs() which handles
+    task loading automatically.

     arguments:
         task_name:

@@ -47,10 +209,15 @@ def lm_build_contrastive_pairs(
     max_items = None if (limit is None or limit <= 0) else int(limit)

     log.info("Extracting contrastive pairs", extra={"max_items": max_items})
+
+    # Get evaluator_name from extractor
+    evaluator_name = getattr(extractor, 'evaluator_name', None)

     # 3) Delegate: extractor loads docs and builds pairs
     # HuggingFace extractors don't need lm_eval_task - they load data directly from HuggingFace
     if isinstance(extractor, HuggingFaceBenchmarkExtractor):
-
+        pairs = extractor.extract_contrastive_pairs(limit=max_items)
     else:
-
+        pairs = extractor.extract_contrastive_pairs(lm_eval_task, limit=max_items)
+
+    return _add_evaluator_to_pairs(pairs, evaluator_name, task_name)

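A usage sketch for the unified entry point added above; the module path comes from the file list at the top of this diff, and the behavior follows the function's own docstring:

    from wisent.core.contrastive_pairs.lm_eval_pairs.lm_task_pairs_generation import (
        build_contrastive_pairs,
    )

    # Single lm-eval task: resolves the extractor, loads the task,
    # and returns at most 50 pairs.
    pairs = build_contrastive_pairs("winogrande", limit=50)

    # Group task: nested subtasks are flattened, shuffled, and the limit is
    # spread across the leaves (at least one pair per subtask).
    mmlu_pairs = build_contrastive_pairs("mmlu", limit=100)

    for p in mmlu_pairs[:3]:
        # Each pair now records its provenance in metadata.
        print(p.metadata["source_task"], p.metadata["evaluator_name"])
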
wisent/core/data_loaders/loaders/lm_loader.py

@@ -10,6 +10,10 @@ os.environ['TF_NUM_INTEROP_THREADS'] = '1'
 os.environ['TF_NUM_INTRAOP_THREADS'] = '1'
 os.environ['OMP_NUM_THREADS'] = '1'

+# Allow code evaluation for code-related tasks (humaneval, etc.)
+# Required by HuggingFace evaluate library for code_eval metric
+os.environ['HF_ALLOW_CODE_EVAL'] = '1'
+
 # Enable trust_remote_code for all datasets (required for meddialog and others)
 # This uses lm-eval's recommended approach from PR #1998
 import datasets.config

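Context for the new HF_ALLOW_CODE_EVAL line: the HuggingFace evaluate library's code_eval metric executes model-generated code and refuses to run unless this opt-in flag is set. A standalone illustration of plain evaluate usage, not wisent code; the remaining lm_loader.py hunks continue below:

    import os
    os.environ["HF_ALLOW_CODE_EVAL"] = "1"  # must be set before compute(), else code_eval raises

    import evaluate

    code_eval = evaluate.load("code_eval")
    pass_at_k, _details = code_eval.compute(
        references=["assert add(2, 3) == 5"],                # test code, one per problem
        predictions=[["def add(a, b):\n    return a + b"]],  # candidate list per problem
        k=[1],
    )
    print(pass_at_k["pass@1"])  # 1.0 when the single candidate passes its test
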
@@ -294,6 +298,8 @@ class LMEvalDataLoader(BaseDataLoader):
             "tinytruthfulqa": "tinyTruthfulQA",
             "tinywinogrande": "tinyWinogrande",
             "paws-x": "pawsx",
+            # afrobench subtasks
+            "afrobench_adr": "adr",
         }

         # Use mapped name if available, otherwise use original

@@ -302,7 +308,9 @@ class LMEvalDataLoader(BaseDataLoader):
         log.info(f"Mapping task '{task_name}' to lm-eval task '{lm_eval_task_name}'")

         # Tasks that require case-sensitive names (don't lowercase these)
-
+        # AraDiCE tasks have mixed case (e.g., AraDiCE_ArabicMMLU_lev)
+        # aexams tasks have mixed case (e.g., aexams_IslamicStudies)
+        case_sensitive_prefixes = {"tinyBenchmarks", "AraDiCE", "aexams_"}

         # Normalize task name to lowercase for lm-eval-harness compatibility
         # Many lm-eval tasks use lowercase names (e.g., "aradice" not "AraDICE")

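The normalization code that consumes this prefix set sits outside the hunk; a plausible sketch of how such a set is typically applied (hypothetical helper name, for illustration only):

    case_sensitive_prefixes = {"tinyBenchmarks", "AraDiCE", "aexams_"}

    def normalize_task_name(name: str) -> str:
        # Preserve mixed-case registry names (e.g., AraDiCE_ArabicMMLU_lev,
        # aexams_IslamicStudies); lowercase everything else for lm-eval lookup.
        if any(name.startswith(prefix) for prefix in case_sensitive_prefixes):
            return name
        return name.lower()

    assert normalize_task_name("AraDiCE_ArabicMMLU_lev") == "AraDiCE_ArabicMMLU_lev"
    assert normalize_task_name("Winogrande") == "winogrande"
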
@@ -379,6 +387,9 @@ class LMEvalDataLoader(BaseDataLoader):
             "noreval": ["ask_gec_p0", "ask_gec_p1", "ask_gec_p2", "ask_gec_p3", "ask_gec_p4", "ncb", "norbelebele_p0", "norbelebele_p1", "norbelebele_p2", "norbelebele_p3", "norbelebele_p4", "norcommonsenseqa_nno_p0", "norcommonsenseqa_nno_p1", "norcommonsenseqa_nno_p2", "norcommonsenseqa_nno_p3", "norcommonsenseqa_nno_p4", "norcommonsenseqa_nob_p0", "norcommonsenseqa_nob_p1", "norcommonsenseqa_nob_p2", "norcommonsenseqa_nob_p3", "norcommonsenseqa_nob_p4", "norec_document_p0", "norec_document_p1", "norec_document_p2", "norec_document_p3", "norec_document_p4", "norec_sentence_p0", "norec_sentence_p1", "norec_sentence_p2", "norec_sentence_p3", "norec_sentence_p4", "noridiom_nno_p0", "noridiom_nno_p1", "noridiom_nno_p2", "noridiom_nno_p3", "noridiom_nno_p4", "noridiom_nob_p0", "noridiom_nob_p1", "noridiom_nob_p2", "noridiom_nob_p3", "noridiom_nob_p4", "noropenbookqa_nno_p0", "noropenbookqa_nno_p1", "noropenbookqa_nno_p2", "noropenbookqa_nno_p3", "noropenbookqa_nno_p4", "noropenbookqa_nob_p0", "noropenbookqa_nob_p1", "noropenbookqa_nob_p2", "noropenbookqa_nob_p3", "noropenbookqa_nob_p4", "norquad_p0", "norquad_p1", "norquad_p2", "norquad_p3", "norquad_p4", "norrewrite_instruct", "norsumm_nno_p0", "norsumm_nno_p1", "norsumm_nno_p2", "norsumm_nno_p3", "norsumm_nno_p4", "norsumm_nno_p5", "norsumm_nob_p0", "norsumm_nob_p1", "norsumm_nob_p2", "norsumm_nob_p3", "norsumm_nob_p4", "norsumm_nob_p5", "norsummarize_instruct", "nortruthfulqa_gen_nno_p0", "nortruthfulqa_gen_nno_p1", "nortruthfulqa_gen_nno_p2", "nortruthfulqa_gen_nno_p3", "nortruthfulqa_gen_nno_p4", "nortruthfulqa_gen_nob_p0", "nortruthfulqa_gen_nob_p1", "nortruthfulqa_gen_nob_p2", "nortruthfulqa_gen_nob_p3", "nortruthfulqa_gen_nob_p4", "nortruthfulqa_mc_nno_p0", "nortruthfulqa_mc_nno_p1", "nortruthfulqa_mc_nno_p2", "nortruthfulqa_mc_nno_p3", "nortruthfulqa_mc_nno_p4", "nortruthfulqa_mc_nob_p0", "nortruthfulqa_mc_nob_p1", "nortruthfulqa_mc_nob_p2", "nortruthfulqa_mc_nob_p3", "nortruthfulqa_mc_nob_p4", "nrk_quiz_qa_nno_p0", "nrk_quiz_qa_nno_p1", "nrk_quiz_qa_nno_p2", "nrk_quiz_qa_nno_p3", "nrk_quiz_qa_nno_p4", "nrk_quiz_qa_nob_p0", "nrk_quiz_qa_nob_p1", "nrk_quiz_qa_nob_p2", "nrk_quiz_qa_nob_p3", "nrk_quiz_qa_nob_p4", "tatoeba_eng_nno_p0", "tatoeba_eng_nno_p1", "tatoeba_eng_nno_p2", "tatoeba_eng_nno_p3", "tatoeba_eng_nob_p0", "tatoeba_eng_nob_p1", "tatoeba_eng_nob_p2", "tatoeba_eng_nob_p3", "tatoeba_nno_eng_p0", "tatoeba_nno_eng_p1", "tatoeba_nno_eng_p2", "tatoeba_nno_eng_p3", "tatoeba_nob_eng_p0", "tatoeba_nob_eng_p1", "tatoeba_nob_eng_p2", "tatoeba_nob_eng_p3"],
             "storycloze": ["xstorycloze_en"],
             "instructhumaneval": ["humaneval_instruct"],
+            # African language benchmarks
+            "afrimgsm": ["afrimgsm_amh_prompt_1", "afrimgsm_eng_prompt_1", "afrimgsm_fra_prompt_1", "afrimgsm_hau_prompt_1", "afrimgsm_ibo_prompt_1", "afrimgsm_kin_prompt_1", "afrimgsm_swa_prompt_1", "afrimgsm_yor_prompt_1"],
+            "afrimmlu": ["afrimmlu_direct_amh_prompt_1", "afrimmlu_direct_eng_prompt_1", "afrimmlu_direct_fra_prompt_1", "afrimmlu_direct_hau_prompt_1", "afrimmlu_direct_ibo_prompt_1", "afrimmlu_direct_kin_prompt_1", "afrimmlu_direct_swa_prompt_1", "afrimmlu_direct_yor_prompt_1"],
         }

         # Check if task is explicitly disabled