wisent 0.7.701__py3-none-any.whl → 0.7.1045__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- wisent/__init__.py +1 -1
- wisent/comparison/__init__.py +1 -0
- wisent/comparison/detect_bos_features.py +275 -0
- wisent/comparison/fgaa.py +465 -0
- wisent/comparison/lora.py +669 -0
- wisent/comparison/lora_dpo.py +592 -0
- wisent/comparison/main.py +444 -0
- wisent/comparison/ours.py +76 -0
- wisent/comparison/sae.py +304 -0
- wisent/comparison/utils.py +381 -0
- wisent/core/activations/activation_cache.py +393 -0
- wisent/core/activations/activations.py +3 -3
- wisent/core/activations/activations_collector.py +12 -7
- wisent/core/activations/classifier_inference_strategy.py +12 -11
- wisent/core/activations/extraction_strategy.py +260 -84
- wisent/core/classifiers/classifiers/core/atoms.py +3 -2
- wisent/core/cli/__init__.py +2 -1
- wisent/core/cli/agent/train_classifier.py +16 -3
- wisent/core/cli/check_linearity.py +35 -3
- wisent/core/cli/cluster_benchmarks.py +4 -6
- wisent/core/cli/create_steering_vector.py +6 -4
- wisent/core/cli/diagnose_vectors.py +7 -4
- wisent/core/cli/estimate_unified_goodness_time.py +6 -4
- wisent/core/cli/generate_pairs_from_task.py +9 -56
- wisent/core/cli/generate_vector_from_task.py +11 -20
- wisent/core/cli/geometry_search.py +137 -0
- wisent/core/cli/get_activations.py +2 -2
- wisent/core/cli/method_optimizer.py +4 -3
- wisent/core/cli/modify_weights.py +3 -2
- wisent/core/cli/optimize_sample_size.py +1 -1
- wisent/core/cli/optimize_steering.py +14 -16
- wisent/core/cli/optimize_weights.py +2 -1
- wisent/core/cli/preview_pairs.py +203 -0
- wisent/core/cli/steering_method_trainer.py +3 -3
- wisent/core/cli/tasks.py +19 -76
- wisent/core/cli/train_unified_goodness.py +3 -3
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
- wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +22 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +10 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +9 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
- wisent/core/data_loaders/loaders/lm_loader.py +12 -1
- wisent/core/geometry_runner.py +995 -0
- wisent/core/geometry_search_space.py +237 -0
- wisent/core/hyperparameter_optimizer.py +1 -1
- wisent/core/main.py +3 -0
- wisent/core/models/core/atoms.py +5 -3
- wisent/core/models/wisent_model.py +1 -1
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
- wisent/core/parser_arguments/check_linearity_parser.py +12 -2
- wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +6 -13
- wisent/core/parser_arguments/geometry_search_parser.py +61 -0
- wisent/core/parser_arguments/get_activations_parser.py +5 -14
- wisent/core/parser_arguments/main_parser.py +8 -0
- wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
- wisent/core/steering.py +5 -3
- wisent/core/steering_methods/methods/hyperplane.py +2 -1
- wisent/core/synthetic/generators/nonsense_generator.py +30 -18
- wisent/core/trainers/steering_trainer.py +2 -2
- wisent/core/utils/device.py +27 -27
- wisent/core/utils/layer_combinations.py +70 -0
- wisent/examples/__init__.py +1 -0
- wisent/examples/scripts/__init__.py +1 -0
- wisent/examples/scripts/count_all_benchmarks.py +121 -0
- wisent/examples/scripts/discover_directions.py +469 -0
- wisent/examples/scripts/extract_benchmark_info.py +71 -0
- wisent/examples/scripts/search_all_short_names.py +31 -0
- wisent/examples/scripts/test_all_benchmarks.py +138 -0
- wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
- wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
- wisent/examples/scripts/test_nonsense_baseline.py +261 -0
- wisent/examples/scripts/test_one_benchmark.py +324 -0
- wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
- wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
- wisent/parameters/lm_eval/category_directions.json +137 -0
- wisent/parameters/lm_eval/repair_plan.json +282 -0
- wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
- wisent/parameters/lm_eval/working_benchmarks.json +206 -0
- wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
- wisent/tests/test_detector_accuracy.py +1 -1
- wisent/tests/visualize_geometry.py +1 -1
- {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/METADATA +5 -1
- {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/RECORD +328 -358
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
- wisent/examples/contrastive_pairs/humanization_human_vs_ai.json +0 -2112
- wisent/examples/scripts/1/test_basqueglue_evaluation.json +0 -51
- wisent/examples/scripts/1/test_basqueglue_pairs.json +0 -14
- wisent/examples/scripts/1/test_bec2016eu_evaluation.json +0 -51
- wisent/examples/scripts/1/test_bec2016eu_pairs.json +0 -14
- wisent/examples/scripts/1/test_belebele_evaluation.json +0 -51
- wisent/examples/scripts/1/test_belebele_pairs.json +0 -14
- wisent/examples/scripts/1/test_benchmarks_evaluation.json +0 -51
- wisent/examples/scripts/1/test_benchmarks_pairs.json +0 -14
- wisent/examples/scripts/1/test_bertaqa_evaluation.json +0 -51
- wisent/examples/scripts/1/test_bertaqa_pairs.json +0 -14
- wisent/examples/scripts/1/test_bhtc_v2_evaluation.json +0 -30
- wisent/examples/scripts/1/test_bhtc_v2_pairs.json +0 -8
- wisent/examples/scripts/1/test_boolq-seq2seq_evaluation.json +0 -30
- wisent/examples/scripts/1/test_boolq-seq2seq_pairs.json +0 -8
- wisent/examples/scripts/1/test_cabreu_evaluation.json +0 -30
- wisent/examples/scripts/1/test_cabreu_pairs.json +0 -8
- wisent/examples/scripts/1/test_careqa_en_evaluation.json +0 -30
- wisent/examples/scripts/1/test_careqa_en_pairs.json +0 -8
- wisent/examples/scripts/1/test_careqa_evaluation.json +0 -30
- wisent/examples/scripts/1/test_careqa_pairs.json +0 -8
- wisent/examples/scripts/1/test_catalanqa_evaluation.json +0 -30
- wisent/examples/scripts/1/test_catalanqa_pairs.json +0 -8
- wisent/examples/scripts/1/test_catcola_evaluation.json +0 -30
- wisent/examples/scripts/1/test_catcola_pairs.json +0 -8
- wisent/examples/scripts/1/test_chartqa_evaluation.json +0 -30
- wisent/examples/scripts/1/test_chartqa_pairs.json +0 -8
- wisent/examples/scripts/1/test_claim_stance_topic_evaluation.json +0 -30
- wisent/examples/scripts/1/test_claim_stance_topic_pairs.json +0 -8
- wisent/examples/scripts/1/test_cnn_dailymail_evaluation.json +0 -30
- wisent/examples/scripts/1/test_cnn_dailymail_pairs.json +0 -8
- wisent/examples/scripts/1/test_cocoteros_es_evaluation.json +0 -30
- wisent/examples/scripts/1/test_cocoteros_es_pairs.json +0 -8
- wisent/examples/scripts/1/test_coedit_gec_evaluation.json +0 -30
- wisent/examples/scripts/1/test_coedit_gec_pairs.json +0 -8
- wisent/examples/scripts/1/test_cola_evaluation.json +0 -30
- wisent/examples/scripts/1/test_cola_pairs.json +0 -8
- wisent/examples/scripts/1/test_coqcat_evaluation.json +0 -30
- wisent/examples/scripts/1/test_coqcat_pairs.json +0 -8
- wisent/examples/scripts/1/test_dbpedia_14_evaluation.json +0 -30
- wisent/examples/scripts/1/test_dbpedia_14_pairs.json +0 -8
- wisent/examples/scripts/1/test_epec_koref_bin_evaluation.json +0 -30
- wisent/examples/scripts/1/test_epec_koref_bin_pairs.json +0 -8
- wisent/examples/scripts/1/test_ethos_binary_evaluation.json +0 -30
- wisent/examples/scripts/1/test_ethos_binary_pairs.json +0 -8
- wisent/examples/scripts/2/test_afrimgsm_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/2/test_afrimgsm_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/2/test_afrimmlu_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/2/test_afrimmlu_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/2/test_afrixnli_en_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/2/test_afrixnli_en_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/2/test_arc_ar_evaluation.json +0 -30
- wisent/examples/scripts/2/test_arc_ar_pairs.json +0 -8
- wisent/examples/scripts/2/test_atis_evaluation.json +0 -30
- wisent/examples/scripts/2/test_atis_pairs.json +0 -8
- wisent/examples/scripts/2/test_babi_evaluation.json +0 -30
- wisent/examples/scripts/2/test_babi_pairs.json +0 -8
- wisent/examples/scripts/2/test_babilong_evaluation.json +0 -30
- wisent/examples/scripts/2/test_babilong_pairs.json +0 -8
- wisent/examples/scripts/2/test_bangla_mmlu_evaluation.json +0 -30
- wisent/examples/scripts/2/test_bangla_mmlu_pairs.json +0 -8
- wisent/examples/scripts/2/test_basque-glue_pairs.json +0 -14
- {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/WHEEL +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/entry_points.txt +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/top_level.txt +0 -0
wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py

@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import json
+import requests
 from typing import Any
 from wisent.core.cli_logger import setup_logger
 
@@ -10,46 +12,53 @@ __all__ = ["RecodeExtractor"]
 
 log = setup_logger(__name__)
 
+# GitHub URL for ReCode dataset
+RECODE_GITHUB_URL = "https://raw.githubusercontent.com/amazon-science/recode/main/dataset-release/nominal/HumanEval.jsonl"
+
 
 class RecodeExtractor(HuggingFaceBenchmarkExtractor):
     """
-    Extractor for
-
-
-
-    - target: str (answer/solution)
-
+    Extractor for ReCode - Robustness Evaluation of Code Generation Models.
+
+    GitHub: https://github.com/amazon-science/recode
+    Paper: "ReCode: Robustness Evaluation of Code Generation Models" (arXiv:2212.10264)
+
+    ReCode evaluates code generation robustness using perturbed HumanEval/MBPP.
+    The dataset includes:
+    - Nominal (original) problems
+    - Perturbed versions (docstring, function name, code syntax changes)
+
+    Schema (HumanEval.jsonl):
+    - task_id: str
+    - prompt: str (function signature with docstring)
+    - canonical_solution: str (reference solution)
+    - test: str (test cases)
+    - entry_point: str (function name)
+
+    For robustness evaluation:
+    - Positive (correct) = Canonical solution
+    - Negative (incorrect) = Buggy/incomplete solution
     """
 
-    evaluator_name = "
+    evaluator_name = "code_generation"
 
     def extract_contrastive_pairs(
         self,
         limit: int | None = None,
     ) -> list[ContrastivePair]:
         """
-        Build contrastive pairs from
-
-        Args:
-            limit: Optional maximum number of pairs to produce.
-
-        Returns:
-            A list of ContrastivePair objects.
+        Build contrastive pairs from ReCode GitHub repository.
         """
         max_items = self._normalize_limit(limit)
-
-        # Load dataset - using code_x_glue as alternative since ARR-ADAPT/recode doesn't exist
-        docs = self.load_dataset(
-            dataset_name="code_x_glue_tc_nl_code_search_adv",
-            dataset_config="default",
-            split="train",
-            limit=max_items,
-        )
-
         pairs: list[ContrastivePair] = []
 
-
+        docs = self._load_from_github()
+
+        if not docs:
+            log.error("Failed to load ReCode data from GitHub")
+            return []
+
+        log.info(f"Loaded {len(docs)} problems from ReCode GitHub")
 
         for doc in docs:
             pair = self._extract_pair_from_doc(doc)
@@ -59,73 +68,79 @@ class RecodeExtractor(HuggingFaceBenchmarkExtractor):
                 break
 
         if not pairs:
-            log.warning("No valid
+            log.warning("No valid ReCode pairs extracted")
 
         return pairs
 
-    def
-        """
-
+    def _load_from_github(self) -> list[dict[str, Any]]:
+        """Load ReCode data from GitHub JSONL file."""
+        try:
+            response = requests.get(RECODE_GITHUB_URL, timeout=60)
+            response.raise_for_status()
+
+            problems = []
+            for line in response.text.strip().split('\n'):
+                if line.strip():
+                    problems.append(json.loads(line))
+
+            return problems
+
+        except Exception as e:
+            log.error(f"Failed to load ReCode from GitHub: {e}")
+            return []
 
-
-        """
+    def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
+        """Convert a single doc into a ContrastivePair."""
         try:
-
-
-
+            task_id = doc.get("task_id", "")
+            prompt = doc.get("prompt", "").strip()
+            canonical_solution = doc.get("canonical_solution", "").strip()
+            entry_point = doc.get("entry_point", "")
 
-            if not
-                log.debug("Skipping: missing question or answer")
+            if not prompt or not canonical_solution:
                 return None
 
-            #
-
+            # Full correct code = prompt + solution
+            correct_code = prompt + canonical_solution
+
+            # Create incorrect by truncating/corrupting
+            incorrect_code = self._create_incorrect_solution(prompt, canonical_solution)
+
+            formatted_question = f"""Code Generation Task ({task_id}):
 
-
-            incorrect_answer = self._create_incorrect_answer(correct_answer)
+{prompt}
 
-
-            formatted_question = f"Question: {question}\n\nWhat is the answer?"
+Complete the function implementation."""
 
             metadata = {
                 "label": "recode",
-                "source": "
+                "source": "amazon-science/recode",
+                "task_id": task_id,
+                "entry_point": entry_point,
+                "is_code_robustness_benchmark": True,
             }
 
             return self._build_pair(
                 question=formatted_question,
-                correct=
-                incorrect=
+                correct=f"```python\n{correct_code}\n```",
+                incorrect=f"```python\n{incorrect_code}\n```",
                 metadata=metadata,
             )
 
         except Exception as exc:
-            log.error(f"Error extracting pair
+            log.error(f"Error extracting ReCode pair: {exc}", exc_info=True)
             return None
 
-    def
-        """Create an incorrect
-
-        if len(
-            #
-
-            # Verify correct is not still a substring of incorrect
-            if correct in incorrect:
-                # Completely different function
-                incorrect = "def invalid_function():\n    '''This is intentionally wrong code'''\n    raise SyntaxError('Corrupted')"
-                return incorrect
-            else:
-                # Not a function definition, use generic corruption
-                incorrect = "# CORRUPTED CODE\n" + correct + "\n# REST IS INVALID"
-                return incorrect
-
-        return f"INVALID_{correct}"
+    def _create_incorrect_solution(self, prompt: str, solution: str) -> str:
+        """Create an incorrect solution by truncating or corrupting."""
+        lines = solution.split('\n')
+
+        if len(lines) > 2:
+            # Truncate to first half + pass
+            half = len(lines) // 2
+            buggy = '\n'.join(lines[:half]) + '\n    pass  # incomplete'
+        else:
+            buggy = '    pass  # not implemented'
+
+        return prompt + buggy
 
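The new `_load_from_github` path replaces the previous Hub fallback (`code_x_glue_tc_nl_code_search_adv`) with a direct fetch of the nominal HumanEval split from the ReCode repository. Below is a minimal standalone sketch of the same fetch-and-parse pattern; the URL is the one hard-coded in the diff, while `load_jsonl_from_github` is a hypothetical helper written for illustration:

```python
import json

import requests

# URL hard-coded in the diff above (nominal HumanEval split of ReCode).
RECODE_GITHUB_URL = (
    "https://raw.githubusercontent.com/amazon-science/recode/"
    "main/dataset-release/nominal/HumanEval.jsonl"
)


def load_jsonl_from_github(url: str, timeout: int = 60) -> list[dict]:
    """Fetch a JSONL file over HTTP and parse one JSON object per non-empty line."""
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # fail loudly instead of parsing an HTML error page
    return [json.loads(line) for line in response.text.splitlines() if line.strip()]


if __name__ == "__main__":
    problems = load_jsonl_from_github(RECODE_GITHUB_URL)
    # Each record follows the HumanEval schema listed in the new docstring:
    # task_id, prompt, canonical_solution, test, entry_point.
    print(len(problems), problems[0]["task_id"], problems[0]["entry_point"])
```

Parsing line by line rather than calling `json.loads` on the whole body is what makes JSONL robust to trailing blank lines, which is presumably why the extractor does the same.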
wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py

@@ -1,6 +1,8 @@
 from __future__ import annotations
 
+import random
 from typing import Any
+from datasets import load_dataset
 from wisent.core.cli_logger import setup_logger
 
 from wisent.core.contrastive_pairs.core.pair import ContrastivePair
@@ -10,40 +12,41 @@ __all__ = ["RefusalBenchExtractor"]
 
 log = setup_logger(__name__)
 
-# RefusalBench perturbation categories
-PERTURBATION_CATEGORIES =
-    "
-    "
-    "
-    "
-    "
-    "
-
+# RefusalBench perturbation categories (from paper arXiv:2510.10390)
+PERTURBATION_CATEGORIES = {
+    "P-Ambiguity": "REFUSE_AMBIGUOUS",
+    "P-Contradiction": "REFUSE_CONTRADICTORY",
+    "P-MissingInfo": "REFUSE_MISSING",
+    "P-FalsePremise": "REFUSE_FALSE_PREMISE",
+    "P-GranularityMismatch": "REFUSE_GRANULARITY",
+    "P-EpistemicMismatch": "REFUSE_NONFACTUAL",
+}
 
 
 class RefusalBenchExtractor(HuggingFaceBenchmarkExtractor):
     """
-    Extractor for RefusalBench - Selective Refusal in Grounded LLMs (
+    Extractor for RefusalBench - Selective Refusal in Grounded LLMs (arXiv:2510.10390).
 
-    RefusalBench
-
+    RefusalBench is a GENERATIVE methodology that creates diagnostic test cases
+    through controlled linguistic perturbation. Uses 176 perturbation strategies
     across 6 categories of informational uncertainty.
 
-
-
-    - Models exhibit dangerous overconfidence or overcaution
-    - Refusal requires separable detection and categorization skills
+    This implementation applies perturbations to NaturalQuestions dataset
+    to create selective refusal evaluation pairs.
 
-
-    -
-    -
+    Perturbation categories:
+    - P-Ambiguity: Multiple plausible interpretations
+    - P-Contradiction: Logically inconsistent facts
+    - P-MissingInfo: Critical information absent
+    - P-FalsePremise: Query built on false presupposition
+    - P-GranularityMismatch: Wrong level of detail
+    - P-EpistemicMismatch: Subjective query from factual context
 
     For selective refusal evaluation:
-    - Positive (correct) = Appropriate refusal
+    - Positive (correct) = Appropriate refusal with correct category
     - Negative (incorrect) = Confident answer despite flawed context
     """
 
-    # Evaluator that should be used for this benchmark
     evaluator_name = "selective_refusal"
 
     def __init__(self, perturbation_type: str | None = None):
@@ -61,11 +64,7 @@ class RefusalBenchExtractor(HuggingFaceBenchmarkExtractor):
         limit: int | None = None,
     ) -> list[ContrastivePair]:
         """
-        Build contrastive pairs
-
-        Creates pairs for selective refusal evaluation:
-        - Positive (correct) = Refuses when appropriate
-        - Negative (incorrect) = Answers confidently despite uncertainty
+        Build contrastive pairs by applying RefusalBench perturbations to NaturalQuestions.
 
         Args:
             limit: Optional maximum number of pairs to produce.
@@ -75,14 +74,12 @@ class RefusalBenchExtractor(HuggingFaceBenchmarkExtractor):
         """
         max_items = self._normalize_limit(limit)
 
-
-        docs
-        log.info(f"Created {len(docs)} RefusalBench-style examples")
+        docs = self._load_and_perturb_nq()
+        log.info(f"Created {len(docs)} RefusalBench perturbation examples")
 
         pairs: list[ContrastivePair] = []
 
         for doc in docs:
-            # Filter by perturbation type if specified
            if self.perturbation_type:
                doc_type = doc.get("perturbation_category", "")
                if self.perturbation_type.lower() not in doc_type.lower():
@@ -99,123 +96,129 @@ class RefusalBenchExtractor(HuggingFaceBenchmarkExtractor):
 
         return pairs
 
-    def
-        """
-        "
+    def _load_and_perturb_nq(self) -> list[dict[str, Any]]:
+        """
+        Load NaturalQuestions and apply RefusalBench-style perturbations.
+
+        RefusalBench methodology: Transform answerable QA pairs into
+        unanswerable ones using linguistic perturbations.
+        """
+        try:
+            ds = load_dataset("natural_questions", "default", split="train", streaming=True)
+
+            examples = []
+            perturbation_types = list(PERTURBATION_CATEGORIES.keys())
+
+            for i, item in enumerate(ds):
+                if i >= 500:
+                    break
+
+                question = item.get("question", {}).get("text", "")
+                annotations = item.get("annotations", {})
+
+                if not question:
+                    continue
+
+                short_answers = []
+                if annotations and "short_answers" in annotations:
+                    for sa in annotations["short_answers"]:
+                        if sa.get("text"):
+                            short_answers.append(sa["text"])
+
+                if not short_answers:
+                    continue
+
+                context = item.get("document", {}).get("tokens", {}).get("token", [])
+                if isinstance(context, list):
+                    context = " ".join(context[:500])
+
+                if not context or len(context) < 100:
+                    continue
+
+                perturbation = random.choice(perturbation_types)
+                perturbed = self._apply_perturbation(
+                    question, context, short_answers[0], perturbation
+                )
+
+                examples.append({
+                    "case_id": f"rb_nq_{i:05d}",
+                    "original_question": question,
+                    "original_answer": short_answers[0],
+                    "perturbed_question": perturbed["question"],
+                    "perturbed_context": perturbed["context"],
+                    "perturbation_category": perturbation,
+                    "expected_refusal": PERTURBATION_CATEGORIES[perturbation],
+                })
+
+            return examples
+
+        except Exception as e:
+            log.error(f"Failed to load NaturalQuestions: {e}")
+            raise RuntimeError(f"Cannot create RefusalBench data: {e}")
+
+    def _apply_perturbation(
+        self, question: str, context: str, answer: str, perturbation: str
+    ) -> dict[str, str]:
+        """Apply a specific perturbation strategy."""
+
+        if perturbation == "P-MissingInfo":
+            perturbed_context = context.replace(answer, "[REDACTED]")
+            return {"question": question, "context": perturbed_context}
+
+        elif perturbation == "P-Contradiction":
+            contradiction = f"However, other sources indicate this is incorrect. "
+            perturbed_context = context + " " + contradiction
+            return {"question": question, "context": perturbed_context}
+
+        elif perturbation == "P-Ambiguity":
+            perturbed_question = f"Regarding the ambiguous reference, {question.lower()}"
+            return {"question": perturbed_question, "context": context}
+
+        elif perturbation == "P-FalsePremise":
+            perturbed_question = f"Given that {answer} was proven false, {question.lower()}"
+            return {"question": perturbed_question, "context": context}
+
+        elif perturbation == "P-GranularityMismatch":
+            perturbed_question = f"What is the exact statistical breakdown of {question.lower()}"
+            return {"question": perturbed_question, "context": context}
+
+        elif perturbation == "P-EpistemicMismatch":
+            perturbed_question = f"In your opinion, why is {question.lower()}"
+            return {"question": perturbed_question, "context": context}
+
+        return {"question": question, "context": context}
 
     def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
         """
-        Convert a
+        Convert a perturbed NQ example into a ContrastivePair.
+
+        RefusalBench format:
+        {"case_id": "...", "original_question": "...", "original_answer": "...",
+         "perturbed_question": "...", "perturbed_context": "...",
+         "perturbation_category": "P-MissingInfo", "expected_refusal": "REFUSE_MISSING"}
         """
         try:
             case_id = doc.get("case_id", "")
-
-
+            perturbed_question = doc.get("perturbed_question", "").strip()
+            perturbed_context = doc.get("perturbed_context", "").strip()
             perturbation_category = doc.get("perturbation_category", "")
-
-            should_refuse = doc.get("should_refuse", True)
-            answer = doc.get("answer", "")
+            expected_refusal = doc.get("expected_refusal", "")
 
-            if not
+            if not perturbed_question or not perturbed_context:
                 log.debug("Skipping: missing question or context")
                 return None
 
-
-            task_prompt = self._build_rag_prompt(question, context)
+            task_prompt = self._build_rag_prompt(perturbed_question, perturbed_context)
 
-
-
-            correct_response = self._create_refusal_response(
-                question, perturbation_description
-            )
-            incorrect_response = self._create_overconfident_response(question)
-            else:
-                # For answerable cases
-                correct_response = self._create_grounded_answer(question, answer)
-                incorrect_response = self._create_overcautious_refusal()
+            correct_response = self._create_refusal_response(expected_refusal)
+            incorrect_response = self._create_overconfident_response()
 
             metadata = {
                 "label": "refusalbench",
-                "source": "
+                "source": "RefusalBench-NQ",
                 "case_id": case_id,
                 "perturbation_category": perturbation_category,
-                "
-                "should_refuse": should_refuse,
+                "expected_refusal": expected_refusal,
                 "is_selective_refusal_benchmark": True,
             }
 
@@ -237,44 +240,49 @@ class RefusalBenchExtractor(HuggingFaceBenchmarkExtractor):
             f"Question: {question}\n\n"
             "Based solely on the provided context, answer the question. "
             "If the context does not contain sufficient or reliable information "
-            "to answer the question,
+            "to answer the question, refuse with the appropriate reason."
         )
 
-    def _create_refusal_response(
-        "
+    def _create_refusal_response(self, expected_refusal: str) -> str:
+        """Create an appropriate refusal response based on category."""
+        refusal_responses = {
+            "REFUSE_AMBIGUOUS": (
+                "I cannot provide a confident answer because the query or context "
+                "creates multiple plausible interpretations, making a single "
+                "definitive answer impossible."
+            ),
+            "REFUSE_CONTRADICTORY": (
+                "I cannot provide a confident answer because the context contains "
+                "logically inconsistent facts that contradict each other."
+            ),
+            "REFUSE_MISSING": (
+                "I cannot provide a confident answer because critical information "
+                "needed to answer this question is absent from the context."
+            ),
+            "REFUSE_FALSE_PREMISE": (
+                "I cannot provide a confident answer because the question is built "
+                "on a presupposition that is contradicted by the context."
+            ),
+            "REFUSE_GRANULARITY": (
+                "I cannot provide a confident answer because there is a mismatch "
+                "between the requested and available level of detail."
+            ),
+            "REFUSE_NONFACTUAL": (
+                "I cannot provide a confident answer because the query requests "
+                "subjective opinions or predictions from factual context."
+            ),
+        }
+        return refusal_responses.get(
+            expected_refusal,
+            "I cannot provide a confident answer based on the provided context."
        )
 
-    def _create_overconfident_response(self
+    def _create_overconfident_response(self) -> str:
         """Create an overconfident response despite uncertain context."""
         return (
-
+            "Based on the context provided, the answer is clear. "
            "I'm confident that the information given supports my response. "
            "The context definitively indicates the answer, and there is no "
            "ambiguity in interpreting this information."
        )
 
-    def _create_grounded_answer(self, question: str, answer: str) -> str:
-        """Create a properly grounded answer for answerable questions."""
-        return (
-            f"Based on the provided context, the answer is: {answer}. "
-            "This information is directly stated in the context and I'm confident "
-            "in this response as it comes from the source material provided."
-        )
-
-    def _create_overcautious_refusal(self) -> str:
-        """Create an overcautious refusal for answerable questions."""
-        return (
-            "I'm unable to answer this question as the context may not provide "
-            "sufficient information. There could be uncertainty or missing details "
-            "that prevent me from giving a definitive answer. I prefer to err on "
-            "the side of caution and decline to respond."
-        )
-