wisent 0.7.701__py3-none-any.whl → 0.7.901__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wisent/__init__.py +1 -1
- wisent/core/activations/activation_cache.py +393 -0
- wisent/core/activations/activations.py +3 -3
- wisent/core/activations/activations_collector.py +9 -5
- wisent/core/activations/classifier_inference_strategy.py +12 -11
- wisent/core/activations/extraction_strategy.py +256 -84
- wisent/core/classifiers/classifiers/core/atoms.py +3 -2
- wisent/core/cli/__init__.py +2 -1
- wisent/core/cli/agent/apply_steering.py +5 -7
- wisent/core/cli/agent/train_classifier.py +19 -7
- wisent/core/cli/check_linearity.py +35 -3
- wisent/core/cli/cluster_benchmarks.py +4 -6
- wisent/core/cli/create_steering_vector.py +6 -4
- wisent/core/cli/diagnose_vectors.py +7 -4
- wisent/core/cli/estimate_unified_goodness_time.py +6 -4
- wisent/core/cli/generate_pairs_from_task.py +9 -56
- wisent/core/cli/geometry_search.py +137 -0
- wisent/core/cli/get_activations.py +1 -1
- wisent/core/cli/method_optimizer.py +4 -3
- wisent/core/cli/modify_weights.py +3 -2
- wisent/core/cli/optimize_sample_size.py +1 -1
- wisent/core/cli/optimize_steering.py +14 -16
- wisent/core/cli/optimize_weights.py +2 -1
- wisent/core/cli/preview_pairs.py +203 -0
- wisent/core/cli/steering_method_trainer.py +3 -3
- wisent/core/cli/tasks.py +19 -76
- wisent/core/cli/train_unified_goodness.py +3 -3
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
- wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
- wisent/core/data_loaders/loaders/lm_loader.py +12 -1
- wisent/core/geometry_runner.py +995 -0
- wisent/core/geometry_search_space.py +237 -0
- wisent/core/hyperparameter_optimizer.py +1 -1
- wisent/core/main.py +3 -0
- wisent/core/models/core/atoms.py +5 -3
- wisent/core/models/wisent_model.py +1 -1
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
- wisent/core/parser_arguments/check_linearity_parser.py +12 -2
- wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +2 -2
- wisent/core/parser_arguments/geometry_search_parser.py +61 -0
- wisent/core/parser_arguments/main_parser.py +8 -0
- wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
- wisent/core/steering.py +5 -3
- wisent/core/steering_methods/methods/hyperplane.py +2 -1
- wisent/core/synthetic/generators/nonsense_generator.py +30 -18
- wisent/core/trainers/steering_trainer.py +2 -2
- wisent/core/utils/device.py +27 -27
- wisent/core/utils/layer_combinations.py +70 -0
- wisent/examples/__init__.py +1 -0
- wisent/examples/scripts/__init__.py +1 -0
- wisent/examples/scripts/count_all_benchmarks.py +121 -0
- wisent/examples/scripts/discover_directions.py +469 -0
- wisent/examples/scripts/extract_benchmark_info.py +71 -0
- wisent/examples/scripts/generate_paper_data.py +384 -0
- wisent/examples/scripts/intervention_validation.py +626 -0
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
- wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
- wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
- wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
- wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
- wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
- wisent/examples/scripts/search_all_short_names.py +31 -0
- wisent/examples/scripts/test_all_benchmarks.py +138 -0
- wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
- wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
- wisent/examples/scripts/test_nonsense_baseline.py +261 -0
- wisent/examples/scripts/test_one_benchmark.py +324 -0
- wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
- wisent/examples/scripts/threshold_analysis.py +434 -0
- wisent/examples/scripts/visualization_gallery.py +582 -0
- wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
- wisent/parameters/lm_eval/category_directions.json +137 -0
- wisent/parameters/lm_eval/repair_plan.json +282 -0
- wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
- wisent/parameters/lm_eval/working_benchmarks.json +206 -0
- wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
- wisent/tests/test_detector_accuracy.py +1 -1
- wisent/tests/visualize_geometry.py +1 -1
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/RECORD +329 -295
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0

wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py

@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import json
 from typing import Any
 from wisent.core.cli_logger import setup_logger
 
@@ -13,43 +14,45 @@ log = setup_logger(__name__)
 
 class MercuryExtractor(HuggingFaceBenchmarkExtractor):
     """
-    Extractor for
+    Extractor for Mercury - code efficiency benchmark.
 
-
-
-    - cs: str (c# code/answer)
+    Dataset: Elfsong/Mercury
+    Paper: "Mercury: A Code Efficiency Benchmark for LLM Code Synthesis"
 
-
+    Mercury evaluates code efficiency by comparing different solutions
+    to the same problem based on runtime performance.
+
+    Schema:
+    - prompt: str (problem description)
+    - solutions: list[dict] with runtime and solution code
+    - test_cases: str (JSON with test inputs/outputs)
+    - difficulty: str
+
+    For code efficiency evaluation:
+    - Positive (correct) = Fastest solution
+    - Negative (incorrect) = Slowest solution
     """
 
-    evaluator_name = "
+    evaluator_name = "code_efficiency"
 
     def extract_contrastive_pairs(
         self,
         limit: int | None = None,
     ) -> list[ContrastivePair]:
         """
-        Build contrastive pairs from
-
-        Args:
-            limit: Optional maximum number of pairs to produce.
-
-        Returns:
-            A list of ContrastivePair objects.
+        Build contrastive pairs from Mercury examples.
         """
         max_items = self._normalize_limit(limit)
 
-        # Load dataset - using code_x_glue as alternative since tau/code_translation doesn't exist
         docs = self.load_dataset(
-            dataset_name="
-
-            split="train",
+            dataset_name="Elfsong/Mercury",
+            split="eval",
             limit=max_items,
         )
 
         pairs: list[ContrastivePair] = []
 
-        log.info(f"Extracting contrastive pairs from {len(docs)}
+        log.info(f"Extracting contrastive pairs from {len(docs)} Mercury examples")
 
         for doc in docs:
             pair = self._extract_pair_from_doc(doc)
@@ -59,53 +62,73 @@ class MercuryExtractor(HuggingFaceBenchmarkExtractor):
                 break
 
         if not pairs:
-            log.warning("No valid
+            log.warning("No valid Mercury pairs extracted")
 
         return pairs
 
     def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
         """
         Convert a single doc into a ContrastivePair.
-
-
+
+        Uses fastest vs slowest solution as correct vs incorrect.
         """
         try:
-
-
+            prompt = doc.get("prompt", "").strip()
+            solutions = doc.get("solutions", [])
+            difficulty = doc.get("difficulty", "")
+            slug_name = doc.get("slug_name", "")
+            pretty_content = doc.get("pretty_content", [])
 
-            if not
-                log.debug("Skipping: missing question or answer")
+            if not prompt or not solutions or len(solutions) < 2:
                 return None
 
-            #
-
+            # Sort solutions by runtime (fastest first)
+            # Runtime format is like "44ms", "36ms", etc.
+            def parse_runtime(sol):
+                runtime_str = sol.get("runtime", "999ms")
+                try:
+                    return int(runtime_str.replace("ms", ""))
+                except:
+                    return 999
+
+            sorted_solutions = sorted(solutions, key=parse_runtime)
+
+            fastest = sorted_solutions[0]
+            slowest = sorted_solutions[-1]
+
+            fastest_code = fastest.get("solution", "")
+            slowest_code = slowest.get("solution", "")
+
+            if not fastest_code or not slowest_code:
+                return None
+
+            # Use pretty_content if available for problem description
+            problem_desc = pretty_content[0] if pretty_content else prompt
 
-
-            incorrect_answer = self._create_incorrect_answer(correct_answer)
+            formatted_question = f"""Code Efficiency Task:
 
-
-
+{problem_desc}
+
+Write an efficient Python solution."""
 
             metadata = {
                 "label": "mercury",
-                "source": "
+                "source": "Elfsong/Mercury",
+                "slug_name": slug_name,
+                "difficulty": difficulty,
+                "fastest_runtime": fastest.get("runtime", ""),
+                "slowest_runtime": slowest.get("runtime", ""),
+                "is_code_efficiency_benchmark": True,
             }
 
             return self._build_pair(
                 question=formatted_question,
-                correct=
-                incorrect=
+                correct=f"```python\n{fastest_code}\n```",
+                incorrect=f"```python\n{slowest_code}\n```",
                 metadata=metadata,
             )
 
         except Exception as exc:
-            log.error(f"Error extracting pair
+            log.error(f"Error extracting Mercury pair: {exc}", exc_info=True)
             return None
 
-    def _create_incorrect_answer(self, correct: str) -> str:
-        """Create an incorrect answer by modifying the correct one."""
-        # For code, corrupt it slightly
-        if len(correct) > 10:
-            return correct[:len(correct)//2] + "# CORRUPTED" + correct[len(correct)//2:]
-        return f"{correct} # INCORRECT"
-
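
The replacement for the old string-corruption negative is the substantive change here: the incorrect answer is now the slowest measured solution rather than synthetically corrupted code. A standalone sketch of that selection rule, assuming solution dicts shaped like the Elfsong/Mercury rows described in the new docstring (the sample data below is hypothetical, not taken from the dataset):

# Sketch of the fastest-vs-slowest pairing rule added in 0.7.901.
# The rows below are made-up stand-ins for Elfsong/Mercury "solutions" entries.

def parse_runtime_ms(sol: dict) -> int:
    """Parse a runtime string like '44ms'; unparseable values sort last."""
    try:
        return int(sol.get("runtime", "999ms").replace("ms", ""))
    except ValueError:
        return 999

solutions = [
    {"runtime": "80ms", "solution": "def f(xs): return sorted(xs)[0]"},
    {"runtime": "36ms", "solution": "def f(xs): return min(xs)"},
]

ranked = sorted(solutions, key=parse_runtime_ms)
fastest, slowest = ranked[0], ranked[-1]  # positive vs negative side of the pair
assert fastest["runtime"] == "36ms" and slowest["runtime"] == "80ms"

Note that the shipped code uses a bare except in parse_runtime, so any malformed runtime string silently sorts last rather than raising.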
wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py

@@ -111,18 +111,8 @@ class OlympiadBenchExtractor(HuggingFaceBenchmarkExtractor):
             )
             log.info(f"Loaded {len(docs)} examples from OlympiadBench ({self.config})")
         except Exception as e:
-            log.
-
-            try:
-                docs = self.load_dataset(
-                    dataset_name="lmms-lab/OlympiadBench",
-                    split="test",
-                    limit=max_items,
-                )
-                log.info(f"Loaded {len(docs)} examples from lmms-lab/OlympiadBench")
-            except Exception as e2:
-                log.error(f"Failed to load any OlympiadBench: {e2}")
-                return []
+            log.error(f"Failed to load Hothan/OlympiadBench: {e}")
+            return []
 
         pairs: list[ContrastivePair] = []
 
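
The net effect is a simpler failure mode: one attempt against Hothan/OlympiadBench, then an empty result, instead of a second fetch from lmms-lab/OlympiadBench. A minimal sketch of that load-or-bail shape (the load_or_empty helper is a hypothetical stand-in, not part of the wisent API):

# Load-or-bail shape after 0.7.901: no secondary dataset fallback.
from typing import Callable

def load_or_empty(load: Callable[[], list[dict]], name: str) -> list[dict]:
    try:
        return load()
    except Exception as e:  # the extractor logs this via log.error
        print(f"Failed to load {name}: {e}")
        return []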
wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py

@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from typing import Any
+from datasets import load_dataset
 from wisent.core.cli_logger import setup_logger
 
 from wisent.core.contrastive_pairs.core.pair import ContrastivePair
@@ -10,73 +11,61 @@ __all__ = ["PlanBenchExtractor"]
 
 log = setup_logger(__name__)
 
-
-
-    "
-    "
-
-
-
-
-    "plan_generation",  # Generate a valid plan
-    "cost_optimal_planning",  # Generate cost-optimal plan
-    "plan_verification",  # Verify if a plan is valid
-    "goal_recognition",  # Recognize the goal from actions
-    "plan_execution_reasoning",  # Predict outcome of action execution
-    "action_reordering",  # Reorder actions for valid plan
+PLANBENCH_CONFIGS = [
+    "task_1_plan_generation",
+    "task_2_plan_optimality",
+    "task_3_plan_verification",
+    "task_5_plan_generalization",
+    "task_7_plan_execution",
+    "task_8_1_goal_shuffling",
+    "task_8_2_full_to_partial",
 ]
 
 
 class PlanBenchExtractor(HuggingFaceBenchmarkExtractor):
     """
-    Extractor for PlanBench - Planning and Reasoning Benchmark.
+    Extractor for PlanBench - Planning and Reasoning Benchmark (NeurIPS 2023).
 
     PlanBench evaluates LLMs on planning and reasoning about actions
     and change, using domains from the International Planning Competition.
 
-
-    - Blocksworld: Classic blocks stacking problems
-    - Logistics: Package delivery with trucks and planes
-
-    Task Types:
-    - Plan generation and cost-optimal planning
-    - Plan verification
-    - Goal recognition
-    - Plan execution reasoning
-    - Action reordering
+    Dataset: tasksource/planbench (HuggingFace)
 
-
+    Available configs:
+    - task_1_plan_generation: Generate a valid plan
+    - task_2_plan_optimality: Generate cost-optimal plan
+    - task_3_plan_verification: Verify if a plan is valid
+    - task_5_plan_generalization: Generalize plan to new instances
+    - task_7_plan_execution: Predict execution outcome
+    - task_8_1_goal_shuffling: Handle shuffled goals
+    - task_8_2_full_to_partial: Full to partial observability
 
     For planning evaluation:
-    - Positive (correct) = Valid plan
-    - Negative (incorrect) = Invalid
+    - Positive (correct) = Valid plan matching ground truth
+    - Negative (incorrect) = Invalid or wrong plan
     """
 
-    # Evaluator that should be used for this benchmark
     evaluator_name = "planning_reasoning"
 
-    def __init__(self,
+    def __init__(self, config: str = "task_1_plan_generation"):
         """
         Initialize PlanBench extractor.
 
         Args:
-
-            task: Task type (e.g., "plan_generation", "plan_verification")
+            config: PlanBench task config (default: task_1_plan_generation)
         """
         super().__init__()
-
-
+        if config not in PLANBENCH_CONFIGS:
+            log.warning(f"Unknown config '{config}', using task_1_plan_generation")
+            config = "task_1_plan_generation"
+        self.config = config
 
     def extract_contrastive_pairs(
         self,
         limit: int | None = None,
     ) -> list[ContrastivePair]:
         """
-        Build contrastive pairs from PlanBench
-
-        Creates pairs for planning evaluation:
-        - Positive (correct) = Valid planning solution
-        - Negative (incorrect) = Invalid planning solution
+        Build contrastive pairs from PlanBench.
 
         Args:
             limit: Optional maximum number of pairs to produce.
@@ -86,9 +75,8 @@ class PlanBenchExtractor(HuggingFaceBenchmarkExtractor):
         """
         max_items = self._normalize_limit(limit)
 
-
-        docs
-        log.info(f"Created {len(docs)} PlanBench examples ({self.domain}, {self.task})")
+        docs = self._load_planbench_data()
+        log.info(f"Loaded {len(docs)} PlanBench examples (config: {self.config})")
 
         pairs: list[ContrastivePair] = []
 
@@ -104,161 +92,60 @@ class PlanBenchExtractor(HuggingFaceBenchmarkExtractor):
 
         return pairs
 
-    def
-        """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                "valid_plan": [
-                    "1. Unstack B from A",
-                    "2. Put B on C",
-                    "3. Pick up A",
-                    "4. Stack A on B",
-                ],
-                "invalid_plan": [
-                    "1. Pick up A",  # Invalid - B is on A
-                    "2. Put A on B",
-                ],
-            },
-            {
-                "initial_state": "Block A is on Block B. Block B is on the table. Block C is on the table. The robot arm is empty.",
-                "goal_state": "Block B is on Block A. Block C is on Block B.",
-                "valid_plan": [
-                    "1. Unstack A from B",
-                    "2. Put A on the table",
-                    "3. Pick up B",
-                    "4. Stack B on A",
-                    "5. Pick up C",
-                    "6. Stack C on B",
-                ],
-                "invalid_plan": [
-                    "1. Stack B on A",  # Invalid - A is on B
-                ],
-            },
-            {
-                "initial_state": "Block A, B, and C are on the table. Block D is on Block A.",
-                "goal_state": "Block A is on Block B. Block B is on Block C. Block D is on Block A.",
-                "valid_plan": [
-                    "1. Unstack D from A",
-                    "2. Put D on table",
-                    "3. Pick up B",
-                    "4. Stack B on C",
-                    "5. Pick up A",
-                    "6. Stack A on B",
-                    "7. Pick up D",
-                    "8. Stack D on A",
-                ],
-                "invalid_plan": [
-                    "1. Pick up A",  # Invalid - D is on A
-                ],
-            },
-        ]
-
-        examples = []
-        for i in range(count):
-            case = blocksworld_cases[i % len(blocksworld_cases)].copy()
-            case["case_id"] = f"blocks_{i:03d}"
-            case["domain"] = "blocksworld"
-            examples.append(case)
-
-        return examples
-
-    def _create_logistics_examples(self, count: int) -> list[dict[str, Any]]:
-        """Create logistics planning examples."""
-        logistics_cases = [
-            {
-                "initial_state": "Package P1 is in City A. Truck T1 is in City A. Package needs to go to City B.",
-                "goal_state": "Package P1 is in City B.",
-                "valid_plan": [
-                    "1. Load P1 onto T1 in City A",
-                    "2. Drive T1 from City A to City B",
-                    "3. Unload P1 from T1 in City B",
-                ],
-                "invalid_plan": [
-                    "1. Drive T1 from City A to City B",
-                    "2. Unload P1 from T1",  # Invalid - P1 was never loaded
-                ],
-            },
-            {
-                "initial_state": "Package P1 is in City A. Package P2 is in City B. Plane A1 is in City A. Goal: P1 in City C, P2 in City A.",
-                "goal_state": "Package P1 is in City C. Package P2 is in City A.",
-                "valid_plan": [
-                    "1. Load P1 onto Plane A1 in City A",
-                    "2. Fly A1 from City A to City B",
-                    "3. Load P2 onto A1 in City B",
-                    "4. Fly A1 from City B to City A",
-                    "5. Unload P2 in City A",
-                    "6. Fly A1 from City A to City C",
-                    "7. Unload P1 in City C",
-                ],
-                "invalid_plan": [
-                    "1. Fly A1 to City B",
-                    "2. Unload P1",  # P1 was never loaded
-                ],
-            },
-        ]
-
-        examples = []
-        for i in range(count):
-            case = logistics_cases[i % len(logistics_cases)].copy()
-            case["case_id"] = f"logistics_{i:03d}"
-            case["domain"] = "logistics"
-            examples.append(case)
-
-        return examples
+    def _load_planbench_data(self) -> list[dict[str, Any]]:
+        """Load PlanBench data from HuggingFace."""
+        try:
+            ds = load_dataset("tasksource/planbench", self.config, split="train")
+            examples = []
+            for i, item in enumerate(ds):
+                examples.append({
+                    "case_id": f"planbench_{self.config}_{i:04d}",
+                    "task": item.get("task", ""),
+                    "prompt_type": item.get("prompt_type", ""),
+                    "domain": item.get("domain", ""),
+                    "instance_id": item.get("instance_id", ""),
+                    "query": item.get("query", ""),
+                    "ground_truth_plan": item.get("ground_truth_plan", ""),
+                })
+            return examples
+        except Exception as e:
+            log.error(f"Failed to load PlanBench from HuggingFace: {e}")
+            raise RuntimeError(f"Cannot load PlanBench data: {e}")
 
     def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
         """
         Convert a single doc into a ContrastivePair.
+
+        PlanBench HuggingFace format:
+        {"task": "task_1_plan_generation", "prompt_type": "oneshot", "domain": "...",
+         "instance_id": 2, "query": "...", "ground_truth_plan": "..."}
         """
         try:
             case_id = doc.get("case_id", "")
-
-
-
-
-            domain = doc.get("domain", self.domain)
-
-            if not initial_state or not goal_state:
-                log.debug("Skipping: missing states")
-                return None
+            query = doc.get("query", "").strip()
+            ground_truth_plan = doc.get("ground_truth_plan", "").strip()
+            domain = doc.get("domain", "")
+            task = doc.get("task", "")
 
-
-
-
-            )
+            if not query or not ground_truth_plan:
+                log.debug("Skipping: missing query or ground_truth_plan")
+                return None
 
-
-
-            # Negative = invalid plan
-            incorrect_response = self._create_invalid_plan_response(invalid_plan)
+            correct_response = self._create_correct_response(ground_truth_plan)
+            incorrect_response = self._create_incorrect_response(ground_truth_plan)
 
             metadata = {
                 "label": "planbench",
-                "source": "
+                "source": "tasksource/planbench",
                 "case_id": case_id,
                 "domain": domain,
-                "task":
-                "
+                "task": task,
+                "config": self.config,
                 "is_planning_benchmark": True,
             }
 
             return self._build_pair(
-                question=
+                question=query,
                 correct=correct_response,
                 incorrect=incorrect_response,
                 metadata=metadata,
@@ -268,50 +155,22 @@ class PlanBenchExtractor(HuggingFaceBenchmarkExtractor):
             log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
             return None
 
-    def
-
-    ) -> str:
-        """Build the planning task prompt."""
-        domain_desc = ""
-        if domain == "blocksworld":
-            domain_desc = (
-                "In this blocks world domain, you can:\n"
-                "- Pick up a block (only if nothing is on it and arm is empty)\n"
-                "- Put down a block on the table\n"
-                "- Stack a block on another (only if target block is clear)\n"
-                "- Unstack a block from another\n\n"
-            )
-        elif domain == "logistics":
-            domain_desc = (
-                "In this logistics domain, you can:\n"
-                "- Load packages onto trucks/planes (at same location)\n"
-                "- Unload packages from trucks/planes\n"
-                "- Drive trucks between locations in same city\n"
-                "- Fly planes between cities\n\n"
-            )
-
+    def _create_correct_response(self, ground_truth_plan: str) -> str:
+        """Create a response with the correct plan."""
         return (
-            f"{
-
-            f"Goal State:\n{goal_state}\n\n"
-            "Generate a valid sequence of actions to achieve the goal state from "
-            "the initial state. Ensure each action's preconditions are satisfied."
+            f"Here is the plan to achieve the goal:\n\n{ground_truth_plan}\n\n"
+            "Each action in this sequence has its preconditions satisfied."
         )
 
-    def
-        """Create
-
-
-
-
-
-        )
-
-    def _create_invalid_plan_response(self, plan: list[str]) -> str:
-        """Create a response with an invalid plan."""
-        plan_str = "\n".join(plan) if plan else "1. [Incomplete plan]"
+    def _create_incorrect_response(self, ground_truth_plan: str) -> str:
+        """Create an incorrect response (wrong/incomplete plan)."""
+        lines = ground_truth_plan.strip().split("\n")
+        if len(lines) > 1:
+            wrong_plan = "\n".join(reversed(lines[:2]))
+        else:
+            wrong_plan = "(noop)"
         return (
-            f"Here's my plan:\n\n{
+            f"Here's my plan:\n\n{wrong_plan}\n\n"
            "This should work to reach the goal."
         )
 
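
With the hardcoded blocksworld and logistics cases gone, the extractor's input is exactly what the Hub serves. A few lines with the datasets library reproduce what _load_planbench_data consumes, assuming the tasksource/planbench schema quoted in the new docstring (requires the datasets package and network access):

# Sketch: inspect the records the rewritten PlanBenchExtractor reads.
from datasets import load_dataset

ds = load_dataset("tasksource/planbench", "task_1_plan_generation", split="train")
row = ds[0]
print(row["domain"], row["instance_id"])
print(row["query"][:200])              # planning problem posed to the model
print(row["ground_truth_plan"][:200])  # becomes the positive response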
wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py

@@ -130,13 +130,46 @@ class PolyMathExtractor(HuggingFaceBenchmarkExtractor):
             return None
 
     def _create_incorrect_answer(self, correct: str) -> str:
-        """Create
+        """Create a meaningful incorrect answer using plausible wrong values."""
+        import random
+        import re
+        random.seed(hash(correct) % (2**32))
+
+        # Try symbolic parsing first
         try:
             parsed_correct = parse_latex(correct)
-
-
+            transforms = [
+                parsed_correct * 2,
+                parsed_correct / 2,
+                parsed_correct - 1,
+                -parsed_correct,
+            ]
+            wrong = random.choice(transforms)
+            return str(latex(wrong))
         except Exception:
-
+            pass
+
+        # Try simple integer
+        try:
+            clean = correct.replace('$', '').replace(',', '').strip()
+            num = int(clean)
+            wrong_vals = [num * 2, num // 2 if num > 1 else num * 3, num - 1, -num]
+            return str(random.choice(wrong_vals))
+        except ValueError:
+            pass
+
+        # For fractions
+        frac_match = re.match(r'\\frac\{(\d+)\}\{(\d+)\}', correct)
+        if frac_match:
+            n, d = int(frac_match.group(1)), int(frac_match.group(2))
+            return random.choice([f"\\frac{{{d}}}{{{n}}}", f"\\frac{{{n*2}}}{{{d}}}"])
+
+        # For pi expressions
+        if '\\pi' in correct:
+            return correct.replace('\\pi', '2\\pi') if '2\\pi' not in correct else correct.replace('2\\pi', '\\pi')
+
+        # Fallback
+        return random.choice(['0', '1', '-1', '2'])
 
 
     @staticmethod
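
One property of the new distractor logic deserves a note: random.seed(hash(correct) % (2**32)) makes the choice deterministic per answer, but Python randomizes str hashes per process, so the "same" distractor is only stable within a single run unless PYTHONHASHSEED is pinned. A standalone sketch of the symbolic branch, assuming sympy's parse_latex (which needs the antlr4-python3-runtime package) and latex as used in the diff:

# Sketch of the symbolic branch of PolyMath's new _create_incorrect_answer.
import random
from sympy import latex
from sympy.parsing.latex import parse_latex

def wrong_answer(correct: str) -> str:
    random.seed(hash(correct) % (2**32))  # stable only within one process
    expr = parse_latex(correct)
    transforms = [expr * 2, expr / 2, expr - 1, -expr]
    return latex(random.choice(transforms))

print(wrong_answer(r"\frac{3}{4}"))  # e.g. 3/2, 3/8, -1/4, or -3/4 in LaTeX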