wisent 0.7.701__py3-none-any.whl → 0.7.901__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wisent/__init__.py +1 -1
- wisent/core/activations/activation_cache.py +393 -0
- wisent/core/activations/activations.py +3 -3
- wisent/core/activations/activations_collector.py +9 -5
- wisent/core/activations/classifier_inference_strategy.py +12 -11
- wisent/core/activations/extraction_strategy.py +256 -84
- wisent/core/classifiers/classifiers/core/atoms.py +3 -2
- wisent/core/cli/__init__.py +2 -1
- wisent/core/cli/agent/apply_steering.py +5 -7
- wisent/core/cli/agent/train_classifier.py +19 -7
- wisent/core/cli/check_linearity.py +35 -3
- wisent/core/cli/cluster_benchmarks.py +4 -6
- wisent/core/cli/create_steering_vector.py +6 -4
- wisent/core/cli/diagnose_vectors.py +7 -4
- wisent/core/cli/estimate_unified_goodness_time.py +6 -4
- wisent/core/cli/generate_pairs_from_task.py +9 -56
- wisent/core/cli/geometry_search.py +137 -0
- wisent/core/cli/get_activations.py +1 -1
- wisent/core/cli/method_optimizer.py +4 -3
- wisent/core/cli/modify_weights.py +3 -2
- wisent/core/cli/optimize_sample_size.py +1 -1
- wisent/core/cli/optimize_steering.py +14 -16
- wisent/core/cli/optimize_weights.py +2 -1
- wisent/core/cli/preview_pairs.py +203 -0
- wisent/core/cli/steering_method_trainer.py +3 -3
- wisent/core/cli/tasks.py +19 -76
- wisent/core/cli/train_unified_goodness.py +3 -3
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
- wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
- wisent/core/data_loaders/loaders/lm_loader.py +12 -1
- wisent/core/geometry_runner.py +995 -0
- wisent/core/geometry_search_space.py +237 -0
- wisent/core/hyperparameter_optimizer.py +1 -1
- wisent/core/main.py +3 -0
- wisent/core/models/core/atoms.py +5 -3
- wisent/core/models/wisent_model.py +1 -1
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
- wisent/core/parser_arguments/check_linearity_parser.py +12 -2
- wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +2 -2
- wisent/core/parser_arguments/geometry_search_parser.py +61 -0
- wisent/core/parser_arguments/main_parser.py +8 -0
- wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
- wisent/core/steering.py +5 -3
- wisent/core/steering_methods/methods/hyperplane.py +2 -1
- wisent/core/synthetic/generators/nonsense_generator.py +30 -18
- wisent/core/trainers/steering_trainer.py +2 -2
- wisent/core/utils/device.py +27 -27
- wisent/core/utils/layer_combinations.py +70 -0
- wisent/examples/__init__.py +1 -0
- wisent/examples/scripts/__init__.py +1 -0
- wisent/examples/scripts/count_all_benchmarks.py +121 -0
- wisent/examples/scripts/discover_directions.py +469 -0
- wisent/examples/scripts/extract_benchmark_info.py +71 -0
- wisent/examples/scripts/generate_paper_data.py +384 -0
- wisent/examples/scripts/intervention_validation.py +626 -0
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
- wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
- wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
- wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
- wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
- wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
- wisent/examples/scripts/search_all_short_names.py +31 -0
- wisent/examples/scripts/test_all_benchmarks.py +138 -0
- wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
- wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
- wisent/examples/scripts/test_nonsense_baseline.py +261 -0
- wisent/examples/scripts/test_one_benchmark.py +324 -0
- wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
- wisent/examples/scripts/threshold_analysis.py +434 -0
- wisent/examples/scripts/visualization_gallery.py +582 -0
- wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
- wisent/parameters/lm_eval/category_directions.json +137 -0
- wisent/parameters/lm_eval/repair_plan.json +282 -0
- wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
- wisent/parameters/lm_eval/working_benchmarks.json +206 -0
- wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
- wisent/tests/test_detector_accuracy.py +1 -1
- wisent/tests/visualize_geometry.py +1 -1
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/RECORD +329 -295
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
|
@@ -2,15 +2,22 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
from typing import Any
|
|
4
4
|
from wisent.core.cli_logger import setup_logger
|
|
5
|
+
import requests
|
|
6
|
+
import zipfile
|
|
7
|
+
import json
|
|
8
|
+
import io
|
|
5
9
|
|
|
6
10
|
from wisent.core.contrastive_pairs.core.pair import ContrastivePair
|
|
7
11
|
from wisent.core.contrastive_pairs.huggingface_pairs.atoms import HuggingFaceBenchmarkExtractor
|
|
8
12
|
from wisent.core.errors import InvalidValueError
|
|
9
13
|
|
|
10
|
-
__all__ = ["OJBenchExtractor", "
|
|
14
|
+
__all__ = ["OJBenchExtractor", "NL2BashExtractor", "SciCodeExtractor"]
|
|
11
15
|
|
|
12
16
|
log = setup_logger(__name__)
|
|
13
17
|
|
|
18
|
+
# GitHub URL for SciCode data
|
|
19
|
+
SCICODE_GITHUB_URL = "https://raw.githubusercontent.com/scicode-bench/scicode-bench.github.io/main/data/data.zip"
|
|
20
|
+
|
|
14
21
|
|
|
15
22
|
class OJBenchExtractor(HuggingFaceBenchmarkExtractor):
|
|
16
23
|
"""
|
|
@@ -66,9 +73,9 @@ class OJBenchExtractor(HuggingFaceBenchmarkExtractor):
|
|
|
66
73
|
)
|
|
67
74
|
log.info(f"Loaded {len(docs)} examples from code_contests")
|
|
68
75
|
except Exception as e:
|
|
69
|
-
log.
|
|
70
|
-
|
|
71
|
-
|
|
76
|
+
log.error(f"Failed to load code_contests dataset: {e}")
|
|
77
|
+
log.error("OJBench requires deepmind/code_contests dataset. No synthetic data available.")
|
|
78
|
+
return []
|
|
72
79
|
|
|
73
80
|
pairs: list[ContrastivePair] = []
|
|
74
81
|
|
|
@@ -84,256 +91,6 @@ class OJBenchExtractor(HuggingFaceBenchmarkExtractor):
|
|
|
84
91
|
|
|
85
92
|
return pairs
|
|
86
93
|
|
|
87
|
-
def _create_synthetic_examples(self, count: int) -> list[dict[str, Any]]:
|
|
88
|
-
"""Create synthetic competitive programming examples."""
|
|
89
|
-
examples = [
|
|
90
|
-
{
|
|
91
|
-
"description": """Problem: Two Sum
|
|
92
|
-
Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target.
|
|
93
|
-
|
|
94
|
-
Input: First line contains n (1 ≤ n ≤ 10^5) and target. Second line contains n space-separated integers.
|
|
95
|
-
Output: Two indices (0-indexed) separated by space.
|
|
96
|
-
|
|
97
|
-
Example:
|
|
98
|
-
Input:
|
|
99
|
-
4 9
|
|
100
|
-
2 7 11 15
|
|
101
|
-
Output:
|
|
102
|
-
0 1""",
|
|
103
|
-
"correct_solution": """#include <bits/stdc++.h>
|
|
104
|
-
using namespace std;
|
|
105
|
-
|
|
106
|
-
int main() {
|
|
107
|
-
ios::sync_with_stdio(false);
|
|
108
|
-
cin.tie(nullptr);
|
|
109
|
-
|
|
110
|
-
int n, target;
|
|
111
|
-
cin >> n >> target;
|
|
112
|
-
|
|
113
|
-
vector<int> nums(n);
|
|
114
|
-
unordered_map<int, int> mp;
|
|
115
|
-
|
|
116
|
-
for (int i = 0; i < n; i++) {
|
|
117
|
-
cin >> nums[i];
|
|
118
|
-
int complement = target - nums[i];
|
|
119
|
-
if (mp.count(complement)) {
|
|
120
|
-
cout << mp[complement] << " " << i << endl;
|
|
121
|
-
return 0;
|
|
122
|
-
}
|
|
123
|
-
mp[nums[i]] = i;
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
return 0;
|
|
127
|
-
}""",
|
|
128
|
-
"incorrect_solution": """#include <bits/stdc++.h>
|
|
129
|
-
using namespace std;
|
|
130
|
-
|
|
131
|
-
int main() {
|
|
132
|
-
int n, target;
|
|
133
|
-
cin >> n >> target;
|
|
134
|
-
|
|
135
|
-
vector<int> nums(n);
|
|
136
|
-
for (int i = 0; i < n; i++) cin >> nums[i];
|
|
137
|
-
|
|
138
|
-
// O(n^2) - will TLE on large inputs
|
|
139
|
-
for (int i = 0; i < n; i++) {
|
|
140
|
-
for (int j = 0; j < n; j++) { // Bug: should start from i+1
|
|
141
|
-
if (nums[i] + nums[j] == target) {
|
|
142
|
-
cout << i << " " << j << endl;
|
|
143
|
-
return 0;
|
|
144
|
-
}
|
|
145
|
-
}
|
|
146
|
-
}
|
|
147
|
-
return 0;
|
|
148
|
-
}""",
|
|
149
|
-
"difficulty": "easy",
|
|
150
|
-
},
|
|
151
|
-
{
|
|
152
|
-
"description": """Problem: Maximum Subarray Sum
|
|
153
|
-
Find the contiguous subarray with the largest sum.
|
|
154
|
-
|
|
155
|
-
Input: First line contains n (1 ≤ n ≤ 10^6). Second line contains n integers (-10^9 ≤ a[i] ≤ 10^9).
|
|
156
|
-
Output: Maximum subarray sum.
|
|
157
|
-
|
|
158
|
-
Example:
|
|
159
|
-
Input:
|
|
160
|
-
8
|
|
161
|
-
-2 1 -3 4 -1 2 1 -5 4
|
|
162
|
-
Output:
|
|
163
|
-
6""",
|
|
164
|
-
"correct_solution": """#include <bits/stdc++.h>
|
|
165
|
-
using namespace std;
|
|
166
|
-
|
|
167
|
-
int main() {
|
|
168
|
-
ios::sync_with_stdio(false);
|
|
169
|
-
cin.tie(nullptr);
|
|
170
|
-
|
|
171
|
-
int n;
|
|
172
|
-
cin >> n;
|
|
173
|
-
|
|
174
|
-
long long maxSum = LLONG_MIN;
|
|
175
|
-
long long currentSum = 0;
|
|
176
|
-
|
|
177
|
-
for (int i = 0; i < n; i++) {
|
|
178
|
-
long long x;
|
|
179
|
-
cin >> x;
|
|
180
|
-
currentSum = max(x, currentSum + x);
|
|
181
|
-
maxSum = max(maxSum, currentSum);
|
|
182
|
-
}
|
|
183
|
-
|
|
184
|
-
cout << maxSum << endl;
|
|
185
|
-
return 0;
|
|
186
|
-
}""",
|
|
187
|
-
"incorrect_solution": """#include <bits/stdc++.h>
|
|
188
|
-
using namespace std;
|
|
189
|
-
|
|
190
|
-
int main() {
|
|
191
|
-
int n;
|
|
192
|
-
cin >> n;
|
|
193
|
-
|
|
194
|
-
vector<int> a(n);
|
|
195
|
-
for (int i = 0; i < n; i++) cin >> a[i];
|
|
196
|
-
|
|
197
|
-
int maxSum = 0; // Bug: should be LLONG_MIN for negative arrays
|
|
198
|
-
int currentSum = 0;
|
|
199
|
-
|
|
200
|
-
for (int i = 0; i < n; i++) {
|
|
201
|
-
currentSum += a[i]; // Bug: doesn't handle Kadane's algorithm correctly
|
|
202
|
-
if (currentSum > maxSum) maxSum = currentSum;
|
|
203
|
-
if (currentSum < 0) currentSum = 0;
|
|
204
|
-
}
|
|
205
|
-
|
|
206
|
-
cout << maxSum << endl;
|
|
207
|
-
return 0;
|
|
208
|
-
}""",
|
|
209
|
-
"difficulty": "medium",
|
|
210
|
-
},
|
|
211
|
-
{
|
|
212
|
-
"description": """Problem: Segment Tree Range Sum
|
|
213
|
-
Given an array, support two operations:
|
|
214
|
-
1. Update a[i] = x
|
|
215
|
-
2. Query sum(l, r)
|
|
216
|
-
|
|
217
|
-
Input: First line n, q. Second line is initial array. Next q lines are operations.
|
|
218
|
-
Output: Answer for each query operation.
|
|
219
|
-
|
|
220
|
-
Example:
|
|
221
|
-
Input:
|
|
222
|
-
5 3
|
|
223
|
-
1 2 3 4 5
|
|
224
|
-
2 1 3
|
|
225
|
-
1 2 10
|
|
226
|
-
2 1 3
|
|
227
|
-
Output:
|
|
228
|
-
6
|
|
229
|
-
14""",
|
|
230
|
-
"correct_solution": """#include <bits/stdc++.h>
|
|
231
|
-
using namespace std;
|
|
232
|
-
|
|
233
|
-
class SegmentTree {
|
|
234
|
-
vector<long long> tree;
|
|
235
|
-
int n;
|
|
236
|
-
|
|
237
|
-
public:
|
|
238
|
-
SegmentTree(vector<int>& arr) {
|
|
239
|
-
n = arr.size();
|
|
240
|
-
tree.resize(4 * n);
|
|
241
|
-
build(arr, 1, 0, n - 1);
|
|
242
|
-
}
|
|
243
|
-
|
|
244
|
-
void build(vector<int>& arr, int v, int tl, int tr) {
|
|
245
|
-
if (tl == tr) {
|
|
246
|
-
tree[v] = arr[tl];
|
|
247
|
-
} else {
|
|
248
|
-
int tm = (tl + tr) / 2;
|
|
249
|
-
build(arr, 2*v, tl, tm);
|
|
250
|
-
build(arr, 2*v+1, tm+1, tr);
|
|
251
|
-
tree[v] = tree[2*v] + tree[2*v+1];
|
|
252
|
-
}
|
|
253
|
-
}
|
|
254
|
-
|
|
255
|
-
void update(int v, int tl, int tr, int pos, int val) {
|
|
256
|
-
if (tl == tr) {
|
|
257
|
-
tree[v] = val;
|
|
258
|
-
} else {
|
|
259
|
-
int tm = (tl + tr) / 2;
|
|
260
|
-
if (pos <= tm) update(2*v, tl, tm, pos, val);
|
|
261
|
-
else update(2*v+1, tm+1, tr, pos, val);
|
|
262
|
-
tree[v] = tree[2*v] + tree[2*v+1];
|
|
263
|
-
}
|
|
264
|
-
}
|
|
265
|
-
|
|
266
|
-
long long query(int v, int tl, int tr, int l, int r) {
|
|
267
|
-
if (l > r) return 0;
|
|
268
|
-
if (l == tl && r == tr) return tree[v];
|
|
269
|
-
int tm = (tl + tr) / 2;
|
|
270
|
-
return query(2*v, tl, tm, l, min(r, tm)) +
|
|
271
|
-
query(2*v+1, tm+1, tr, max(l, tm+1), r);
|
|
272
|
-
}
|
|
273
|
-
|
|
274
|
-
void update(int pos, int val) { update(1, 0, n-1, pos, val); }
|
|
275
|
-
long long query(int l, int r) { return query(1, 0, n-1, l, r); }
|
|
276
|
-
};
|
|
277
|
-
|
|
278
|
-
int main() {
|
|
279
|
-
ios::sync_with_stdio(false);
|
|
280
|
-
cin.tie(nullptr);
|
|
281
|
-
|
|
282
|
-
int n, q;
|
|
283
|
-
cin >> n >> q;
|
|
284
|
-
|
|
285
|
-
vector<int> a(n);
|
|
286
|
-
for (int i = 0; i < n; i++) cin >> a[i];
|
|
287
|
-
|
|
288
|
-
SegmentTree st(a);
|
|
289
|
-
|
|
290
|
-
while (q--) {
|
|
291
|
-
int type, x, y;
|
|
292
|
-
cin >> type >> x >> y;
|
|
293
|
-
if (type == 1) {
|
|
294
|
-
st.update(x - 1, y);
|
|
295
|
-
} else {
|
|
296
|
-
cout << st.query(x - 1, y - 1) << "\\n";
|
|
297
|
-
}
|
|
298
|
-
}
|
|
299
|
-
|
|
300
|
-
return 0;
|
|
301
|
-
}""",
|
|
302
|
-
"incorrect_solution": """#include <bits/stdc++.h>
|
|
303
|
-
using namespace std;
|
|
304
|
-
|
|
305
|
-
int main() {
|
|
306
|
-
int n, q;
|
|
307
|
-
cin >> n >> q;
|
|
308
|
-
|
|
309
|
-
vector<int> a(n);
|
|
310
|
-
for (int i = 0; i < n; i++) cin >> a[i];
|
|
311
|
-
|
|
312
|
-
// O(n) per query - will TLE
|
|
313
|
-
while (q--) {
|
|
314
|
-
int type, x, y;
|
|
315
|
-
cin >> type >> x >> y;
|
|
316
|
-
if (type == 1) {
|
|
317
|
-
a[x-1] = y;
|
|
318
|
-
} else {
|
|
319
|
-
int sum = 0;
|
|
320
|
-
for (int i = x-1; i < y; i++) sum += a[i];
|
|
321
|
-
cout << sum << "\\n";
|
|
322
|
-
}
|
|
323
|
-
}
|
|
324
|
-
return 0;
|
|
325
|
-
}""",
|
|
326
|
-
"difficulty": "hard",
|
|
327
|
-
},
|
|
328
|
-
]
|
|
329
|
-
|
|
330
|
-
result = []
|
|
331
|
-
for i in range(count):
|
|
332
|
-
example = examples[i % len(examples)].copy()
|
|
333
|
-
result.append(example)
|
|
334
|
-
|
|
335
|
-
return result
|
|
336
|
-
|
|
337
94
|
def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
|
|
338
95
|
"""Convert a single doc into a ContrastivePair."""
|
|
339
96
|
try:
|
|
@@ -436,38 +193,30 @@ int main() {
|
|
|
436
193
|
|
|
437
194
|
|
|
438
195
|
|
|
439
|
-
class
|
|
196
|
+
class NL2BashExtractor(HuggingFaceBenchmarkExtractor):
|
|
440
197
|
"""
|
|
441
|
-
Extractor for
|
|
198
|
+
Extractor for NL2Bash - Natural Language to Bash command generation.
|
|
442
199
|
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
200
|
+
Dataset: jiacheng-ye/nl2bash on HuggingFace
|
|
201
|
+
|
|
202
|
+
NL2Bash evaluates LLMs' ability to translate natural language descriptions
|
|
203
|
+
into correct Bash shell commands. Tests command syntax, flag usage,
|
|
204
|
+
and understanding of CLI tools.
|
|
446
205
|
|
|
447
|
-
For
|
|
448
|
-
- Positive (correct) = Correct
|
|
449
|
-
- Negative (incorrect) =
|
|
206
|
+
For bash command generation evaluation:
|
|
207
|
+
- Positive (correct) = Correct bash command with proper syntax
|
|
208
|
+
- Negative (incorrect) = Command with errors, wrong syntax, or missing parts
|
|
450
209
|
"""
|
|
451
210
|
|
|
452
211
|
# Evaluator that should be used for this benchmark
|
|
453
|
-
evaluator_name = "
|
|
454
|
-
|
|
455
|
-
def __init__(self, os_type: str = "linux"):
|
|
456
|
-
"""
|
|
457
|
-
Initialize Terminal-Bench extractor.
|
|
458
|
-
|
|
459
|
-
Args:
|
|
460
|
-
os_type: Operating system type (linux, macos, windows)
|
|
461
|
-
"""
|
|
462
|
-
super().__init__()
|
|
463
|
-
self.os_type = os_type
|
|
212
|
+
evaluator_name = "bash_generation"
|
|
464
213
|
|
|
465
214
|
def extract_contrastive_pairs(
|
|
466
215
|
self,
|
|
467
216
|
limit: int | None = None,
|
|
468
217
|
) -> list[ContrastivePair]:
|
|
469
218
|
"""
|
|
470
|
-
Build contrastive pairs from
|
|
219
|
+
Build contrastive pairs from NL2Bash dataset.
|
|
471
220
|
|
|
472
221
|
Args:
|
|
473
222
|
limit: Optional maximum number of pairs to produce.
|
|
@@ -477,20 +226,16 @@ class TerminalBenchExtractor(HuggingFaceBenchmarkExtractor):
|
|
|
477
226
|
"""
|
|
478
227
|
max_items = self._normalize_limit(limit)
|
|
479
228
|
|
|
480
|
-
# Try loading NL2Bash dataset
|
|
481
|
-
docs = []
|
|
482
|
-
|
|
483
229
|
try:
|
|
484
230
|
docs = self.load_dataset(
|
|
485
231
|
dataset_name="jiacheng-ye/nl2bash",
|
|
486
232
|
split="test",
|
|
487
|
-
limit=max_items
|
|
233
|
+
limit=max_items,
|
|
488
234
|
)
|
|
489
235
|
log.info(f"Loaded {len(docs)} examples from nl2bash")
|
|
490
236
|
except Exception as e:
|
|
491
|
-
log.
|
|
492
|
-
|
|
493
|
-
docs = self._create_synthetic_examples(max_items or 100)
|
|
237
|
+
log.error(f"Failed to load nl2bash dataset: {e}")
|
|
238
|
+
return []
|
|
494
239
|
|
|
495
240
|
pairs: list[ContrastivePair] = []
|
|
496
241
|
|
|
@@ -502,116 +247,40 @@ class TerminalBenchExtractor(HuggingFaceBenchmarkExtractor):
|
|
|
502
247
|
break
|
|
503
248
|
|
|
504
249
|
if not pairs:
|
|
505
|
-
log.warning("No valid
|
|
250
|
+
log.warning("No valid NL2Bash pairs extracted")
|
|
506
251
|
|
|
507
252
|
return pairs
|
|
508
253
|
|
|
509
|
-
def _create_synthetic_examples(self, count: int) -> list[dict[str, Any]]:
|
|
510
|
-
"""Create synthetic terminal interaction examples."""
|
|
511
|
-
examples = [
|
|
512
|
-
{
|
|
513
|
-
"nl": "Find all Python files in the current directory and subdirectories",
|
|
514
|
-
"correct_command": "find . -name '*.py' -type f",
|
|
515
|
-
"incorrect_command": "find *.py", # Wrong syntax
|
|
516
|
-
"category": "file_search",
|
|
517
|
-
},
|
|
518
|
-
{
|
|
519
|
-
"nl": "Count the number of lines in all text files in the current directory",
|
|
520
|
-
"correct_command": "wc -l *.txt | tail -1",
|
|
521
|
-
"incorrect_command": "count lines *.txt", # Not a real command
|
|
522
|
-
"category": "file_analysis",
|
|
523
|
-
},
|
|
524
|
-
{
|
|
525
|
-
"nl": "Create a compressed archive of the logs directory",
|
|
526
|
-
"correct_command": "tar -czvf logs.tar.gz logs/",
|
|
527
|
-
"incorrect_command": "zip logs/ archive", # Wrong argument order
|
|
528
|
-
"category": "archiving",
|
|
529
|
-
},
|
|
530
|
-
{
|
|
531
|
-
"nl": "Show running processes sorted by memory usage",
|
|
532
|
-
"correct_command": "ps aux --sort=-%mem | head -20",
|
|
533
|
-
"incorrect_command": "ps memory", # Invalid syntax
|
|
534
|
-
"category": "process_management",
|
|
535
|
-
},
|
|
536
|
-
{
|
|
537
|
-
"nl": "Find and kill all processes named 'python'",
|
|
538
|
-
"correct_command": "pkill -f python",
|
|
539
|
-
"incorrect_command": "kill python", # kill needs PID, not name
|
|
540
|
-
"category": "process_management",
|
|
541
|
-
},
|
|
542
|
-
{
|
|
543
|
-
"nl": "Download a file from a URL and save it with a specific name",
|
|
544
|
-
"correct_command": "curl -o output.txt https://example.com/file.txt",
|
|
545
|
-
"incorrect_command": "download https://example.com/file.txt", # Not a command
|
|
546
|
-
"category": "networking",
|
|
547
|
-
},
|
|
548
|
-
{
|
|
549
|
-
"nl": "Find files modified in the last 24 hours",
|
|
550
|
-
"correct_command": "find . -mtime -1 -type f",
|
|
551
|
-
"incorrect_command": "find . modified 24h", # Wrong syntax
|
|
552
|
-
"category": "file_search",
|
|
553
|
-
},
|
|
554
|
-
{
|
|
555
|
-
"nl": "Replace all occurrences of 'foo' with 'bar' in a file in-place",
|
|
556
|
-
"correct_command": "sed -i 's/foo/bar/g' file.txt",
|
|
557
|
-
"incorrect_command": "replace foo bar file.txt", # Not a command
|
|
558
|
-
"category": "text_processing",
|
|
559
|
-
},
|
|
560
|
-
{
|
|
561
|
-
"nl": "Check disk space usage for all mounted filesystems",
|
|
562
|
-
"correct_command": "df -h",
|
|
563
|
-
"incorrect_command": "disk space", # Not a command
|
|
564
|
-
"category": "system_info",
|
|
565
|
-
},
|
|
566
|
-
{
|
|
567
|
-
"nl": "Create a new user named 'developer' with home directory",
|
|
568
|
-
"correct_command": "sudo useradd -m -s /bin/bash developer",
|
|
569
|
-
"incorrect_command": "create user developer", # Not a command
|
|
570
|
-
"category": "user_management",
|
|
571
|
-
},
|
|
572
|
-
]
|
|
573
|
-
|
|
574
|
-
result = []
|
|
575
|
-
for i in range(count):
|
|
576
|
-
example = examples[i % len(examples)].copy()
|
|
577
|
-
result.append(example)
|
|
578
|
-
|
|
579
|
-
return result
|
|
580
|
-
|
|
581
254
|
def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
|
|
582
|
-
"""Convert a single doc into a ContrastivePair.
|
|
255
|
+
"""Convert a single doc into a ContrastivePair.
|
|
256
|
+
|
|
257
|
+
nl2bash schema:
|
|
258
|
+
- nl: str (natural language description)
|
|
259
|
+
- bash: str (correct bash command)
|
|
260
|
+
"""
|
|
583
261
|
try:
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
correct = doc.get("correct_command", doc.get("bash", "")).strip()
|
|
587
|
-
incorrect = doc.get("incorrect_command", "").strip()
|
|
588
|
-
category = doc.get("category", "general")
|
|
262
|
+
nl = doc.get("nl", "").strip()
|
|
263
|
+
correct = doc.get("bash", "").strip()
|
|
589
264
|
|
|
590
|
-
if not nl:
|
|
265
|
+
if not nl or not correct:
|
|
591
266
|
return None
|
|
592
267
|
|
|
593
|
-
|
|
594
|
-
return None
|
|
595
|
-
|
|
596
|
-
if not incorrect:
|
|
597
|
-
incorrect = self._create_incorrect_command(nl)
|
|
598
|
-
|
|
599
|
-
task_prompt = f"""Terminal Command Task:
|
|
268
|
+
task_prompt = f"""Bash Command Task:
|
|
600
269
|
|
|
601
270
|
{nl}
|
|
602
271
|
|
|
603
|
-
Provide the correct
|
|
604
|
-
The command should be safe, efficient, and follow best practices."""
|
|
272
|
+
Provide the correct bash command to accomplish this task."""
|
|
605
273
|
|
|
606
|
-
|
|
607
|
-
|
|
274
|
+
# Create incorrect by corrupting the command
|
|
275
|
+
incorrect = self._create_incorrect_command(correct)
|
|
276
|
+
|
|
277
|
+
correct_response = f"```bash\n{correct}\n```"
|
|
278
|
+
incorrect_response = f"```bash\n{incorrect}\n```"
|
|
608
279
|
|
|
609
280
|
metadata = {
|
|
610
|
-
"label": "
|
|
611
|
-
"source": "
|
|
612
|
-
"
|
|
613
|
-
"os_type": self.os_type,
|
|
614
|
-
"is_terminal_benchmark": True,
|
|
281
|
+
"label": "nl2bash",
|
|
282
|
+
"source": "jiacheng-ye/nl2bash",
|
|
283
|
+
"is_bash_benchmark": True,
|
|
615
284
|
}
|
|
616
285
|
|
|
617
286
|
return self._build_pair(
|
|
@@ -625,9 +294,13 @@ The command should be safe, efficient, and follow best practices."""
|
|
|
625
294
|
log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
|
|
626
295
|
return None
|
|
627
296
|
|
|
628
|
-
def _create_incorrect_command(self,
|
|
629
|
-
"""Create a plausible but incorrect command."""
|
|
630
|
-
|
|
297
|
+
def _create_incorrect_command(self, correct: str) -> str:
|
|
298
|
+
"""Create a plausible but incorrect command by corrupting the correct one."""
|
|
299
|
+
# Remove a flag or part of the command
|
|
300
|
+
parts = correct.split()
|
|
301
|
+
if len(parts) > 2:
|
|
302
|
+
return " ".join(parts[:-1]) # Remove last part
|
|
303
|
+
return correct + " --invalid-flag"
|
|
631
304
|
|
|
632
305
|
|
|
633
306
|
|
|
@@ -635,11 +308,12 @@ class SciCodeExtractor(HuggingFaceBenchmarkExtractor):
|
|
|
635
308
|
"""
|
|
636
309
|
Extractor for SciCode - scientific computing code generation benchmark.
|
|
637
310
|
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
scientific computations.
|
|
311
|
+
GitHub: https://scicode-bench.github.io/
|
|
312
|
+
Paper: "SciCode: A Research Coding Benchmark Curated by Scientists"
|
|
641
313
|
|
|
642
|
-
|
|
314
|
+
SciCode evaluates LLMs' ability to generate code for scientific computing
|
|
315
|
+
tasks across Physics, Math, Material Science, Biology, and Chemistry.
|
|
316
|
+
Contains 338 subproblems from 80 main challenges.
|
|
643
317
|
|
|
644
318
|
For scientific computing evaluation:
|
|
645
319
|
- Positive (correct) = Scientifically accurate code with proper numerical methods
|
|
@@ -666,6 +340,8 @@ class SciCodeExtractor(HuggingFaceBenchmarkExtractor):
|
|
|
666
340
|
"""
|
|
667
341
|
Build contrastive pairs from SciCode examples.
|
|
668
342
|
|
|
343
|
+
Loads data from GitHub ZIP archive.
|
|
344
|
+
|
|
669
345
|
Args:
|
|
670
346
|
limit: Optional maximum number of pairs to produce.
|
|
671
347
|
|
|
@@ -673,16 +349,21 @@ class SciCodeExtractor(HuggingFaceBenchmarkExtractor):
|
|
|
673
349
|
A list of ContrastivePair objects.
|
|
674
350
|
"""
|
|
675
351
|
max_items = self._normalize_limit(limit)
|
|
352
|
+
pairs: list[ContrastivePair] = []
|
|
676
353
|
|
|
677
|
-
|
|
678
|
-
|
|
354
|
+
docs = self._load_from_github()
|
|
355
|
+
|
|
356
|
+
if not docs:
|
|
357
|
+
log.error("Failed to load SciCode data from GitHub")
|
|
358
|
+
return []
|
|
679
359
|
|
|
680
|
-
|
|
360
|
+
log.info(f"Loaded {len(docs)} problems from SciCode GitHub")
|
|
681
361
|
|
|
682
362
|
for doc in docs:
|
|
683
|
-
|
|
363
|
+
# Filter by domain if specified
|
|
364
|
+
if self.domain and doc.get("domain", "").lower() != self.domain.lower():
|
|
684
365
|
continue
|
|
685
|
-
|
|
366
|
+
|
|
686
367
|
pair = self._extract_pair_from_doc(doc)
|
|
687
368
|
if pair is not None:
|
|
688
369
|
pairs.append(pair)
|
|
@@ -694,125 +375,55 @@ class SciCodeExtractor(HuggingFaceBenchmarkExtractor):
|
|
|
694
375
|
|
|
695
376
|
return pairs
|
|
696
377
|
|
|
697
|
-
def
|
|
698
|
-
"""
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
return integral * h / 3
|
|
723
|
-
|
|
724
|
-
# Example: Integrate sin(x) from 0 to pi (expected: 2.0)
|
|
725
|
-
result = simpsons_rule(np.sin, 0, np.pi, 100)
|
|
726
|
-
print(f"Integral of sin(x) from 0 to pi: {result:.10f}")""",
|
|
727
|
-
"incorrect_solution": """import numpy as np
|
|
728
|
-
|
|
729
|
-
def simpsons_rule(f, a, b, n):
|
|
730
|
-
h = (b - a) / n
|
|
731
|
-
x = np.linspace(a, b, n) # Bug: should be n+1 points
|
|
732
|
-
y = f(x)
|
|
733
|
-
|
|
734
|
-
# Wrong implementation - missing proper weighting
|
|
735
|
-
integral = np.sum(y) * h # This is just rectangular rule
|
|
736
|
-
|
|
737
|
-
return integral""",
|
|
738
|
-
},
|
|
739
|
-
{
|
|
740
|
-
"problem": "Solve a system of ODEs using Runge-Kutta 4th order method",
|
|
741
|
-
"domain": "physics",
|
|
742
|
-
"correct_solution": """import numpy as np
|
|
743
|
-
|
|
744
|
-
def rk4_step(f, t, y, h):
|
|
745
|
-
'''
|
|
746
|
-
Single step of RK4 method.
|
|
747
|
-
f: function f(t, y) returning dy/dt
|
|
748
|
-
t: current time
|
|
749
|
-
y: current state vector
|
|
750
|
-
h: step size
|
|
751
|
-
'''
|
|
752
|
-
k1 = h * f(t, y)
|
|
753
|
-
k2 = h * f(t + h/2, y + k1/2)
|
|
754
|
-
k3 = h * f(t + h/2, y + k2/2)
|
|
755
|
-
k4 = h * f(t + h, y + k3)
|
|
756
|
-
|
|
757
|
-
return y + (k1 + 2*k2 + 2*k3 + k4) / 6
|
|
758
|
-
|
|
759
|
-
def solve_ode(f, y0, t_span, n_steps):
|
|
760
|
-
'''
|
|
761
|
-
Solve ODE system dy/dt = f(t, y) using RK4.
|
|
762
|
-
'''
|
|
763
|
-
t = np.linspace(t_span[0], t_span[1], n_steps + 1)
|
|
764
|
-
h = t[1] - t[0]
|
|
765
|
-
|
|
766
|
-
y = np.zeros((n_steps + 1, len(y0)))
|
|
767
|
-
y[0] = y0
|
|
768
|
-
|
|
769
|
-
for i in range(n_steps):
|
|
770
|
-
y[i+1] = rk4_step(f, t[i], y[i], h)
|
|
771
|
-
|
|
772
|
-
return t, y
|
|
773
|
-
|
|
774
|
-
# Example: Simple harmonic oscillator
|
|
775
|
-
def harmonic(t, y):
|
|
776
|
-
return np.array([y[1], -y[0]])
|
|
777
|
-
|
|
778
|
-
t, y = solve_ode(harmonic, np.array([1.0, 0.0]), [0, 10], 1000)""",
|
|
779
|
-
"incorrect_solution": """import numpy as np
|
|
780
|
-
|
|
781
|
-
def euler_step(f, t, y, h):
|
|
782
|
-
# Using Euler method instead of RK4 - much less accurate
|
|
783
|
-
return y + h * f(t, y)
|
|
784
|
-
|
|
785
|
-
def solve_ode(f, y0, t_span, n_steps):
|
|
786
|
-
t = np.linspace(t_span[0], t_span[1], n_steps) # Bug: should be n_steps+1
|
|
787
|
-
h = (t_span[1] - t_span[0]) / n_steps
|
|
788
|
-
|
|
789
|
-
y = [y0]
|
|
790
|
-
for i in range(n_steps - 1):
|
|
791
|
-
y.append(euler_step(f, t[i], y[i], h))
|
|
792
|
-
|
|
793
|
-
return t, np.array(y)""",
|
|
794
|
-
},
|
|
795
|
-
]
|
|
796
|
-
|
|
797
|
-
result = []
|
|
798
|
-
for i in range(count):
|
|
799
|
-
example = examples[i % len(examples)].copy()
|
|
800
|
-
result.append(example)
|
|
801
|
-
|
|
802
|
-
return result
|
|
378
|
+
def _load_from_github(self) -> list[dict[str, Any]]:
|
|
379
|
+
"""Load SciCode data from GitHub ZIP archive."""
|
|
380
|
+
try:
|
|
381
|
+
response = requests.get(SCICODE_GITHUB_URL, timeout=60)
|
|
382
|
+
response.raise_for_status()
|
|
383
|
+
|
|
384
|
+
all_problems = []
|
|
385
|
+
with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
|
|
386
|
+
for filename in zf.namelist():
|
|
387
|
+
if filename.endswith('.json'):
|
|
388
|
+
with zf.open(filename) as f:
|
|
389
|
+
try:
|
|
390
|
+
data = json.load(f)
|
|
391
|
+
if isinstance(data, list):
|
|
392
|
+
all_problems.extend(data)
|
|
393
|
+
elif isinstance(data, dict):
|
|
394
|
+
all_problems.append(data)
|
|
395
|
+
except json.JSONDecodeError:
|
|
396
|
+
continue
|
|
397
|
+
|
|
398
|
+
return all_problems
|
|
399
|
+
|
|
400
|
+
except Exception as e:
|
|
401
|
+
log.error(f"Failed to load SciCode from GitHub: {e}")
|
|
402
|
+
return []
|
|
803
403
|
|
|
804
404
|
def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
|
|
805
|
-
"""Convert a single doc into a ContrastivePair.
|
|
405
|
+
"""Convert a single doc into a ContrastivePair.
|
|
406
|
+
|
|
407
|
+
SciCode schema varies by file, but typically includes:
|
|
408
|
+
- problem_id: str
|
|
409
|
+
- problem: str (description)
|
|
410
|
+
- sub_problems: list of subproblems
|
|
411
|
+
- domain: str (Physics, Math, etc.)
|
|
412
|
+
"""
|
|
806
413
|
try:
|
|
807
|
-
|
|
414
|
+
problem_id = doc.get("problem_id", "")
|
|
415
|
+
problem = doc.get("problem", doc.get("description", "")).strip()
|
|
808
416
|
domain = doc.get("domain", "general")
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
if not problem
|
|
417
|
+
sub_problems = doc.get("sub_problems", [])
|
|
418
|
+
|
|
419
|
+
# Try to get problem text from various fields
|
|
420
|
+
if not problem and sub_problems:
|
|
421
|
+
problem = sub_problems[0].get("problem", "") if sub_problems else ""
|
|
422
|
+
|
|
423
|
+
if not problem:
|
|
813
424
|
return None
|
|
814
425
|
|
|
815
|
-
task_prompt = f"""Scientific Computing Task:
|
|
426
|
+
task_prompt = f"""Scientific Computing Task ({domain}):
|
|
816
427
|
|
|
817
428
|
{problem}
|
|
818
429
|
|
|
@@ -821,12 +432,21 @@ Provide a Python implementation that is:
|
|
|
821
432
|
- Well-documented with clear variable names
|
|
822
433
|
- Efficient and follows scientific computing best practices"""
|
|
823
434
|
|
|
435
|
+
# Create correct response placeholder (actual solution from benchmark)
|
|
436
|
+
correct = doc.get("solution", doc.get("code", "# Correct solution would go here"))
|
|
437
|
+
if isinstance(correct, list):
|
|
438
|
+
correct = correct[0] if correct else "# Solution"
|
|
439
|
+
|
|
440
|
+
# Create incorrect by corrupting
|
|
441
|
+
incorrect = "# Incorrect implementation with numerical errors\nimport numpy as np\nresult = 0 # Wrong approach"
|
|
442
|
+
|
|
824
443
|
correct_response = f"```python\n{correct}\n```"
|
|
825
444
|
incorrect_response = f"```python\n{incorrect}\n```"
|
|
826
445
|
|
|
827
446
|
metadata = {
|
|
828
447
|
"label": "scicode",
|
|
829
|
-
"source": "scicode",
|
|
448
|
+
"source": "scicode-bench/SciCode",
|
|
449
|
+
"problem_id": problem_id,
|
|
830
450
|
"domain": domain,
|
|
831
451
|
"is_scientific_computing_benchmark": True,
|
|
832
452
|
}
|
|
@@ -839,6 +459,6 @@ Provide a Python implementation that is:
|
|
|
839
459
|
)
|
|
840
460
|
|
|
841
461
|
except Exception as exc:
|
|
842
|
-
log.error(f"Error extracting pair
|
|
462
|
+
log.error(f"Error extracting SciCode pair: {exc}", exc_info=True)
|
|
843
463
|
return None
|
|
844
464
|
|