wisent 0.7.701__py3-none-any.whl → 0.7.901__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wisent/__init__.py +1 -1
- wisent/core/activations/activation_cache.py +393 -0
- wisent/core/activations/activations.py +3 -3
- wisent/core/activations/activations_collector.py +9 -5
- wisent/core/activations/classifier_inference_strategy.py +12 -11
- wisent/core/activations/extraction_strategy.py +256 -84
- wisent/core/classifiers/classifiers/core/atoms.py +3 -2
- wisent/core/cli/__init__.py +2 -1
- wisent/core/cli/agent/apply_steering.py +5 -7
- wisent/core/cli/agent/train_classifier.py +19 -7
- wisent/core/cli/check_linearity.py +35 -3
- wisent/core/cli/cluster_benchmarks.py +4 -6
- wisent/core/cli/create_steering_vector.py +6 -4
- wisent/core/cli/diagnose_vectors.py +7 -4
- wisent/core/cli/estimate_unified_goodness_time.py +6 -4
- wisent/core/cli/generate_pairs_from_task.py +9 -56
- wisent/core/cli/geometry_search.py +137 -0
- wisent/core/cli/get_activations.py +1 -1
- wisent/core/cli/method_optimizer.py +4 -3
- wisent/core/cli/modify_weights.py +3 -2
- wisent/core/cli/optimize_sample_size.py +1 -1
- wisent/core/cli/optimize_steering.py +14 -16
- wisent/core/cli/optimize_weights.py +2 -1
- wisent/core/cli/preview_pairs.py +203 -0
- wisent/core/cli/steering_method_trainer.py +3 -3
- wisent/core/cli/tasks.py +19 -76
- wisent/core/cli/train_unified_goodness.py +3 -3
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
- wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
- wisent/core/data_loaders/loaders/lm_loader.py +12 -1
- wisent/core/geometry_runner.py +995 -0
- wisent/core/geometry_search_space.py +237 -0
- wisent/core/hyperparameter_optimizer.py +1 -1
- wisent/core/main.py +3 -0
- wisent/core/models/core/atoms.py +5 -3
- wisent/core/models/wisent_model.py +1 -1
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
- wisent/core/parser_arguments/check_linearity_parser.py +12 -2
- wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +2 -2
- wisent/core/parser_arguments/geometry_search_parser.py +61 -0
- wisent/core/parser_arguments/main_parser.py +8 -0
- wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
- wisent/core/steering.py +5 -3
- wisent/core/steering_methods/methods/hyperplane.py +2 -1
- wisent/core/synthetic/generators/nonsense_generator.py +30 -18
- wisent/core/trainers/steering_trainer.py +2 -2
- wisent/core/utils/device.py +27 -27
- wisent/core/utils/layer_combinations.py +70 -0
- wisent/examples/__init__.py +1 -0
- wisent/examples/scripts/__init__.py +1 -0
- wisent/examples/scripts/count_all_benchmarks.py +121 -0
- wisent/examples/scripts/discover_directions.py +469 -0
- wisent/examples/scripts/extract_benchmark_info.py +71 -0
- wisent/examples/scripts/generate_paper_data.py +384 -0
- wisent/examples/scripts/intervention_validation.py +626 -0
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
- wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
- wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
- wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
- wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
- wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
- wisent/examples/scripts/search_all_short_names.py +31 -0
- wisent/examples/scripts/test_all_benchmarks.py +138 -0
- wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
- wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
- wisent/examples/scripts/test_nonsense_baseline.py +261 -0
- wisent/examples/scripts/test_one_benchmark.py +324 -0
- wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
- wisent/examples/scripts/threshold_analysis.py +434 -0
- wisent/examples/scripts/visualization_gallery.py +582 -0
- wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
- wisent/parameters/lm_eval/category_directions.json +137 -0
- wisent/parameters/lm_eval/repair_plan.json +282 -0
- wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
- wisent/parameters/lm_eval/working_benchmarks.json +206 -0
- wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
- wisent/tests/test_detector_accuracy.py +1 -1
- wisent/tests/visualize_geometry.py +1 -1
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/RECORD +329 -295
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import requests
|
|
3
4
|
from typing import Any
|
|
4
5
|
from wisent.core.cli_logger import setup_logger
|
|
5
6
|
|
|
@@ -10,45 +11,32 @@ __all__ = ["AiderPolyglotExtractor"]
|
|
|
10
11
|
|
|
11
12
|
log = setup_logger(__name__)
|
|
12
13
|
|
|
14
|
+
# GitHub API base URL for Aider Polyglot benchmark
|
|
15
|
+
AIDER_GITHUB_API = "https://api.github.com/repos/Aider-AI/polyglot-benchmark/contents"
|
|
16
|
+
|
|
13
17
|
# Languages supported by Aider Polyglot benchmark
|
|
14
|
-
AIDER_POLYGLOT_LANGUAGES = [
|
|
15
|
-
"python",
|
|
16
|
-
"javascript",
|
|
17
|
-
"java",
|
|
18
|
-
"cpp",
|
|
19
|
-
"go",
|
|
20
|
-
"rust",
|
|
21
|
-
]
|
|
18
|
+
AIDER_POLYGLOT_LANGUAGES = ["python", "javascript", "java", "cpp", "go", "rust"]
|
|
22
19
|
|
|
23
20
|
|
|
24
21
|
class AiderPolyglotExtractor(HuggingFaceBenchmarkExtractor):
|
|
25
22
|
"""
|
|
26
|
-
Extractor for Aider Polyglot
|
|
23
|
+
Extractor for Aider Polyglot benchmark.
|
|
27
24
|
|
|
25
|
+
GitHub: https://github.com/Aider-AI/polyglot-benchmark
|
|
26
|
+
|
|
28
27
|
Aider's polyglot benchmark tests LLMs on 225 challenging Exercism coding
|
|
29
|
-
exercises across C++, Go, Java, JavaScript, Python, and Rust.
|
|
30
|
-
uses the jinaai/code_exercises dataset which provides similar code exercise
|
|
31
|
-
problems in Python.
|
|
28
|
+
exercises across C++, Go, Java, JavaScript, Python, and Rust.
|
|
32
29
|
|
|
33
|
-
|
|
34
|
-
-
|
|
35
|
-
-
|
|
36
|
-
-
|
|
30
|
+
Structure per exercise:
|
|
31
|
+
- .docs/instructions.md - problem description
|
|
32
|
+
- .meta/example.py - reference solution
|
|
33
|
+
- {name}_test.py - test cases
|
|
37
34
|
|
|
38
35
|
For code editing:
|
|
39
|
-
- Positive (correct) = Working solution
|
|
36
|
+
- Positive (correct) = Working solution from .meta/example.py
|
|
40
37
|
- Negative (incorrect) = Buggy or incomplete solution
|
|
41
|
-
|
|
42
|
-
Schema (jinaai/code_exercises):
|
|
43
|
-
- problem: str (function signature with docstring)
|
|
44
|
-
- solution: str (complete solution implementation)
|
|
45
|
-
|
|
46
|
-
Note: The original Aider Polyglot benchmark is hosted on GitHub at
|
|
47
|
-
github.com/Aider-AI/polyglot-benchmark. This extractor uses HuggingFace
|
|
48
|
-
alternatives with similar structure.
|
|
49
38
|
"""
|
|
50
39
|
|
|
51
|
-
# Evaluator that should be used for this benchmark
|
|
52
40
|
evaluator_name = "code_editing"
|
|
53
41
|
|
|
54
42
|
def __init__(self, language: str = "python"):
|
|
@@ -56,9 +44,11 @@ class AiderPolyglotExtractor(HuggingFaceBenchmarkExtractor):
|
|
|
56
44
|
Initialize Aider Polyglot extractor.
|
|
57
45
|
|
|
58
46
|
Args:
|
|
59
|
-
language: Target programming language (
|
|
47
|
+
language: Target programming language (python, javascript, java, cpp, go, rust)
|
|
60
48
|
"""
|
|
61
49
|
super().__init__()
|
|
50
|
+
if language not in AIDER_POLYGLOT_LANGUAGES:
|
|
51
|
+
raise ValueError(f"Language must be one of {AIDER_POLYGLOT_LANGUAGES}")
|
|
62
52
|
self.language = language
|
|
63
53
|
|
|
64
54
|
def extract_contrastive_pairs(
|
|
@@ -66,48 +56,21 @@ class AiderPolyglotExtractor(HuggingFaceBenchmarkExtractor):
|
|
|
66
56
|
limit: int | None = None,
|
|
67
57
|
) -> list[ContrastivePair]:
|
|
68
58
|
"""
|
|
69
|
-
Build contrastive pairs from
|
|
70
|
-
|
|
71
|
-
For code editing:
|
|
72
|
-
- Positive (correct) = Working solution
|
|
73
|
-
- Negative (incorrect) = Buggy or incomplete solution
|
|
74
|
-
|
|
75
|
-
Args:
|
|
76
|
-
limit: Optional maximum number of pairs to produce.
|
|
77
|
-
|
|
78
|
-
Returns:
|
|
79
|
-
A list of ContrastivePair objects.
|
|
59
|
+
Build contrastive pairs from Aider Polyglot GitHub repository.
|
|
80
60
|
"""
|
|
81
61
|
max_items = self._normalize_limit(limit)
|
|
62
|
+
pairs: list[ContrastivePair] = []
|
|
82
63
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
limit=max_items,
|
|
89
|
-
)
|
|
90
|
-
dataset_source = "jinaai/code_exercises"
|
|
91
|
-
log.info(f"Loaded {len(docs)} examples from {dataset_source}")
|
|
92
|
-
except Exception as e:
|
|
93
|
-
log.warning(f"Failed to load jinaai/code_exercises: {e}")
|
|
94
|
-
# Try alternative dataset
|
|
95
|
-
try:
|
|
96
|
-
docs = self.load_dataset(
|
|
97
|
-
dataset_name="synapse-alpha/coding_exercises",
|
|
98
|
-
split="train",
|
|
99
|
-
limit=max_items,
|
|
100
|
-
)
|
|
101
|
-
dataset_source = "synapse-alpha/coding_exercises"
|
|
102
|
-
log.info(f"Loaded {len(docs)} examples from {dataset_source}")
|
|
103
|
-
except Exception as e2:
|
|
104
|
-
log.error(f"Failed to load any code exercises dataset: {e2}")
|
|
105
|
-
return []
|
|
64
|
+
exercises = self._load_exercises_from_github()
|
|
65
|
+
|
|
66
|
+
if not exercises:
|
|
67
|
+
log.error("Failed to load exercises from Aider Polyglot GitHub")
|
|
68
|
+
return []
|
|
106
69
|
|
|
107
|
-
|
|
70
|
+
log.info(f"Loaded {len(exercises)} exercises from Aider Polyglot GitHub")
|
|
108
71
|
|
|
109
|
-
for
|
|
110
|
-
pair = self.
|
|
72
|
+
for exercise in exercises:
|
|
73
|
+
pair = self._extract_pair_from_exercise(exercise)
|
|
111
74
|
if pair is not None:
|
|
112
75
|
pairs.append(pair)
|
|
113
76
|
if max_items is not None and len(pairs) >= max_items:
|
|
@@ -118,36 +81,94 @@ class AiderPolyglotExtractor(HuggingFaceBenchmarkExtractor):
|
|
|
118
81
|
|
|
119
82
|
return pairs
|
|
120
83
|
|
|
121
|
-
def
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
84
|
+
def _load_exercises_from_github(self) -> list[dict[str, Any]]:
|
|
85
|
+
"""Load exercises from Aider Polyglot GitHub repository."""
|
|
86
|
+
try:
|
|
87
|
+
# Get list of exercises
|
|
88
|
+
exercises_url = f"{AIDER_GITHUB_API}/{self.language}/exercises/practice"
|
|
89
|
+
response = requests.get(exercises_url, timeout=30)
|
|
90
|
+
response.raise_for_status()
|
|
91
|
+
|
|
92
|
+
exercise_dirs = response.json()
|
|
93
|
+
exercises = []
|
|
94
|
+
|
|
95
|
+
for exercise_dir in exercise_dirs:
|
|
96
|
+
if exercise_dir.get("type") != "dir":
|
|
97
|
+
continue
|
|
98
|
+
|
|
99
|
+
exercise_name = exercise_dir.get("name", "")
|
|
100
|
+
exercise_path = exercise_dir.get("path", "")
|
|
101
|
+
|
|
102
|
+
# Load instructions and solution
|
|
103
|
+
exercise_data = self._load_exercise_data(exercise_name, exercise_path)
|
|
104
|
+
if exercise_data:
|
|
105
|
+
exercises.append(exercise_data)
|
|
106
|
+
|
|
107
|
+
return exercises
|
|
108
|
+
|
|
109
|
+
except Exception as e:
|
|
110
|
+
log.error(f"Failed to load exercises from GitHub: {e}")
|
|
111
|
+
return []
|
|
128
112
|
|
|
129
|
-
|
|
130
|
-
"""
|
|
113
|
+
def _load_exercise_data(self, name: str, path: str) -> dict[str, Any] | None:
|
|
114
|
+
"""Load a single exercise's instructions and solution."""
|
|
131
115
|
try:
|
|
132
|
-
|
|
133
|
-
|
|
116
|
+
base_url = "https://raw.githubusercontent.com/Aider-AI/polyglot-benchmark/main"
|
|
117
|
+
|
|
118
|
+
# Load instructions
|
|
119
|
+
instructions_url = f"{base_url}/{path}/.docs/instructions.md"
|
|
120
|
+
instructions_resp = requests.get(instructions_url, timeout=15)
|
|
121
|
+
if instructions_resp.status_code != 200:
|
|
122
|
+
return None
|
|
123
|
+
instructions = instructions_resp.text
|
|
124
|
+
|
|
125
|
+
# Load solution - file extension depends on language
|
|
126
|
+
ext_map = {
|
|
127
|
+
"python": "py", "javascript": "js", "java": "java",
|
|
128
|
+
"cpp": "cpp", "go": "go", "rust": "rs"
|
|
129
|
+
}
|
|
130
|
+
ext = ext_map.get(self.language, "py")
|
|
131
|
+
|
|
132
|
+
solution_url = f"{base_url}/{path}/.meta/example.{ext}"
|
|
133
|
+
solution_resp = requests.get(solution_url, timeout=15)
|
|
134
|
+
if solution_resp.status_code != 200:
|
|
135
|
+
return None
|
|
136
|
+
solution = solution_resp.text
|
|
137
|
+
|
|
138
|
+
return {
|
|
139
|
+
"name": name,
|
|
140
|
+
"instructions": instructions,
|
|
141
|
+
"solution": solution,
|
|
142
|
+
"path": path,
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
except Exception as e:
|
|
146
|
+
log.debug(f"Failed to load exercise {name}: {e}")
|
|
147
|
+
return None
|
|
134
148
|
|
|
135
|
-
|
|
136
|
-
|
|
149
|
+
def _extract_pair_from_exercise(self, exercise: dict[str, Any]) -> ContrastivePair | None:
|
|
150
|
+
"""Convert an exercise into a ContrastivePair."""
|
|
151
|
+
try:
|
|
152
|
+
name = exercise.get("name", "")
|
|
153
|
+
instructions = exercise.get("instructions", "").strip()
|
|
154
|
+
solution = exercise.get("solution", "").strip()
|
|
155
|
+
|
|
156
|
+
if not instructions or not solution:
|
|
137
157
|
return None
|
|
138
158
|
|
|
139
|
-
|
|
140
|
-
prompt = self._build_prompt(problem)
|
|
159
|
+
prompt = f"""Coding Exercise: {name.replace('-', ' ').title()}
|
|
141
160
|
|
|
142
|
-
|
|
143
|
-
|
|
161
|
+
{instructions}
|
|
162
|
+
|
|
163
|
+
Please provide the complete implementation."""
|
|
144
164
|
|
|
145
|
-
|
|
146
|
-
incorrect_response = self._create_incorrect_response(
|
|
165
|
+
correct_response = f"```{self.language}\n{solution}\n```"
|
|
166
|
+
incorrect_response = self._create_incorrect_response(solution)
|
|
147
167
|
|
|
148
168
|
metadata = {
|
|
149
169
|
"label": "aider_polyglot",
|
|
150
|
-
"source":
|
|
170
|
+
"source": "Aider-AI/polyglot-benchmark",
|
|
171
|
+
"exercise_name": name,
|
|
151
172
|
"language": self.language,
|
|
152
173
|
"is_code_benchmark": True,
|
|
153
174
|
"is_code_editing_benchmark": True,
|
|
@@ -161,65 +182,21 @@ class AiderPolyglotExtractor(HuggingFaceBenchmarkExtractor):
|
|
|
161
182
|
)
|
|
162
183
|
|
|
163
184
|
except Exception as exc:
|
|
164
|
-
log.error(f"Error extracting pair
|
|
185
|
+
log.error(f"Error extracting pair: {exc}", exc_info=True)
|
|
165
186
|
return None
|
|
166
187
|
|
|
167
|
-
def
|
|
168
|
-
"""Build the code editing prompt."""
|
|
169
|
-
return f"""Complete the following Python function based on its docstring.
|
|
170
|
-
|
|
171
|
-
{problem}
|
|
172
|
-
|
|
173
|
-
Please provide the complete implementation."""
|
|
174
|
-
|
|
175
|
-
def _create_correct_response(self, solution: str) -> str:
|
|
176
|
-
"""Create the correct response with working solution."""
|
|
177
|
-
return f"""Here is the complete implementation:
|
|
178
|
-
|
|
179
|
-
```python
|
|
180
|
-
{solution}
|
|
181
|
-
```
|
|
182
|
-
|
|
183
|
-
This solution correctly implements the function according to the docstring specification."""
|
|
184
|
-
|
|
185
|
-
def _create_incorrect_response(self, problem: str, solution: str) -> str:
|
|
188
|
+
def _create_incorrect_response(self, solution: str) -> str:
|
|
186
189
|
"""Create an incorrect response with common bugs."""
|
|
187
|
-
# Extract function name from problem if possible
|
|
188
|
-
func_name = "the function"
|
|
189
|
-
if "def " in problem:
|
|
190
|
-
try:
|
|
191
|
-
func_part = problem.split("def ")[1]
|
|
192
|
-
func_name = func_part.split("(")[0]
|
|
193
|
-
except (IndexError, AttributeError):
|
|
194
|
-
pass
|
|
195
|
-
|
|
196
|
-
# Create a buggy version by introducing common errors
|
|
197
|
-
buggy_solution = self._introduce_bugs(solution)
|
|
198
|
-
|
|
199
|
-
return f"""Here is my implementation:
|
|
200
|
-
|
|
201
|
-
```python
|
|
202
|
-
{buggy_solution}
|
|
203
|
-
```
|
|
204
|
-
|
|
205
|
-
Note: This implementation may have issues:
|
|
206
|
-
- Missing edge case handling
|
|
207
|
-
- Potential off-by-one errors
|
|
208
|
-
- Incomplete logic"""
|
|
209
|
-
|
|
210
|
-
def _introduce_bugs(self, solution: str) -> str:
|
|
211
|
-
"""Introduce common bugs into a solution."""
|
|
212
190
|
lines = solution.split("\n")
|
|
213
191
|
|
|
214
192
|
if len(lines) > 3:
|
|
215
|
-
# Remove a line to create incomplete logic
|
|
216
193
|
middle_idx = len(lines) // 2
|
|
217
|
-
buggy_lines = lines[:middle_idx] + [" pass # TODO:
|
|
218
|
-
|
|
194
|
+
buggy_lines = lines[:middle_idx] + [" pass # TODO: incomplete"] + lines[middle_idx+2:]
|
|
195
|
+
buggy = "\n".join(buggy_lines)
|
|
219
196
|
elif lines:
|
|
220
|
-
|
|
221
|
-
first_line = lines[0] if lines else "def func():"
|
|
222
|
-
return f"{first_line}\n pass # Implementation incomplete"
|
|
197
|
+
buggy = f"{lines[0]}\n pass # Implementation incomplete"
|
|
223
198
|
else:
|
|
224
|
-
|
|
199
|
+
buggy = "pass # No implementation"
|
|
200
|
+
|
|
201
|
+
return f"```{self.language}\n{buggy}\n```"
|
|
225
202
|
|
|
@@ -95,18 +95,8 @@ class CodeforcesExtractor(HuggingFaceBenchmarkExtractor):
|
|
|
95
95
|
)
|
|
96
96
|
log.info(f"Loaded {len(docs)} problems from Codeforces ({self.config})")
|
|
97
97
|
except Exception as e:
|
|
98
|
-
log.
|
|
99
|
-
|
|
100
|
-
try:
|
|
101
|
-
docs = self.load_dataset(
|
|
102
|
-
dataset_name="deepmind/code_contests",
|
|
103
|
-
split="train",
|
|
104
|
-
limit=max_items * 2 if max_items else None,
|
|
105
|
-
)
|
|
106
|
-
log.info(f"Loaded {len(docs)} problems from deepmind/code_contests")
|
|
107
|
-
except Exception as e2:
|
|
108
|
-
log.error(f"Failed to load any Codeforces dataset: {e2}")
|
|
109
|
-
return []
|
|
98
|
+
log.error(f"Failed to load open-r1/codeforces: {e}")
|
|
99
|
+
return []
|
|
110
100
|
|
|
111
101
|
pairs: list[ContrastivePair] = []
|
|
112
102
|
|