wisent 0.7.701__py3-none-any.whl → 0.7.901__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wisent/__init__.py +1 -1
- wisent/core/activations/activation_cache.py +393 -0
- wisent/core/activations/activations.py +3 -3
- wisent/core/activations/activations_collector.py +9 -5
- wisent/core/activations/classifier_inference_strategy.py +12 -11
- wisent/core/activations/extraction_strategy.py +256 -84
- wisent/core/classifiers/classifiers/core/atoms.py +3 -2
- wisent/core/cli/__init__.py +2 -1
- wisent/core/cli/agent/apply_steering.py +5 -7
- wisent/core/cli/agent/train_classifier.py +19 -7
- wisent/core/cli/check_linearity.py +35 -3
- wisent/core/cli/cluster_benchmarks.py +4 -6
- wisent/core/cli/create_steering_vector.py +6 -4
- wisent/core/cli/diagnose_vectors.py +7 -4
- wisent/core/cli/estimate_unified_goodness_time.py +6 -4
- wisent/core/cli/generate_pairs_from_task.py +9 -56
- wisent/core/cli/geometry_search.py +137 -0
- wisent/core/cli/get_activations.py +1 -1
- wisent/core/cli/method_optimizer.py +4 -3
- wisent/core/cli/modify_weights.py +3 -2
- wisent/core/cli/optimize_sample_size.py +1 -1
- wisent/core/cli/optimize_steering.py +14 -16
- wisent/core/cli/optimize_weights.py +2 -1
- wisent/core/cli/preview_pairs.py +203 -0
- wisent/core/cli/steering_method_trainer.py +3 -3
- wisent/core/cli/tasks.py +19 -76
- wisent/core/cli/train_unified_goodness.py +3 -3
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
- wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
- wisent/core/data_loaders/loaders/lm_loader.py +12 -1
- wisent/core/geometry_runner.py +995 -0
- wisent/core/geometry_search_space.py +237 -0
- wisent/core/hyperparameter_optimizer.py +1 -1
- wisent/core/main.py +3 -0
- wisent/core/models/core/atoms.py +5 -3
- wisent/core/models/wisent_model.py +1 -1
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
- wisent/core/parser_arguments/check_linearity_parser.py +12 -2
- wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +2 -2
- wisent/core/parser_arguments/geometry_search_parser.py +61 -0
- wisent/core/parser_arguments/main_parser.py +8 -0
- wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
- wisent/core/steering.py +5 -3
- wisent/core/steering_methods/methods/hyperplane.py +2 -1
- wisent/core/synthetic/generators/nonsense_generator.py +30 -18
- wisent/core/trainers/steering_trainer.py +2 -2
- wisent/core/utils/device.py +27 -27
- wisent/core/utils/layer_combinations.py +70 -0
- wisent/examples/__init__.py +1 -0
- wisent/examples/scripts/__init__.py +1 -0
- wisent/examples/scripts/count_all_benchmarks.py +121 -0
- wisent/examples/scripts/discover_directions.py +469 -0
- wisent/examples/scripts/extract_benchmark_info.py +71 -0
- wisent/examples/scripts/generate_paper_data.py +384 -0
- wisent/examples/scripts/intervention_validation.py +626 -0
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
- wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
- wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
- wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
- wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
- wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
- wisent/examples/scripts/search_all_short_names.py +31 -0
- wisent/examples/scripts/test_all_benchmarks.py +138 -0
- wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
- wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
- wisent/examples/scripts/test_nonsense_baseline.py +261 -0
- wisent/examples/scripts/test_one_benchmark.py +324 -0
- wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
- wisent/examples/scripts/threshold_analysis.py +434 -0
- wisent/examples/scripts/visualization_gallery.py +582 -0
- wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
- wisent/parameters/lm_eval/category_directions.json +137 -0
- wisent/parameters/lm_eval/repair_plan.json +282 -0
- wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
- wisent/parameters/lm_eval/working_benchmarks.json +206 -0
- wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
- wisent/tests/test_detector_accuracy.py +1 -1
- wisent/tests/visualize_geometry.py +1 -1
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/RECORD +329 -295
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
wisent/core/utils/device.py
CHANGED
|
@@ -95,23 +95,19 @@ def preferred_dtype(kind: DeviceKind | None = None) -> torch.dtype:
|
|
|
95
95
|
"""
|
|
96
96
|
Return the preferred dtype for model loading.
|
|
97
97
|
|
|
98
|
-
Default is
|
|
99
|
-
vectors trained on one device work identically on another.
|
|
98
|
+
Default is device-optimized dtype (bfloat16 on CUDA, float16 on MPS, float32 on CPU).
|
|
100
99
|
|
|
101
100
|
Priority:
|
|
102
101
|
1. Global override set via set_default_dtype()
|
|
103
102
|
2. WISENT_DTYPE environment variable ("float32", "float16", "bfloat16", "auto")
|
|
104
|
-
3. Default:
|
|
103
|
+
3. Default: device-optimized (bfloat16 on CUDA, float16 on MPS, float32 on CPU)
|
|
105
104
|
|
|
106
|
-
To use device-optimized dtypes for better performance (at cost of cross-device
|
|
107
|
-
consistency), set WISENT_DTYPE=auto or call set_default_dtype("auto").
|
|
108
|
-
|
|
109
105
|
Example:
|
|
110
|
-
>>> preferred_dtype() #
|
|
111
|
-
torch.float32
|
|
112
|
-
>>> set_default_dtype("auto") # Use device-optimized dtypes
|
|
113
|
-
>>> preferred_dtype() # Now bfloat16 on CUDA, float16 on MPS
|
|
106
|
+
>>> preferred_dtype() # bfloat16 on CUDA, float16 on MPS, float32 on CPU
|
|
114
107
|
torch.bfloat16
|
|
108
|
+
>>> set_default_dtype("float32") # Force float32 everywhere
|
|
109
|
+
>>> preferred_dtype()
|
|
110
|
+
torch.float32
|
|
115
111
|
"""
|
|
116
112
|
# Check global override first
|
|
117
113
|
if _global_dtype_override is not None:
|
|
@@ -126,8 +122,8 @@ def preferred_dtype(kind: DeviceKind | None = None) -> torch.dtype:
|
|
|
126
122
|
return device_optimized_dtype(kind)
|
|
127
123
|
return env_dtype
|
|
128
124
|
|
|
129
|
-
# Default:
|
|
130
|
-
return
|
|
125
|
+
# Default: use device-optimized dtype for best performance
|
|
126
|
+
return device_optimized_dtype(kind)
|
|
131
127
|
|
|
132
128
|
|
|
133
129
|
def device_optimized_dtype(kind: DeviceKind | None = None) -> torch.dtype:
|
|
@@ -159,8 +155,14 @@ def device_optimized_dtype(kind: DeviceKind | None = None) -> torch.dtype:
|
|
|
159
155
|
# Steering Vector dtype utilities
|
|
160
156
|
# ============================================================================
|
|
161
157
|
|
|
162
|
-
|
|
163
|
-
|
|
158
|
+
|
|
159
|
+
def steering_vector_dtype() -> torch.dtype:
    """
    Return the dtype used for steering vectors.

    This is a thin wrapper: it calls preferred_dtype() with no arguments
    and hands back whatever dtype that call resolves to.
    """
    resolved = preferred_dtype()
    return resolved
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
# Legacy constant for backward compatibility - use steering_vector_dtype() instead
|
|
165
|
+
STEERING_VECTOR_DTYPE = torch.float32 # Deprecated: kept for backward compat only
|
|
164
166
|
|
|
165
167
|
|
|
166
168
|
def save_steering_vector(
|
|
@@ -172,10 +174,7 @@ def save_steering_vector(
|
|
|
172
174
|
metadata: dict | None = None,
|
|
173
175
|
) -> None:
|
|
174
176
|
"""
|
|
175
|
-
Save a steering vector with dtype metadata
|
|
176
|
-
|
|
177
|
-
Vectors are always stored in float32 for consistency across devices,
|
|
178
|
-
but the original dtype is preserved in metadata for reference.
|
|
177
|
+
Save a steering vector with dtype metadata.
|
|
179
178
|
|
|
180
179
|
Args:
|
|
181
180
|
path: File path to save to (.pt)
|
|
@@ -187,21 +186,22 @@ def save_steering_vector(
|
|
|
187
186
|
"""
|
|
188
187
|
# Store original dtype before conversion
|
|
189
188
|
original_dtype = vector.dtype
|
|
189
|
+
storage_dtype = steering_vector_dtype()
|
|
190
190
|
|
|
191
|
-
#
|
|
192
|
-
|
|
191
|
+
# Store in preferred dtype
|
|
192
|
+
vector_stored = vector.to(dtype=storage_dtype, device="cpu")
|
|
193
193
|
|
|
194
194
|
save_data = {
|
|
195
195
|
# Primary data
|
|
196
|
-
"steering_vector":
|
|
196
|
+
"steering_vector": vector_stored,
|
|
197
197
|
"layer": layer,
|
|
198
198
|
"model": model_name,
|
|
199
199
|
"method": method,
|
|
200
200
|
# Dtype metadata
|
|
201
201
|
"original_dtype": str(original_dtype),
|
|
202
|
-
"storage_dtype": str(
|
|
202
|
+
"storage_dtype": str(storage_dtype),
|
|
203
203
|
# Legacy keys for backward compatibility
|
|
204
|
-
"vector":
|
|
204
|
+
"vector": vector_stored,
|
|
205
205
|
"layer_index": layer,
|
|
206
206
|
}
|
|
207
207
|
|
|
@@ -233,7 +233,8 @@ def load_steering_vector(
|
|
|
233
233
|
- "original_dtype": Original dtype when saved
|
|
234
234
|
- "metadata": Any additional metadata
|
|
235
235
|
"""
|
|
236
|
-
|
|
236
|
+
target_device = device or resolve_default_device()
|
|
237
|
+
data = torch.load(path, map_location=target_device, weights_only=False)
|
|
237
238
|
|
|
238
239
|
# Get the vector (support both old and new key names)
|
|
239
240
|
vector = data.get("steering_vector") or data.get("vector")
|
|
@@ -241,10 +242,9 @@ def load_steering_vector(
|
|
|
241
242
|
raise FileLoadError(file_path=str(path), reason="No steering vector found")
|
|
242
243
|
|
|
243
244
|
# Determine target dtype
|
|
244
|
-
target_dtype = dtype or preferred_dtype(
|
|
245
|
-
target_device = device or resolve_default_device()
|
|
245
|
+
target_dtype = dtype or preferred_dtype(target_device)
|
|
246
246
|
|
|
247
|
-
# Convert to target dtype/device
|
|
247
|
+
# Convert to target dtype/device if needed
|
|
248
248
|
vector = vector.to(dtype=target_dtype, device=target_device)
|
|
249
249
|
|
|
250
250
|
return {
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Generate all layer combinations for hyperparameter search."""
|
|
2
|
+
|
|
3
|
+
from itertools import combinations
|
|
4
|
+
from math import comb
|
|
5
|
+
from typing import List
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_layer_combinations(num_layers: int, max_combo_size: int, single_and_all_only: bool = True) -> List[List[int]]:
    """
    Build the list of layer-index combinations to try in a hyperparameter search.

    Args:
        num_layers: Total number of layers in the model.
        max_combo_size: Largest number of layers allowed in one multi-layer
            combination. Only consulted when single_and_all_only is False.
        single_and_all_only: When True (the default), return just the
            "all layers" entry followed by every single-layer entry and skip
            the 2-, 3-, ..., max_combo_size-layer combinations.

    Returns:
        A list of combinations in this order:
        - all layers together: [0, 1, ..., num_layers-1]
        - each individual layer: [0], [1], ..., [num_layers-1]
        - (only if not single_and_all_only) every combination of
          2, 3, ..., max_combo_size layers, grouped by size
    """
    every_layer = list(range(num_layers))

    # The "all layers" entry always comes first, followed by the singletons.
    combos = [every_layer]
    combos.extend([index] for index in every_layer)

    if single_and_all_only:
        return combos

    # Multi-layer subsets, smallest size first, in itertools' lexicographic order.
    combos.extend(
        list(subset)
        for size in range(2, max_combo_size + 1)
        for subset in combinations(every_layer, size)
    )
    return combos
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def get_layer_combinations_count(num_layers: int, max_combo_size: int, single_and_all_only: bool = False) -> int:
    """
    Calculate total number of layer combinations without generating them.

    The result mirrors what get_layer_combinations returns for the same
    arguments:

        Total = 1 (all layers) + C(n,1) + C(n,2) + ... + C(n, max_combo_size)

    Args:
        num_layers: Total number of layers in the model (n above).
        max_combo_size: Largest combination size that is counted. Ignored
            when single_and_all_only is True.
        single_and_all_only: If True, count only the "all layers" entry plus
            the individual layers (1 + n). Defaults to False so that existing
            two-argument callers keep getting the full-enumeration count.

    Returns:
        The number of combinations the generator would produce.
    """
    # The "all layers" entry and one entry per individual layer are emitted by
    # the generator regardless of max_combo_size, so they are always counted.
    total = 1 + comb(num_layers, 1)

    if not single_and_all_only:
        # Multi-layer combinations of size 2..max_combo_size.
        total += sum(comb(num_layers, r) for r in range(2, max_combo_size + 1))

    return total
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
if __name__ == "__main__":
    # Smoke test with 16 layers (like Llama-3.2-1B) and max_combo_size=3.
    num_layers = 16
    max_combo_size = 3
    # Ask for the full enumeration explicitly: the generator's default
    # (single_and_all_only=True) yields only 1 + num_layers entries, which
    # would never match the "Expected" figure printed below.
    combos = get_layer_combinations(num_layers, max_combo_size, single_and_all_only=False)

    print(f"Model with {num_layers} layers, max_combo_size={max_combo_size}:")
    print(f"Total combinations: {len(combos)}")
    print(f"Expected: {get_layer_combinations_count(num_layers, max_combo_size)}")
    print()

    print("First 20 combinations:")
    for i, combo in enumerate(combos[:20]):
        print(f"  {i+1}: {combo}")
    if len(combos) > 20:
        print(f"  ... and {len(combos) - 20} more")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Wisent examples
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Scripts for wisent examples
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Count maximum contrastive pairs for all benchmarks."""
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
# Environment flags set before the wisent loaders are imported.
# NOTE(review): presumably these opt in to Hugging Face remote dataset code
# and code-evaluation metrics — confirm against the HF documentation.
os.environ.update({
    'HF_DATASETS_TRUST_REMOTE_CODE': '1',
    'HF_ALLOW_CODE_EVAL': '1',
})

# Put the directory four levels above this script at the front of sys.path so
# the `wisent` imports below resolve from there.
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
|
|
15
|
+
|
|
16
|
+
from wisent.core.data_loaders.loaders.lm_loader import LMEvalDataLoader
|
|
17
|
+
from wisent.core.data_loaders.loaders.huggingface_loader import HuggingFaceDataLoader
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def load_benchmarks():
    """Return whatever the central benchmark registry reports as all benchmarks."""
    # Imported inside the function rather than at module level.
    from wisent.core.benchmark_registry import get_all_benchmarks

    registered = get_all_benchmarks()
    return registered
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def count_pairs(task_name: str) -> int:
    """Count maximum contrastive pairs for a task.

    The task is loaded without any limit through either the HuggingFace or
    the lm-eval data loader, and the sizes of the resulting train and test
    pair sets are added together.

    Args:
        task_name: Benchmark/task name; matched case-insensitively against
            the routing tables below.

    Returns:
        Total number of train + test pairs, or -1 if anything went wrong
        (the error is reported on stderr).
    """
    # Name prefixes that are routed to the HuggingFace loader.
    hf_prefixes = (
        "math", "math_500", "aime", "hmmt", "polymath", "livemathbench",
        "humaneval", "humaneval_plus", "mbpp", "mbpp_plus",
        "instruct_humaneval", "apps", "conala", "concode",
        "ds", "ds1000", "ds_1000", "mercury", "recode",
        "multipl", "multiple_", "multipl_e",
        "codexglue", "livecodebench",
        "super_gpqa", "supergpqa", "hle",
        "tag",
        "meddialog",
        "mmlusr",
    )

    # Tasks that must use lm-eval even though an HF prefix (e.g. "math")
    # would otherwise match; matched exactly or as "<name>_...".
    lm_eval_names = (
        "minerva_math", "code_x_glue", "humaneval_infilling", "mathqa",
    )

    try:
        lowered = task_name.lower()
        forced_lm_eval = any(
            lowered == name or lowered.startswith(name + "_")
            for name in lm_eval_names
        )

        # lm-eval is both the forced choice and the fallback; only a
        # non-forced task with a known HF prefix goes to HuggingFace.
        if not forced_lm_eval and lowered.startswith(hf_prefixes):
            loader = HuggingFaceDataLoader()
        else:
            loader = LMEvalDataLoader()

        # All limits are None so the full dataset is counted.
        loaded = loader._load_one_task(
            task_name=task_name,
            split_ratio=0.8,
            seed=42,
            limit=None,
            training_limit=None,
            testing_limit=None,
        )

        total_pairs = 0
        for split_key in ('train_qa_pairs', 'test_qa_pairs'):
            pair_set = loaded.get(split_key)
            # A missing or falsy split contributes nothing.
            if pair_set:
                total_pairs += len(pair_set.pairs)
        return total_pairs

    except Exception as e:
        print(f"Error counting pairs for {task_name}: {e}", file=sys.stderr)
        return -1
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def main():
    """Count pairs for every registered benchmark, save them as JSON, and print a summary.

    Results are written to ``benchmark_pair_counts.json`` next to this script;
    benchmarks whose count failed are stored with a value of -1.
    """
    benchmarks = load_benchmarks()
    benchmark_total = len(benchmarks)

    print(f"Counting maximum contrastive pairs for {benchmark_total} benchmarks...\n")

    results = {}
    for position, name in enumerate(benchmarks, 1):
        # Progress prefix is flushed so it is visible while the count runs.
        print(f"[{position}/{benchmark_total}] Counting {name}...", end=" ", flush=True)

        pair_count = count_pairs(name)
        results[name] = pair_count

        print("ERROR" if pair_count < 0 else f"{pair_count:,} pairs")

    # Save results
    output_file = Path(__file__).parent / "benchmark_pair_counts.json"
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2, sort_keys=True)

    print(f"\nResults saved to {output_file}")

    # Print summary
    banner = "=" * 70
    print("\n" + banner)
    print("SUMMARY")
    print(banner)

    # Negative counts mark failures; everything else is a real count.
    successful = {}
    failed = {}
    for name, pair_count in results.items():
        if pair_count >= 0:
            successful[name] = pair_count
        else:
            failed[name] = pair_count

    print(f"Successfully counted: {len(successful)}/{benchmark_total}")
    print(f"Failed: {len(failed)}/{benchmark_total}")

    if not successful:
        return

    total = sum(successful.values())
    avg = total / len(successful)
    largest = max(successful, key=successful.get)
    smallest = min(successful, key=successful.get)

    print(f"\nTotal pairs across all benchmarks: {total:,}")
    print(f"Average pairs per benchmark: {avg:,.0f}")
    print(f"Max pairs: {successful[largest]:,} ({largest})")
    print(f"Min pairs: {successful[smallest]:,} ({smallest})")


if __name__ == "__main__":
    main()
|