wisent 0.7.701__py3-none-any.whl → 0.7.1045__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wisent/__init__.py +1 -1
- wisent/comparison/__init__.py +1 -0
- wisent/comparison/detect_bos_features.py +275 -0
- wisent/comparison/fgaa.py +465 -0
- wisent/comparison/lora.py +669 -0
- wisent/comparison/lora_dpo.py +592 -0
- wisent/comparison/main.py +444 -0
- wisent/comparison/ours.py +76 -0
- wisent/comparison/sae.py +304 -0
- wisent/comparison/utils.py +381 -0
- wisent/core/activations/activation_cache.py +393 -0
- wisent/core/activations/activations.py +3 -3
- wisent/core/activations/activations_collector.py +12 -7
- wisent/core/activations/classifier_inference_strategy.py +12 -11
- wisent/core/activations/extraction_strategy.py +260 -84
- wisent/core/classifiers/classifiers/core/atoms.py +3 -2
- wisent/core/cli/__init__.py +2 -1
- wisent/core/cli/agent/train_classifier.py +16 -3
- wisent/core/cli/check_linearity.py +35 -3
- wisent/core/cli/cluster_benchmarks.py +4 -6
- wisent/core/cli/create_steering_vector.py +6 -4
- wisent/core/cli/diagnose_vectors.py +7 -4
- wisent/core/cli/estimate_unified_goodness_time.py +6 -4
- wisent/core/cli/generate_pairs_from_task.py +9 -56
- wisent/core/cli/generate_vector_from_task.py +11 -20
- wisent/core/cli/geometry_search.py +137 -0
- wisent/core/cli/get_activations.py +2 -2
- wisent/core/cli/method_optimizer.py +4 -3
- wisent/core/cli/modify_weights.py +3 -2
- wisent/core/cli/optimize_sample_size.py +1 -1
- wisent/core/cli/optimize_steering.py +14 -16
- wisent/core/cli/optimize_weights.py +2 -1
- wisent/core/cli/preview_pairs.py +203 -0
- wisent/core/cli/steering_method_trainer.py +3 -3
- wisent/core/cli/tasks.py +19 -76
- wisent/core/cli/train_unified_goodness.py +3 -3
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
- wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +22 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +10 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +9 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
- wisent/core/data_loaders/loaders/lm_loader.py +12 -1
- wisent/core/geometry_runner.py +995 -0
- wisent/core/geometry_search_space.py +237 -0
- wisent/core/hyperparameter_optimizer.py +1 -1
- wisent/core/main.py +3 -0
- wisent/core/models/core/atoms.py +5 -3
- wisent/core/models/wisent_model.py +1 -1
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
- wisent/core/parser_arguments/check_linearity_parser.py +12 -2
- wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +6 -13
- wisent/core/parser_arguments/geometry_search_parser.py +61 -0
- wisent/core/parser_arguments/get_activations_parser.py +5 -14
- wisent/core/parser_arguments/main_parser.py +8 -0
- wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
- wisent/core/steering.py +5 -3
- wisent/core/steering_methods/methods/hyperplane.py +2 -1
- wisent/core/synthetic/generators/nonsense_generator.py +30 -18
- wisent/core/trainers/steering_trainer.py +2 -2
- wisent/core/utils/device.py +27 -27
- wisent/core/utils/layer_combinations.py +70 -0
- wisent/examples/__init__.py +1 -0
- wisent/examples/scripts/__init__.py +1 -0
- wisent/examples/scripts/count_all_benchmarks.py +121 -0
- wisent/examples/scripts/discover_directions.py +469 -0
- wisent/examples/scripts/extract_benchmark_info.py +71 -0
- wisent/examples/scripts/search_all_short_names.py +31 -0
- wisent/examples/scripts/test_all_benchmarks.py +138 -0
- wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
- wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
- wisent/examples/scripts/test_nonsense_baseline.py +261 -0
- wisent/examples/scripts/test_one_benchmark.py +324 -0
- wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
- wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
- wisent/parameters/lm_eval/category_directions.json +137 -0
- wisent/parameters/lm_eval/repair_plan.json +282 -0
- wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
- wisent/parameters/lm_eval/working_benchmarks.json +206 -0
- wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
- wisent/tests/test_detector_accuracy.py +1 -1
- wisent/tests/visualize_geometry.py +1 -1
- {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/METADATA +5 -1
- {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/RECORD +328 -358
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
- wisent/examples/contrastive_pairs/humanization_human_vs_ai.json +0 -2112
- wisent/examples/scripts/1/test_basqueglue_evaluation.json +0 -51
- wisent/examples/scripts/1/test_basqueglue_pairs.json +0 -14
- wisent/examples/scripts/1/test_bec2016eu_evaluation.json +0 -51
- wisent/examples/scripts/1/test_bec2016eu_pairs.json +0 -14
- wisent/examples/scripts/1/test_belebele_evaluation.json +0 -51
- wisent/examples/scripts/1/test_belebele_pairs.json +0 -14
- wisent/examples/scripts/1/test_benchmarks_evaluation.json +0 -51
- wisent/examples/scripts/1/test_benchmarks_pairs.json +0 -14
- wisent/examples/scripts/1/test_bertaqa_evaluation.json +0 -51
- wisent/examples/scripts/1/test_bertaqa_pairs.json +0 -14
- wisent/examples/scripts/1/test_bhtc_v2_evaluation.json +0 -30
- wisent/examples/scripts/1/test_bhtc_v2_pairs.json +0 -8
- wisent/examples/scripts/1/test_boolq-seq2seq_evaluation.json +0 -30
- wisent/examples/scripts/1/test_boolq-seq2seq_pairs.json +0 -8
- wisent/examples/scripts/1/test_cabreu_evaluation.json +0 -30
- wisent/examples/scripts/1/test_cabreu_pairs.json +0 -8
- wisent/examples/scripts/1/test_careqa_en_evaluation.json +0 -30
- wisent/examples/scripts/1/test_careqa_en_pairs.json +0 -8
- wisent/examples/scripts/1/test_careqa_evaluation.json +0 -30
- wisent/examples/scripts/1/test_careqa_pairs.json +0 -8
- wisent/examples/scripts/1/test_catalanqa_evaluation.json +0 -30
- wisent/examples/scripts/1/test_catalanqa_pairs.json +0 -8
- wisent/examples/scripts/1/test_catcola_evaluation.json +0 -30
- wisent/examples/scripts/1/test_catcola_pairs.json +0 -8
- wisent/examples/scripts/1/test_chartqa_evaluation.json +0 -30
- wisent/examples/scripts/1/test_chartqa_pairs.json +0 -8
- wisent/examples/scripts/1/test_claim_stance_topic_evaluation.json +0 -30
- wisent/examples/scripts/1/test_claim_stance_topic_pairs.json +0 -8
- wisent/examples/scripts/1/test_cnn_dailymail_evaluation.json +0 -30
- wisent/examples/scripts/1/test_cnn_dailymail_pairs.json +0 -8
- wisent/examples/scripts/1/test_cocoteros_es_evaluation.json +0 -30
- wisent/examples/scripts/1/test_cocoteros_es_pairs.json +0 -8
- wisent/examples/scripts/1/test_coedit_gec_evaluation.json +0 -30
- wisent/examples/scripts/1/test_coedit_gec_pairs.json +0 -8
- wisent/examples/scripts/1/test_cola_evaluation.json +0 -30
- wisent/examples/scripts/1/test_cola_pairs.json +0 -8
- wisent/examples/scripts/1/test_coqcat_evaluation.json +0 -30
- wisent/examples/scripts/1/test_coqcat_pairs.json +0 -8
- wisent/examples/scripts/1/test_dbpedia_14_evaluation.json +0 -30
- wisent/examples/scripts/1/test_dbpedia_14_pairs.json +0 -8
- wisent/examples/scripts/1/test_epec_koref_bin_evaluation.json +0 -30
- wisent/examples/scripts/1/test_epec_koref_bin_pairs.json +0 -8
- wisent/examples/scripts/1/test_ethos_binary_evaluation.json +0 -30
- wisent/examples/scripts/1/test_ethos_binary_pairs.json +0 -8
- wisent/examples/scripts/2/test_afrimgsm_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/2/test_afrimgsm_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/2/test_afrimmlu_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/2/test_afrimmlu_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/2/test_afrixnli_en_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/2/test_afrixnli_en_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/2/test_arc_ar_evaluation.json +0 -30
- wisent/examples/scripts/2/test_arc_ar_pairs.json +0 -8
- wisent/examples/scripts/2/test_atis_evaluation.json +0 -30
- wisent/examples/scripts/2/test_atis_pairs.json +0 -8
- wisent/examples/scripts/2/test_babi_evaluation.json +0 -30
- wisent/examples/scripts/2/test_babi_pairs.json +0 -8
- wisent/examples/scripts/2/test_babilong_evaluation.json +0 -30
- wisent/examples/scripts/2/test_babilong_pairs.json +0 -8
- wisent/examples/scripts/2/test_bangla_mmlu_evaluation.json +0 -30
- wisent/examples/scripts/2/test_bangla_mmlu_pairs.json +0 -8
- wisent/examples/scripts/2/test_basque-glue_pairs.json +0 -14
- {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/WHEEL +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/entry_points.txt +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.7.701.dist-info → wisent-0.7.1045.dist-info}/top_level.txt +0 -0
|
@@ -6,7 +6,7 @@ from wisent.core.cli_logger import setup_logger
|
|
|
6
6
|
from wisent.core.contrastive_pairs.core.pair import ContrastivePair
|
|
7
7
|
from wisent.core.contrastive_pairs.huggingface_pairs.atoms import HuggingFaceBenchmarkExtractor
|
|
8
8
|
|
|
9
|
-
__all__ = ["BrowseCompExtractor"
|
|
9
|
+
__all__ = ["BrowseCompExtractor"]
|
|
10
10
|
|
|
11
11
|
log = setup_logger(__name__)
|
|
12
12
|
|
|
@@ -44,6 +44,8 @@ class BrowseCompExtractor(HuggingFaceBenchmarkExtractor):
|
|
|
44
44
|
"""
|
|
45
45
|
Build contrastive pairs from BrowseComp examples.
|
|
46
46
|
|
|
47
|
+
Uses Tevatron/browsecomp-plus dataset from HuggingFace.
|
|
48
|
+
|
|
47
49
|
Args:
|
|
48
50
|
limit: Optional maximum number of pairs to produce.
|
|
49
51
|
|
|
@@ -51,74 +53,47 @@ class BrowseCompExtractor(HuggingFaceBenchmarkExtractor):
|
|
|
51
53
|
A list of ContrastivePair objects.
|
|
52
54
|
"""
|
|
53
55
|
max_items = self._normalize_limit(limit)
|
|
56
|
+
pairs: list[ContrastivePair] = []
|
|
54
57
|
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
+
try:
|
|
59
|
+
docs = self.load_dataset(
|
|
60
|
+
dataset_name="Tevatron/browsecomp-plus",
|
|
61
|
+
split="test",
|
|
62
|
+
limit=max_items,
|
|
63
|
+
)
|
|
64
|
+
log.info(f"Loaded {len(docs)} examples from browsecomp-plus")
|
|
58
65
|
|
|
59
|
-
|
|
66
|
+
for doc in docs:
|
|
67
|
+
pair = self._extract_pair_from_doc(doc)
|
|
68
|
+
if pair is not None:
|
|
69
|
+
pairs.append(pair)
|
|
70
|
+
if max_items is not None and len(pairs) >= max_items:
|
|
71
|
+
break
|
|
60
72
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
pairs.append(pair)
|
|
65
|
-
if max_items is not None and len(pairs) >= max_items:
|
|
66
|
-
break
|
|
73
|
+
except Exception as e:
|
|
74
|
+
log.error(f"Failed to load browsecomp-plus: {e}")
|
|
75
|
+
return []
|
|
67
76
|
|
|
68
77
|
if not pairs:
|
|
69
78
|
log.warning("No valid BrowseComp pairs extracted")
|
|
70
79
|
|
|
71
80
|
return pairs
|
|
72
81
|
|
|
73
|
-
def _create_synthetic_examples(self, count: int) -> list[dict[str, Any]]:
|
|
74
|
-
"""Create synthetic BrowseComp examples."""
|
|
75
|
-
examples = [
|
|
76
|
-
{
|
|
77
|
-
"query": "Find the current market cap of Apple Inc.",
|
|
78
|
-
"correct_answer": "Apple Inc.'s market capitalization can be found on financial websites like Yahoo Finance or Google Finance. As of the most recent data, it is approximately $3 trillion, though this fluctuates with stock price.",
|
|
79
|
-
"incorrect_answer": "Apple's market cap is $500 billion. I found this on a random website that might not be up to date.",
|
|
80
|
-
"task_type": "information_retrieval",
|
|
81
|
-
},
|
|
82
|
-
{
|
|
83
|
-
"query": "What are the top 3 most visited websites in the world?",
|
|
84
|
-
"correct_answer": "According to recent web traffic data from sources like SimilarWeb or Alexa, the top 3 most visited websites are: 1) Google.com, 2) YouTube.com, 3) Facebook.com. These rankings can vary slightly depending on the source and time period.",
|
|
85
|
-
"incorrect_answer": "The most visited websites are MySpace, Yahoo, and AOL based on what I remember.",
|
|
86
|
-
"task_type": "fact_finding",
|
|
87
|
-
},
|
|
88
|
-
{
|
|
89
|
-
"query": "Find the official documentation for Python's asyncio library.",
|
|
90
|
-
"correct_answer": "The official Python asyncio documentation is available at docs.python.org/3/library/asyncio.html. It covers the asyncio module for writing concurrent code using the async/await syntax, including sections on coroutines, tasks, streams, and synchronization primitives.",
|
|
91
|
-
"incorrect_answer": "I think there's some asyncio documentation somewhere on the internet. You can probably find it by searching.",
|
|
92
|
-
"task_type": "documentation_search",
|
|
93
|
-
},
|
|
94
|
-
{
|
|
95
|
-
"query": "What is the current weather in Tokyo, Japan?",
|
|
96
|
-
"correct_answer": "To find current weather in Tokyo, I would check weather.com, accuweather.com, or the Japan Meteorological Agency website (jma.go.jp). These provide real-time weather data including temperature, humidity, and forecasts for Tokyo.",
|
|
97
|
-
"incorrect_answer": "Tokyo weather is always around 20°C year-round because Japan has a mild climate.",
|
|
98
|
-
"task_type": "real_time_information",
|
|
99
|
-
},
|
|
100
|
-
{
|
|
101
|
-
"query": "Find research papers about transformer architecture in machine learning.",
|
|
102
|
-
"correct_answer": "The foundational paper is 'Attention Is All You Need' by Vaswani et al. (2017), available on arXiv (arxiv.org/abs/1706.03762). For more recent research, Google Scholar, arXiv, and Semantic Scholar provide comprehensive collections of transformer-related papers including BERT, GPT, and their variants.",
|
|
103
|
-
"incorrect_answer": "There are some papers about transformers. They use attention which is like focusing on things.",
|
|
104
|
-
"task_type": "academic_search",
|
|
105
|
-
},
|
|
106
|
-
]
|
|
107
|
-
|
|
108
|
-
result = []
|
|
109
|
-
for i in range(count):
|
|
110
|
-
example = examples[i % len(examples)].copy()
|
|
111
|
-
result.append(example)
|
|
112
|
-
|
|
113
|
-
return result
|
|
114
|
-
|
|
115
82
|
def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
|
|
116
|
-
"""Convert a single doc into a ContrastivePair.
|
|
83
|
+
"""Convert a single doc into a ContrastivePair.
|
|
84
|
+
|
|
85
|
+
browsecomp-plus schema:
|
|
86
|
+
- query_id: str
|
|
87
|
+
- query: str (the question)
|
|
88
|
+
- answer: str (ground truth answer)
|
|
89
|
+
- evidence_docs: list (documents with evidence)
|
|
90
|
+
- gold_docs: list (gold standard documents)
|
|
91
|
+
- negative_docs: list (distractor documents)
|
|
92
|
+
"""
|
|
117
93
|
try:
|
|
118
94
|
query = doc.get("query", "").strip()
|
|
119
|
-
correct_answer = doc.get("
|
|
120
|
-
|
|
121
|
-
task_type = doc.get("task_type", "general")
|
|
95
|
+
correct_answer = doc.get("answer", "").strip()
|
|
96
|
+
query_id = doc.get("query_id", "")
|
|
122
97
|
|
|
123
98
|
if not query or not correct_answer:
|
|
124
99
|
return None
|
|
@@ -131,10 +106,13 @@ Please search the web and provide accurate, up-to-date information. Include:
|
|
|
131
106
|
- Relevant details and context
|
|
132
107
|
- Any caveats about data freshness"""
|
|
133
108
|
|
|
109
|
+
# Create incorrect answer (opposite or unrelated)
|
|
110
|
+
incorrect_answer = f"I could not find relevant information about this query."
|
|
111
|
+
|
|
134
112
|
metadata = {
|
|
135
113
|
"label": "browsecomp",
|
|
136
|
-
"source": "browsecomp",
|
|
137
|
-
"
|
|
114
|
+
"source": "Tevatron/browsecomp-plus",
|
|
115
|
+
"query_id": query_id,
|
|
138
116
|
"language": self.language,
|
|
139
117
|
"is_web_browsing_benchmark": True,
|
|
140
118
|
}
|
|
@@ -152,293 +130,5 @@ Please search the web and provide accurate, up-to-date information. Include:
|
|
|
152
130
|
|
|
153
131
|
|
|
154
132
|
|
|
155
|
-
class SealExtractor(HuggingFaceBenchmarkExtractor):
|
|
156
|
-
"""
|
|
157
|
-
Extractor for Seal-0 - agentic search evaluation benchmark.
|
|
158
|
-
|
|
159
|
-
Seal evaluates LLMs' ability to perform complex multi-step search tasks
|
|
160
|
-
that require planning, tool use, and information synthesis.
|
|
161
|
-
|
|
162
|
-
For agentic search evaluation:
|
|
163
|
-
- Positive (correct) = Successful multi-step search with correct synthesis
|
|
164
|
-
- Negative (incorrect) = Failed search or incorrect information gathering
|
|
165
|
-
"""
|
|
166
|
-
|
|
167
|
-
# Evaluator that should be used for this benchmark
|
|
168
|
-
evaluator_name = "agentic_search"
|
|
169
|
-
|
|
170
|
-
def extract_contrastive_pairs(
|
|
171
|
-
self,
|
|
172
|
-
limit: int | None = None,
|
|
173
|
-
) -> list[ContrastivePair]:
|
|
174
|
-
"""
|
|
175
|
-
Build contrastive pairs from Seal examples.
|
|
176
|
-
|
|
177
|
-
Args:
|
|
178
|
-
limit: Optional maximum number of pairs to produce.
|
|
179
|
-
|
|
180
|
-
Returns:
|
|
181
|
-
A list of ContrastivePair objects.
|
|
182
|
-
"""
|
|
183
|
-
max_items = self._normalize_limit(limit)
|
|
184
|
-
|
|
185
|
-
# Create synthetic examples based on agentic search patterns
|
|
186
|
-
docs = self._create_synthetic_examples(max_items or 100)
|
|
187
|
-
|
|
188
|
-
pairs: list[ContrastivePair] = []
|
|
189
|
-
|
|
190
|
-
for doc in docs:
|
|
191
|
-
pair = self._extract_pair_from_doc(doc)
|
|
192
|
-
if pair is not None:
|
|
193
|
-
pairs.append(pair)
|
|
194
|
-
if max_items is not None and len(pairs) >= max_items:
|
|
195
|
-
break
|
|
196
|
-
|
|
197
|
-
if not pairs:
|
|
198
|
-
log.warning("No valid Seal pairs extracted")
|
|
199
|
-
|
|
200
|
-
return pairs
|
|
201
|
-
|
|
202
|
-
def _create_synthetic_examples(self, count: int) -> list[dict[str, Any]]:
|
|
203
|
-
"""Create synthetic Seal-style examples."""
|
|
204
|
-
examples = [
|
|
205
|
-
{
|
|
206
|
-
"task": "Compare the revenue growth of the top 3 cloud providers over the past 5 years.",
|
|
207
|
-
"correct_response": """To compare cloud provider revenue growth, I need to:
|
|
208
|
-
1. Search for AWS, Azure, and Google Cloud revenue data
|
|
209
|
-
2. Find historical data from 2019-2024
|
|
210
|
-
3. Calculate growth rates and compare
|
|
211
|
-
|
|
212
|
-
Based on financial reports and industry analysis:
|
|
213
|
-
- AWS: Grew from ~$35B (2019) to ~$90B (2024), ~157% growth
|
|
214
|
-
- Azure: Grew from ~$17B (2019) to ~$65B (2024), ~282% growth
|
|
215
|
-
- Google Cloud: Grew from ~$8B (2019) to ~$36B (2024), ~350% growth
|
|
216
|
-
|
|
217
|
-
Key insight: While AWS remains the market leader in absolute revenue, Azure and Google Cloud have shown faster percentage growth rates.""",
|
|
218
|
-
"incorrect_response": "The cloud providers are all doing well. They make a lot of money from cloud services.",
|
|
219
|
-
"complexity": "multi_step",
|
|
220
|
-
},
|
|
221
|
-
{
|
|
222
|
-
"task": "Find and summarize the latest developments in quantum computing from the past month.",
|
|
223
|
-
"correct_response": """I'll search for recent quantum computing news from multiple sources:
|
|
224
|
-
|
|
225
|
-
Recent developments (November 2024):
|
|
226
|
-
1. IBM announced their 1000+ qubit processor milestone
|
|
227
|
-
2. Google published new research on quantum error correction
|
|
228
|
-
3. IonQ demonstrated improved gate fidelities in trapped-ion systems
|
|
229
|
-
4. Microsoft Azure Quantum expanded their cloud platform capabilities
|
|
230
|
-
|
|
231
|
-
Sources consulted: Nature, IEEE Spectrum, company press releases, arXiv preprints.
|
|
232
|
-
|
|
233
|
-
The trend shows continued progress in error correction and qubit count, moving closer to practical quantum advantage for specific applications.""",
|
|
234
|
-
"incorrect_response": "Quantum computers are getting faster. They use qubits which are like regular bits but quantum.",
|
|
235
|
-
"complexity": "multi_source",
|
|
236
|
-
},
|
|
237
|
-
]
|
|
238
|
-
|
|
239
|
-
result = []
|
|
240
|
-
for i in range(count):
|
|
241
|
-
example = examples[i % len(examples)].copy()
|
|
242
|
-
result.append(example)
|
|
243
|
-
|
|
244
|
-
return result
|
|
245
|
-
|
|
246
|
-
def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
|
|
247
|
-
"""Convert a single doc into a ContrastivePair."""
|
|
248
|
-
try:
|
|
249
|
-
task = doc.get("task", "").strip()
|
|
250
|
-
correct = doc.get("correct_response", "").strip()
|
|
251
|
-
incorrect = doc.get("incorrect_response", "").strip()
|
|
252
|
-
complexity = doc.get("complexity", "standard")
|
|
253
|
-
|
|
254
|
-
if not task or not correct:
|
|
255
|
-
return None
|
|
256
|
-
|
|
257
|
-
task_prompt = f"""Agentic Search Task: {task}
|
|
258
|
-
|
|
259
|
-
You have access to web search capabilities. Please:
|
|
260
|
-
1. Plan your search strategy
|
|
261
|
-
2. Execute the necessary searches
|
|
262
|
-
3. Synthesize the information into a coherent response
|
|
263
|
-
4. Cite your sources where applicable"""
|
|
264
|
-
|
|
265
|
-
metadata = {
|
|
266
|
-
"label": "seal_0",
|
|
267
|
-
"source": "seal_0",
|
|
268
|
-
"complexity": complexity,
|
|
269
|
-
"is_agentic_search_benchmark": True,
|
|
270
|
-
}
|
|
271
|
-
|
|
272
|
-
return self._build_pair(
|
|
273
|
-
question=task_prompt,
|
|
274
|
-
correct=correct,
|
|
275
|
-
incorrect=incorrect,
|
|
276
|
-
metadata=metadata,
|
|
277
|
-
)
|
|
278
|
-
|
|
279
|
-
except Exception as exc:
|
|
280
|
-
log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
|
|
281
|
-
return None
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
class FinSearchCompExtractor(HuggingFaceBenchmarkExtractor):
|
|
286
|
-
"""
|
|
287
|
-
Extractor for FinSearchComp - financial search agent benchmark.
|
|
288
|
-
|
|
289
|
-
FinSearchComp evaluates LLMs' ability to find and analyze financial
|
|
290
|
-
information, including stock data, financial reports, market analysis,
|
|
291
|
-
and regulatory filings.
|
|
292
|
-
|
|
293
|
-
For financial search evaluation:
|
|
294
|
-
- Positive (correct) = Accurate financial data with proper sourcing
|
|
295
|
-
- Negative (incorrect) = Inaccurate data or unsourced claims
|
|
296
|
-
"""
|
|
297
|
-
|
|
298
|
-
# Evaluator that should be used for this benchmark
|
|
299
|
-
evaluator_name = "financial_search"
|
|
300
|
-
|
|
301
|
-
def extract_contrastive_pairs(
|
|
302
|
-
self,
|
|
303
|
-
limit: int | None = None,
|
|
304
|
-
) -> list[ContrastivePair]:
|
|
305
|
-
"""
|
|
306
|
-
Build contrastive pairs from FinSearchComp examples.
|
|
307
|
-
|
|
308
|
-
Args:
|
|
309
|
-
limit: Optional maximum number of pairs to produce.
|
|
310
|
-
|
|
311
|
-
Returns:
|
|
312
|
-
A list of ContrastivePair objects.
|
|
313
|
-
"""
|
|
314
|
-
max_items = self._normalize_limit(limit)
|
|
315
|
-
|
|
316
|
-
# Create synthetic financial search examples
|
|
317
|
-
docs = self._create_synthetic_examples(max_items or 100)
|
|
318
|
-
|
|
319
|
-
pairs: list[ContrastivePair] = []
|
|
320
|
-
|
|
321
|
-
for doc in docs:
|
|
322
|
-
pair = self._extract_pair_from_doc(doc)
|
|
323
|
-
if pair is not None:
|
|
324
|
-
pairs.append(pair)
|
|
325
|
-
if max_items is not None and len(pairs) >= max_items:
|
|
326
|
-
break
|
|
327
|
-
|
|
328
|
-
if not pairs:
|
|
329
|
-
log.warning("No valid FinSearchComp pairs extracted")
|
|
330
|
-
|
|
331
|
-
return pairs
|
|
332
|
-
|
|
333
|
-
def _create_synthetic_examples(self, count: int) -> list[dict[str, Any]]:
|
|
334
|
-
"""Create synthetic FinSearchComp examples."""
|
|
335
|
-
examples = [
|
|
336
|
-
{
|
|
337
|
-
"query": "What is NVIDIA's P/E ratio and how does it compare to the semiconductor industry average?",
|
|
338
|
-
"correct_answer": """Based on financial data sources (Yahoo Finance, Bloomberg):
|
|
339
|
-
|
|
340
|
-
NVIDIA's current P/E ratio: Approximately 65-70x (trailing twelve months)
|
|
341
|
-
Semiconductor industry average P/E: Approximately 25-30x
|
|
342
|
-
|
|
343
|
-
Analysis: NVIDIA trades at a significant premium to the industry average, reflecting:
|
|
344
|
-
1. Strong growth expectations from AI/datacenter demand
|
|
345
|
-
2. Market leadership in GPU technology
|
|
346
|
-
3. High revenue growth rates (>100% YoY in recent quarters)
|
|
347
|
-
|
|
348
|
-
Note: P/E ratios fluctuate with stock price and should be verified with real-time data from financial terminals.""",
|
|
349
|
-
"incorrect_answer": "NVIDIA has a P/E ratio which is a number that shows something about the stock price.",
|
|
350
|
-
"category": "valuation",
|
|
351
|
-
},
|
|
352
|
-
{
|
|
353
|
-
"query": "Find the key metrics from Tesla's latest quarterly earnings report.",
|
|
354
|
-
"correct_answer": """Tesla Q3 2024 Earnings Highlights (Source: Tesla Investor Relations, SEC 10-Q):
|
|
355
|
-
|
|
356
|
-
Revenue: $25.18 billion (+8% YoY)
|
|
357
|
-
Automotive Revenue: $20.02 billion
|
|
358
|
-
Energy & Services Revenue: $5.16 billion
|
|
359
|
-
|
|
360
|
-
Profitability:
|
|
361
|
-
- Operating Margin: 10.8%
|
|
362
|
-
- Net Income: $2.17 billion
|
|
363
|
-
- EPS: $0.62
|
|
364
|
-
|
|
365
|
-
Vehicle Deliveries: 462,890 units
|
|
366
|
-
- Model 3/Y: 439,975
|
|
367
|
-
- Other models: 22,915
|
|
368
|
-
|
|
369
|
-
Key Highlights:
|
|
370
|
-
- Energy storage deployments reached record 6.9 GWh
|
|
371
|
-
- Cybertruck production ramping
|
|
372
|
-
- FSD revenue recognition increasing
|
|
373
|
-
|
|
374
|
-
Source: Tesla Q3 2024 Update, October 2024""",
|
|
375
|
-
"incorrect_answer": "Tesla made some money last quarter. They sell cars and batteries.",
|
|
376
|
-
"category": "earnings",
|
|
377
|
-
},
|
|
378
|
-
{
|
|
379
|
-
"query": "What are the current interest rate expectations for the Federal Reserve?",
|
|
380
|
-
"correct_answer": """Federal Reserve Interest Rate Outlook (Sources: CME FedWatch, Bloomberg):
|
|
381
|
-
|
|
382
|
-
Current Federal Funds Rate: 4.50-4.75%
|
|
383
|
-
|
|
384
|
-
Market Expectations (as of late 2024):
|
|
385
|
-
- December 2024: 75% probability of 25bp cut
|
|
386
|
-
- January 2025: 60% probability of another 25bp cut
|
|
387
|
-
- End of 2025: Terminal rate expected around 3.25-3.50%
|
|
388
|
-
|
|
389
|
-
Key Factors Driving Expectations:
|
|
390
|
-
1. Inflation trending toward 2% target
|
|
391
|
-
2. Labor market showing signs of cooling
|
|
392
|
-
3. Fed's stated data-dependent approach
|
|
393
|
-
4. Recent Fedspeak suggesting gradual easing path
|
|
394
|
-
|
|
395
|
-
Note: Interest rate expectations change with economic data releases. Verify with real-time Fed Funds futures.""",
|
|
396
|
-
"incorrect_answer": "The Fed sets interest rates. They might change them sometime.",
|
|
397
|
-
"category": "macro",
|
|
398
|
-
},
|
|
399
|
-
]
|
|
400
|
-
|
|
401
|
-
result = []
|
|
402
|
-
for i in range(count):
|
|
403
|
-
example = examples[i % len(examples)].copy()
|
|
404
|
-
result.append(example)
|
|
405
|
-
|
|
406
|
-
return result
|
|
407
|
-
|
|
408
|
-
def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
|
|
409
|
-
"""Convert a single doc into a ContrastivePair."""
|
|
410
|
-
try:
|
|
411
|
-
query = doc.get("query", "").strip()
|
|
412
|
-
correct = doc.get("correct_answer", "").strip()
|
|
413
|
-
incorrect = doc.get("incorrect_answer", "").strip()
|
|
414
|
-
category = doc.get("category", "general")
|
|
415
|
-
|
|
416
|
-
if not query or not correct:
|
|
417
|
-
return None
|
|
418
|
-
|
|
419
|
-
task_prompt = f"""Financial Search Task: {query}
|
|
420
133
|
|
|
421
|
-
Please search for financial data and provide:
|
|
422
|
-
- Specific numbers and metrics where applicable
|
|
423
|
-
- Sources for your data
|
|
424
|
-
- Context and analysis
|
|
425
|
-
- Any caveats about data freshness"""
|
|
426
|
-
|
|
427
|
-
metadata = {
|
|
428
|
-
"label": "finsearchcomp",
|
|
429
|
-
"source": "finsearchcomp",
|
|
430
|
-
"category": category,
|
|
431
|
-
"is_financial_search_benchmark": True,
|
|
432
|
-
}
|
|
433
|
-
|
|
434
|
-
return self._build_pair(
|
|
435
|
-
question=task_prompt,
|
|
436
|
-
correct=correct,
|
|
437
|
-
incorrect=incorrect,
|
|
438
|
-
metadata=metadata,
|
|
439
|
-
)
|
|
440
|
-
|
|
441
|
-
except Exception as exc:
|
|
442
|
-
log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
|
|
443
|
-
return None
|
|
444
134
|
|