wisent 0.7.379__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wisent/__init__.py +64 -0
- wisent/cli.py +114 -0
- wisent/core/__init__.py +40 -0
- wisent/core/activations/__init__.py +26 -0
- wisent/core/activations/activations.py +97 -0
- wisent/core/activations/activations_collector.py +506 -0
- wisent/core/activations/core/__init__.py +0 -0
- wisent/core/activations/core/atoms.py +219 -0
- wisent/core/activations/prompt_construction_strategy.py +47 -0
- wisent/core/adapters/__init__.py +22 -0
- wisent/core/adapters/audio.py +616 -0
- wisent/core/adapters/base.py +420 -0
- wisent/core/adapters/multimodal.py +738 -0
- wisent/core/adapters/robotics.py +643 -0
- wisent/core/adapters/text.py +441 -0
- wisent/core/adapters/video.py +555 -0
- wisent/core/agent/__init__.py +1 -0
- wisent/core/agent/budget.py +644 -0
- wisent/core/agent/device_benchmarks.py +691 -0
- wisent/core/agent/diagnose/__init__.py +1 -0
- wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
- wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
- wisent/core/agent/diagnose/create_classifier.py +1155 -0
- wisent/core/agent/diagnose/response_diagnostics.py +273 -0
- wisent/core/agent/diagnose/select_classifiers.py +507 -0
- wisent/core/agent/diagnose/synthetic_classifier_option.py +755 -0
- wisent/core/agent/diagnose/tasks/__init__.py +33 -0
- wisent/core/agent/diagnose/tasks/task_manager.py +1453 -0
- wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
- wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
- wisent/core/agent/diagnose.py +249 -0
- wisent/core/agent/steer.py +215 -0
- wisent/core/agent/timeout.py +134 -0
- wisent/core/autonomous_agent.py +1158 -0
- wisent/core/benchmark_extractors.py +372 -0
- wisent/core/benchmark_registry.py +151 -0
- wisent/core/bigcode_extractors.py +26 -0
- wisent/core/bigcode_integration.py +886 -0
- wisent/core/branding.py +108 -0
- wisent/core/classifier/__init__.py +1 -0
- wisent/core/classifier/models/__init__.py +1 -0
- wisent/core/classifiers/__init__.py +1 -0
- wisent/core/classifiers/classifiers/__init__.py +0 -0
- wisent/core/classifiers/classifiers/core/__init__.py +0 -0
- wisent/core/classifiers/classifiers/core/atoms.py +748 -0
- wisent/core/classifiers/classifiers/models/__init__.py +0 -0
- wisent/core/classifiers/classifiers/models/logistic.py +29 -0
- wisent/core/classifiers/classifiers/models/mlp.py +47 -0
- wisent/core/classifiers/classifiers/rotator.py +137 -0
- wisent/core/classifiers/core/__init__.py +1 -0
- wisent/core/classifiers/models/__init__.py +1 -0
- wisent/core/classifiers/pipeline_steps/__init__.py +1 -0
- wisent/core/cli/__init__.py +26 -0
- wisent/core/cli/agent/__init__.py +15 -0
- wisent/core/cli/agent/apply_steering.py +192 -0
- wisent/core/cli/agent/evaluate_response.py +128 -0
- wisent/core/cli/agent/generate_synthetic_pairs.py +123 -0
- wisent/core/cli/agent/main.py +139 -0
- wisent/core/cli/agent/train_classifier.py +173 -0
- wisent/core/cli/check_linearity.py +126 -0
- wisent/core/cli/create_steering_vector.py +304 -0
- wisent/core/cli/diagnose_pairs.py +153 -0
- wisent/core/cli/diagnose_vectors.py +404 -0
- wisent/core/cli/estimate_unified_goodness_time.py +428 -0
- wisent/core/cli/evaluate_refusal.py +241 -0
- wisent/core/cli/evaluate_responses.py +926 -0
- wisent/core/cli/generate_humanization_pairs.py +128 -0
- wisent/core/cli/generate_pairs.py +175 -0
- wisent/core/cli/generate_pairs_from_task.py +108 -0
- wisent/core/cli/generate_responses.py +160 -0
- wisent/core/cli/generate_vector_from_synthetic.py +217 -0
- wisent/core/cli/generate_vector_from_task.py +248 -0
- wisent/core/cli/get_activations.py +192 -0
- wisent/core/cli/inference_config.py +84 -0
- wisent/core/cli/inference_config_cli.py +54 -0
- wisent/core/cli/modify_weights.py +660 -0
- wisent/core/cli/multi_steer.py +112 -0
- wisent/core/cli/optimization_cache.py +298 -0
- wisent/core/cli/optimize.py +621 -0
- wisent/core/cli/optimize_classification.py +473 -0
- wisent/core/cli/optimize_sample_size.py +390 -0
- wisent/core/cli/optimize_steering.py +3421 -0
- wisent/core/cli/optimize_weights.py +1287 -0
- wisent/core/cli/steering_method_trainer.py +641 -0
- wisent/core/cli/steering_search_space.py +508 -0
- wisent/core/cli/tasks.py +940 -0
- wisent/core/cli/train_unified_goodness.py +681 -0
- wisent/core/cli_logger.py +22 -0
- wisent/core/config_manager.py +1731 -0
- wisent/core/contrastive_pairs/__init__.py +15 -0
- wisent/core/contrastive_pairs/core/__init__.py +0 -0
- wisent/core/contrastive_pairs/core/atoms.py +45 -0
- wisent/core/contrastive_pairs/core/buliders.py +59 -0
- wisent/core/contrastive_pairs/core/pair.py +183 -0
- wisent/core/contrastive_pairs/core/response.py +153 -0
- wisent/core/contrastive_pairs/core/serialization.py +306 -0
- wisent/core/contrastive_pairs/core/set.py +192 -0
- wisent/core/contrastive_pairs/diagnostics/__init__.py +79 -0
- wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
- wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1655 -0
- wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
- wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
- wisent/core/contrastive_pairs/diagnostics/duplicates.py +118 -0
- wisent/core/contrastive_pairs/diagnostics/linearity.py +325 -0
- wisent/core/contrastive_pairs/diagnostics/vector_quality.py +620 -0
- wisent/core/contrastive_pairs/huggingface_pairs/__init__.py +1 -0
- wisent/core/contrastive_pairs/huggingface_pairs/atoms.py +255 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +470 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_registry.py +136 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +44 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentbench.py +225 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentharm.py +267 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +444 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +225 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime.py +118 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime2024.py +74 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime2025.py +73 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/alpaca_eval.py +153 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +182 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/arena_hard.py +179 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/atis.py +89 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/babilong.py +96 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bangla_mmlu.py +108 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/basqueglue.py +217 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bec2016eu.py +99 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bfcl.py +283 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bhtc_v2.py +87 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +245 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/chain_of_thought.py +89 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/chinese_simpleqa.py +209 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/cluewsc.py +177 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/cnn_dailymail.py +92 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +378 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +109 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +15 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +64 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +65 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +65 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +65 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +65 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +65 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +844 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coedit_gec.py +79 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/conala.py +133 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/concode.py +111 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/dbpedia_14.py +91 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/doc_vqa.py +102 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/donotanswer.py +236 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ds1000.py +129 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ds_1000.py +155 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/epec_koref_bin.py +85 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ethos_binary.py +82 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/evalita_mp.py +165 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/evalita_sp_sum_task_fp_small_p1.py +89 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/facts_grounding.py +181 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +295 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/financial_tweets.py +100 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +270 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flan_held_in.py +98 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +572 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +143 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +99 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/get_negative_example_livecodebench.py +146 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/get_positive_example_livecodebench.py +140 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/gpt3_translation_benchmarks.py +98 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +389 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/halueval.py +246 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/harmbench.py +250 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/healthbench.py +181 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hle.py +106 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hmmt.py +117 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +119 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humanevalpack.py +102 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +180 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +129 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/iwslt2017_ar_en.py +98 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/iwslt2017_en_ar.py +98 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/jailbreakbench.py +258 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/law_stack_exchange.py +101 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ledgar.py +118 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench.py +61 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench_contrastive_pair_generator.py +491 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench_v6.py +263 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +230 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/llama.py +96 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +285 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/m_mmlu.py +96 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math.py +186 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +146 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +142 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/meddialog.py +79 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medical_abstracts.py +101 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +787 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +111 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mmlu_redux.py +194 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mmlusr.py +108 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multimedqa.py +99 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multipl_e.py +109 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple.py +96 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_choice.py +87 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_cpp.py +128 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_go.py +128 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_java.py +128 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_js.py +128 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_py.py +15 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_rs.py +128 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/non_greedy_robustness_agieval_aqua_rat.py +92 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +287 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/openllm.py +99 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/option_order_robustness_agieval_aqua_rat.py +92 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/or_bench.py +300 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/penn_treebank.py +80 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +317 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +467 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/prompt_robustness_agieval_aqua_rat.py +92 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/pythia.py +99 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +131 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +280 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/scicode.py +275 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/self_consistency.py +90 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +145 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/sorry_bench.py +211 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/stsb.py +79 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_lm_eval_v1.py +99 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_lm_eval_v1_seq2seq.py +98 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_t5_prompt.py +123 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_gpqa.py +106 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/swe_bench.py +428 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/swe_bench_verified.py +158 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/sycophancy_eval.py +205 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/t0_eval.py +79 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tag.py +98 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +305 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tmlu.py +109 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +360 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +386 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/travelplanner.py +286 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/truthfulqa_generation.py +128 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/unfair_tos.py +83 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/vaxx_stance.py +86 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wiceu.py +85 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wikitext103.py +97 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wildguard.py +280 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt14_en_fr.py +97 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt14_fr_en.py +97 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_de_en.py +90 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_en_de.py +90 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_en_ro.py +90 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_ro_en.py +90 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt_ro_en_t5_prompt.py +90 -0
- wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/xsum.py +81 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +265 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/__init__.py +472 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aclue.py +24 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/acp.py +33 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/acpbench.py +39 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/advanced_ai_risk.py +59 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aexams.py +14 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrimgsm.py +10 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrimmlu.py +10 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrixnli.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench.py +14 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_adr.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_afriqa.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_afrisenti.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_belebele.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_flores.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_injongointent.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_mafand.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhaner.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhanews.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhapos.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_naijarc.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_nollysenti.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_ntrex.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_openai_mmlu.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_salt.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_sib.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_uhura_arc_easy.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_xlsum.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/agieval.py +33 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/anli.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arab_culture.py +24 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_acva.py +67 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_acva_light.py +67 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_complete.py +24 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_light.py +81 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabicmmlu.py +59 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aradice.py +36 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arc.py +61 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arithmetic.py +19 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/basque_bench.py +37 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bbh.py +121 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bbq.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/belebele.py +293 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bertaqa.py +25 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bigbench.py +300 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/blimp.py +76 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/careqa.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/catalan_bench.py +43 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ceval_valid.py +61 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/cmmlu.py +76 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +16 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/copal_id.py +11 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/crows_pairs.py +31 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/csatqa.py +15 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/darija.py +29 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/darijammlu.py +57 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/egymmlu.py +62 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/eus.py +76 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/evalita_mp.py +93 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/fld.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/flores.py +466 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/french_bench.py +23 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/galician_bench.py +41 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/glianorex.py +11 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/global_mmlu.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gpqa.py +27 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gsm8k.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gsm8k_platinum.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/haerae.py +14 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/headqa.py +11 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hellaswag.py +39 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hendrycks_ethics.py +14 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hendrycks_math.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hrm8k.py +20 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/inverse.py +22 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/japanese_leaderboard.py +20 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/jsonschema_bench.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kbl.py +85 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kmmlu.py +281 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kobest.py +14 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kormedmcqa.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/lambada.py +28 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/leaderboard.py +52 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/libra.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/lingoly.py +11 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/longbench.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/m.py +43 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mastermind.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mathqa.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/med.py +24 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/meddialog.py +12 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/medqa.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mela.py +18 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/metabench.py +36 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mgsm.py +44 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/minerva_math.py +16 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mlqa.py +58 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu.py +70 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_pro.py +23 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_pro_plus.py +23 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_prox.py +191 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlusr.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmmu.py +46 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/model_written_evals.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/multiblimp.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/non.py +23 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/noreval.py +143 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/noridiom.py +20 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/nortruthfulqa.py +32 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/nrk.py +20 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_arc_multilingual.py +10 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_hellaswag_multilingual.py +24 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_mmlu_multilingual.py +24 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_truthfulqa_multilingual.py +34 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/paloma.py +25 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/pawsx.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/persona.py +144 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/pile.py +31 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/polemo2.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/portuguese_bench.py +31 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/prompt.py +23 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/qa4mre.py +12 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/qasper.py +11 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ru.py +19 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ruler.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/score.py +20 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/scrolls.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/self_consistency.py +11 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/spanish_bench.py +38 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/storycloze.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/super_glue_t5_prompt.py +17 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tinyBenchmarks.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tmlu.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tmmluplus.py +80 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/translation.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/truthfulqa.py +76 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/truthfulqa_multi.py +24 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/turkishmmlu.py +30 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/unitxt.py +23 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/unscramble.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/winogender.py +16 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmdp.py +12 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmt14.py +16 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmt16.py +22 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wsc273.py +9 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xcopa.py +21 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xnli.py +28 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xnli_eu.py +12 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xquad.py +22 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xstorycloze.py +22 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xwinograd.py +15 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +478 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +140 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +125 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +171 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +207 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +185 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +130 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +184 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimgsm.py +98 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +129 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrobench_cot.py +88 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrobench_mc.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ag.py +134 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ai2_arc.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anagrams1.py +81 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anagrams2.py +81 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anli.py +140 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +180 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +98 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +104 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +168 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +168 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +167 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +268 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +133 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_gen.py +101 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_mc.py +106 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/argument.py +134 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +122 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/assin.py +103 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench_gen.py +168 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench_mc.py +139 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbh.py +133 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +169 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +181 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +165 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +143 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bigbench.py +170 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +171 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +117 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq_seq2seq.py +117 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +150 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabreu.py +127 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +169 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench_gen.py +119 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench_mc.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +171 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +139 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +117 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +223 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +163 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +238 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +151 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +166 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +144 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +148 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +161 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +149 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cola.py +83 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +127 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +124 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +169 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +162 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqcat.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/crows_pairs.py +158 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle_letters.py +81 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +221 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +174 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +157 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +129 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/egyhellaswag.py +125 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/egymmlu.py +180 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +142 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +194 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/escola.py +85 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +135 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethos.py +99 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +225 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +159 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +159 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +159 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +166 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_sp.py +109 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/fda.py +105 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/fld.py +143 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +202 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench_mc.py +98 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench_perplexity.py +86 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galcola.py +109 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench_gen.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench_mc.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +141 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +171 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glue.py +109 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpqa.py +161 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +184 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm.py +108 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +134 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +125 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +225 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +191 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +179 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hle.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +203 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval.py +124 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ifeval.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +192 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/iwslt2017.py +117 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_gen.py +224 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +120 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/jsonschema_bench.py +123 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kbl.py +140 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +168 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu_cot.py +88 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu_mc.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +165 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +160 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada.py +147 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +185 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +185 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual_stablelm.py +141 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +194 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/libra.py +165 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +203 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logieval.py +82 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +203 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mathqa.py +137 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +123 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +224 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +180 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mediqa_qa2019.py +123 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +169 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medtext.py +108 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +96 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meqsum.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +154 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mgsm.py +122 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mimic_repsum.py +140 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +172 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mlqa.py +143 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +144 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_cot.py +88 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_mc.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_pro.py +145 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +189 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmmu.py +150 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mnli.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/model_written_evals.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/moral_stories.py +151 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mts_dialog.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mts_dialog_perplexity.py +97 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +134 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multilingual.py +106 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +173 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +157 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen.py +277 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +165 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +228 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +223 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noticia.py +105 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +135 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi.py +27 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +167 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +174 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +162 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +209 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +186 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph_perplexity.py +97 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paloma.py +205 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +154 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +246 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +144 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases_ca_va.py +82 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +161 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile_10k.py +140 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +116 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polemo2.py +135 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench_gen.py +121 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench_mc.py +103 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +119 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/quac.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +124 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/realtoxicityprompts.py +124 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +125 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +170 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +177 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +161 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +157 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +131 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +119 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/simple_cooccurrence_bias.py +121 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +209 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench_gen.py +117 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench_mc.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad2.py +129 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad_completion.py +121 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +250 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +107 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +154 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/superglue.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/supergpqa.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +179 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +117 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +113 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +181 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/toxigen.py +91 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/translation.py +149 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +130 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +112 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +120 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +140 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_multi.py +142 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +152 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +161 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_cot.py +104 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +102 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/twenty_newsgroups.py +111 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unitxt.py +131 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +95 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +130 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +122 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wikitext.py +146 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogender.py +139 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmt14.py +110 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmt16.py +118 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +114 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +117 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +180 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +197 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +147 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +131 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +203 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +129 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +124 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/yahoo.py +108 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +155 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +56 -0
- wisent/core/data_loaders/__init__.py +235 -0
- wisent/core/data_loaders/core/__init__.py +0 -0
- wisent/core/data_loaders/core/atoms.py +99 -0
- wisent/core/data_loaders/loaders/__init__.py +0 -0
- wisent/core/data_loaders/loaders/custom.py +120 -0
- wisent/core/data_loaders/loaders/huggingface_loader.py +153 -0
- wisent/core/data_loaders/loaders/lm_loader.py +494 -0
- wisent/core/data_loaders/loaders/lm_loader_special_cases.py +496 -0
- wisent/core/data_loaders/loaders/task_interface_loader.py +300 -0
- wisent/core/data_loaders/rotator.py +118 -0
- wisent/core/detection_handling.py +259 -0
- wisent/core/diversity_processors.py +193 -0
- wisent/core/download_full_benchmarks.py +1512 -0
- wisent/core/errors/__init__.py +203 -0
- wisent/core/errors/error_codes.py +763 -0
- wisent/core/errors/error_handler.py +134 -0
- wisent/core/evaluators/__init__.py +0 -0
- wisent/core/evaluators/benchmark_specific/__init__.py +42 -0
- wisent/core/evaluators/benchmark_specific/aime_evaluator.py +90 -0
- wisent/core/evaluators/benchmark_specific/coding/__init__.py +0 -0
- wisent/core/evaluators/benchmark_specific/coding/metrics/__init__.py +0 -0
- wisent/core/evaluators/benchmark_specific/coding/metrics/core/__init__.py +0 -0
- wisent/core/evaluators/benchmark_specific/coding/metrics/core/atoms.py +36 -0
- wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +363 -0
- wisent/core/evaluators/benchmark_specific/coding/metrics/passk.py +67 -0
- wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/__init__.py +0 -0
- wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/core/__init__.py +0 -0
- wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/core/atoms.py +27 -0
- wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/cpp_sanitizer.py +62 -0
- wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/java_sanitizer.py +78 -0
- wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/python_sanitizer.py +94 -0
- wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/utils.py +126 -0
- wisent/core/evaluators/benchmark_specific/coding/providers/__init__.py +18 -0
- wisent/core/evaluators/benchmark_specific/coding/providers/core/__init__.py +0 -0
- wisent/core/evaluators/benchmark_specific/coding/providers/core/atoms.py +31 -0
- wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/__init__.py +3 -0
- wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py +305 -0
- wisent/core/evaluators/benchmark_specific/coding/safe_docker/Dockerfile +31 -0
- wisent/core/evaluators/benchmark_specific/coding/safe_docker/__init__.py +0 -0
- wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/__init__.py +0 -0
- wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/atoms.py +105 -0
- wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/runtime.py +143 -0
- wisent/core/evaluators/benchmark_specific/coding/safe_docker/entrypoint.py +121 -0
- wisent/core/evaluators/benchmark_specific/coding/safe_docker/recipes.py +60 -0
- wisent/core/evaluators/benchmark_specific/coding/solution_generator.py +258 -0
- wisent/core/evaluators/benchmark_specific/conala_evaluator.py +332 -0
- wisent/core/evaluators/benchmark_specific/exact_match_evaluator.py +81 -0
- wisent/core/evaluators/benchmark_specific/f1_evaluator.py +173 -0
- wisent/core/evaluators/benchmark_specific/generation_evaluator.py +488 -0
- wisent/core/evaluators/benchmark_specific/livemathbench_evaluator.py +393 -0
- wisent/core/evaluators/benchmark_specific/log_likelihoods_evaluator.py +202 -0
- wisent/core/evaluators/benchmark_specific/math_evaluator.py +119 -0
- wisent/core/evaluators/benchmark_specific/math_parsing/__init__.py +1 -0
- wisent/core/evaluators/benchmark_specific/math_parsing/core.py +1640 -0
- wisent/core/evaluators/benchmark_specific/math_parsing/extract_boxed.py +48 -0
- wisent/core/evaluators/benchmark_specific/math_parsing/is_equiv.py +159 -0
- wisent/core/evaluators/benchmark_specific/math_parsing/scripts.py +919 -0
- wisent/core/evaluators/benchmark_specific/perplexity_evaluator.py +175 -0
- wisent/core/evaluators/benchmark_specific/polymath_evaluator.py +114 -0
- wisent/core/evaluators/core/__init__.py +5 -0
- wisent/core/evaluators/core/atoms.py +166 -0
- wisent/core/evaluators/custom/__init__.py +20 -0
- wisent/core/evaluators/custom/custom_evaluator.py +382 -0
- wisent/core/evaluators/custom/examples/__init__.py +37 -0
- wisent/core/evaluators/custom/examples/desklib_detector.py +166 -0
- wisent/core/evaluators/custom/examples/gptzero.py +185 -0
- wisent/core/evaluators/custom/examples/humanization.py +79 -0
- wisent/core/evaluators/custom/examples/humanization_coherent.py +127 -0
- wisent/core/evaluators/custom/examples/roberta_detector.py +173 -0
- wisent/core/evaluators/oracles/__init__.py +0 -0
- wisent/core/evaluators/oracles/interactive.py +73 -0
- wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
- wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +168 -0
- wisent/core/evaluators/oracles/user_specified.py +67 -0
- wisent/core/evaluators/personalization/__init__.py +12 -0
- wisent/core/evaluators/personalization/alignment.py +166 -0
- wisent/core/evaluators/personalization/coherence.py +325 -0
- wisent/core/evaluators/personalization/difference.py +73 -0
- wisent/core/evaluators/rotator.py +217 -0
- wisent/core/evaluators/steering_evaluators.py +386 -0
- wisent/core/evaluators/synthetic_evaluator.py +377 -0
- wisent/core/hyperparameter_optimizer.py +547 -0
- wisent/core/layer.py +17 -0
- wisent/core/lm_eval_harness_ground_truth.py +1431 -0
- wisent/core/main.py +101 -0
- wisent/core/managed_cached_benchmarks.py +609 -0
- wisent/core/mixed_benchmark_sampler.py +366 -0
- wisent/core/modalities/__init__.py +545 -0
- wisent/core/model_persistence.py +302 -0
- wisent/core/models/__init__.py +23 -0
- wisent/core/models/core/__init__.py +0 -0
- wisent/core/models/core/atoms.py +465 -0
- wisent/core/models/inference_config.py +127 -0
- wisent/core/models/wisent_model.py +893 -0
- wisent/core/multi_steering.py +397 -0
- wisent/core/opti/__init__.py +0 -0
- wisent/core/opti/core/__init__.py +0 -0
- wisent/core/opti/core/atoms.py +177 -0
- wisent/core/opti/methods/__init__.py +10 -0
- wisent/core/opti/methods/opti_classificator.py +172 -0
- wisent/core/opti/methods/opti_steering.py +139 -0
- wisent/core/opti/methods/opti_weights.py +523 -0
- wisent/core/optuna/__init__.py +54 -0
- wisent/core/optuna/classifier/__init__.py +25 -0
- wisent/core/optuna/classifier/activation_generator.py +351 -0
- wisent/core/optuna/classifier/classifier_cache.py +509 -0
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +685 -0
- wisent/core/optuna/steering/__init__.py +20 -0
- wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +200 -0
- wisent/core/optuna/steering/data_utils.py +342 -0
- wisent/core/optuna/steering/metrics.py +412 -0
- wisent/core/optuna/steering/steering_optimization.py +1096 -0
- wisent/core/parser.py +1662 -0
- wisent/core/parser_arguments/__init__.py +10 -0
- wisent/core/parser_arguments/agent_parser.py +122 -0
- wisent/core/parser_arguments/check_linearity_parser.py +82 -0
- wisent/core/parser_arguments/configure_model_parser.py +7 -0
- wisent/core/parser_arguments/create_steering_vector_parser.py +67 -0
- wisent/core/parser_arguments/diagnose_pairs_parser.py +25 -0
- wisent/core/parser_arguments/diagnose_vectors_parser.py +72 -0
- wisent/core/parser_arguments/evaluate_parser.py +40 -0
- wisent/core/parser_arguments/evaluate_refusal_parser.py +32 -0
- wisent/core/parser_arguments/evaluate_responses_parser.py +12 -0
- wisent/core/parser_arguments/full_optimize_parser.py +194 -0
- wisent/core/parser_arguments/generate_pairs_from_task_parser.py +33 -0
- wisent/core/parser_arguments/generate_pairs_parser.py +43 -0
- wisent/core/parser_arguments/generate_responses_parser.py +16 -0
- wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +148 -0
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +149 -0
- wisent/core/parser_arguments/generate_vector_parser.py +89 -0
- wisent/core/parser_arguments/get_activations_parser.py +90 -0
- wisent/core/parser_arguments/inference_config_parser.py +65 -0
- wisent/core/parser_arguments/main_parser.py +220 -0
- wisent/core/parser_arguments/model_config_parser.py +59 -0
- wisent/core/parser_arguments/modify_weights_parser.py +309 -0
- wisent/core/parser_arguments/monitor_parser.py +17 -0
- wisent/core/parser_arguments/multi_steer_parser.py +48 -0
- wisent/core/parser_arguments/nonsense_parser.py +26 -0
- wisent/core/parser_arguments/optimization_cache_parser.py +64 -0
- wisent/core/parser_arguments/optimize_classification_parser.py +108 -0
- wisent/core/parser_arguments/optimize_parser.py +142 -0
- wisent/core/parser_arguments/optimize_sample_size_parser.py +58 -0
- wisent/core/parser_arguments/optimize_steering_parser.py +617 -0
- wisent/core/parser_arguments/optimize_weights_parser.py +403 -0
- wisent/core/parser_arguments/synthetic_parser.py +117 -0
- wisent/core/parser_arguments/tasks_parser.py +591 -0
- wisent/core/parser_arguments/train_unified_goodness_parser.py +172 -0
- wisent/core/parser_arguments/utils.py +107 -0
- wisent/core/prompts/__init__.py +0 -0
- wisent/core/prompts/core/__init__.py +0 -0
- wisent/core/prompts/core/atom.py +57 -0
- wisent/core/prompts/core/prompt_formater.py +148 -0
- wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
- wisent/core/prompts/prompt_stratiegies/direct_completion.py +26 -0
- wisent/core/prompts/prompt_stratiegies/instruction_following.py +26 -0
- wisent/core/prompts/prompt_stratiegies/multiple_choice.py +31 -0
- wisent/core/prompts/prompt_stratiegies/role_playing.py +33 -0
- wisent/core/representation.py +5 -0
- wisent/core/save_results.py +277 -0
- wisent/core/steering.py +660 -0
- wisent/core/steering_method.py +20 -0
- wisent/core/steering_methods/__init__.py +54 -0
- wisent/core/steering_methods/core/__init__.py +0 -0
- wisent/core/steering_methods/core/atoms.py +154 -0
- wisent/core/steering_methods/methods/__init__.py +0 -0
- wisent/core/steering_methods/methods/caa.py +45 -0
- wisent/core/steering_methods/methods/prism.py +588 -0
- wisent/core/steering_methods/methods/pulse.py +641 -0
- wisent/core/steering_methods/methods/titan.py +1005 -0
- wisent/core/steering_methods/preflight.py +322 -0
- wisent/core/steering_methods/registry.py +649 -0
- wisent/core/steering_methods/rotator.py +121 -0
- wisent/core/steering_optimizer.py +1503 -0
- wisent/core/synthetic/__init__.py +0 -0
- wisent/core/synthetic/cleaners/__init__.py +0 -0
- wisent/core/synthetic/cleaners/core/__init__.py +0 -0
- wisent/core/synthetic/cleaners/core/atoms.py +58 -0
- wisent/core/synthetic/cleaners/deduper_cleaner.py +53 -0
- wisent/core/synthetic/cleaners/methods/__init__.py +0 -0
- wisent/core/synthetic/cleaners/methods/base_dedupers.py +321 -0
- wisent/core/synthetic/cleaners/methods/base_refusalers.py +286 -0
- wisent/core/synthetic/cleaners/methods/core/__init__.py +0 -0
- wisent/core/synthetic/cleaners/methods/core/atoms.py +47 -0
- wisent/core/synthetic/cleaners/pairs_cleaner.py +90 -0
- wisent/core/synthetic/cleaners/refusaler_cleaner.py +133 -0
- wisent/core/synthetic/db_instructions/__init__.py +0 -0
- wisent/core/synthetic/db_instructions/core/__init__.py +0 -0
- wisent/core/synthetic/db_instructions/core/atoms.py +25 -0
- wisent/core/synthetic/db_instructions/mini_dp.py +115 -0
- wisent/core/synthetic/generators/__init__.py +0 -0
- wisent/core/synthetic/generators/core/__init__.py +0 -0
- wisent/core/synthetic/generators/core/atoms.py +73 -0
- wisent/core/synthetic/generators/diversities/__init__.py +0 -0
- wisent/core/synthetic/generators/diversities/core/__init__.py +0 -0
- wisent/core/synthetic/generators/diversities/core/core.py +68 -0
- wisent/core/synthetic/generators/diversities/methods/__init__.py +0 -0
- wisent/core/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
- wisent/core/synthetic/generators/nonsense_generator.py +150 -0
- wisent/core/synthetic/generators/pairs_generator.py +313 -0
- wisent/core/task_interface.py +143 -0
- wisent/core/task_selector.py +232 -0
- wisent/core/tasks/__init__.py +218 -0
- wisent/core/tasks/aime_task.py +142 -0
- wisent/core/tasks/file_task.py +212 -0
- wisent/core/tasks/hle_task.py +180 -0
- wisent/core/tasks/hmmt_task.py +120 -0
- wisent/core/tasks/livecodebench_task.py +94 -0
- wisent/core/tasks/livemathbench_task.py +159 -0
- wisent/core/tasks/lm_eval_task.py +611 -0
- wisent/core/tasks/math500_task.py +84 -0
- wisent/core/tasks/polymath_task.py +147 -0
- wisent/core/tasks/supergpqa_task.py +220 -0
- wisent/core/time_estimator.py +155 -0
- wisent/core/timing_calibration.py +176 -0
- wisent/core/tracking/__init__.py +54 -0
- wisent/core/tracking/latency.py +620 -0
- wisent/core/tracking/memory.py +360 -0
- wisent/core/trainers/__init__.py +0 -0
- wisent/core/trainers/core/__init__.py +11 -0
- wisent/core/trainers/core/atoms.py +45 -0
- wisent/core/trainers/steering_trainer.py +365 -0
- wisent/core/universal_subspace.py +918 -0
- wisent/core/user_model_config.py +158 -0
- wisent/core/utils/__init__.py +64 -0
- wisent/core/utils/base_rotator.py +292 -0
- wisent/core/utils/dataset_splits.py +197 -0
- wisent/core/utils/device.py +279 -0
- wisent/core/weight_modification/__init__.py +134 -0
- wisent/core/weight_modification/additive.py +340 -0
- wisent/core/weight_modification/directional.py +1357 -0
- wisent/core/weight_modification/export.py +359 -0
- wisent/core/weight_modification/multi_direction.py +410 -0
- wisent/core/weight_modification/utils.py +236 -0
- wisent/core/wisent.py +660 -0
- wisent/examples/contrastive_pairs/humanization_human_vs_ai.json +2112 -0
- wisent/examples/scripts/1/test_basqueglue_evaluation.json +51 -0
- wisent/examples/scripts/1/test_basqueglue_pairs.json +14 -0
- wisent/examples/scripts/1/test_bec2016eu_evaluation.json +51 -0
- wisent/examples/scripts/1/test_bec2016eu_pairs.json +14 -0
- wisent/examples/scripts/1/test_belebele_evaluation.json +51 -0
- wisent/examples/scripts/1/test_belebele_pairs.json +14 -0
- wisent/examples/scripts/1/test_benchmarks_evaluation.json +51 -0
- wisent/examples/scripts/1/test_benchmarks_pairs.json +14 -0
- wisent/examples/scripts/1/test_bertaqa_evaluation.json +51 -0
- wisent/examples/scripts/1/test_bertaqa_pairs.json +14 -0
- wisent/examples/scripts/1/test_bhtc_v2_evaluation.json +30 -0
- wisent/examples/scripts/1/test_bhtc_v2_pairs.json +8 -0
- wisent/examples/scripts/1/test_boolq-seq2seq_evaluation.json +30 -0
- wisent/examples/scripts/1/test_boolq-seq2seq_pairs.json +8 -0
- wisent/examples/scripts/1/test_cabreu_evaluation.json +30 -0
- wisent/examples/scripts/1/test_cabreu_pairs.json +8 -0
- wisent/examples/scripts/1/test_careqa_en_evaluation.json +30 -0
- wisent/examples/scripts/1/test_careqa_en_pairs.json +8 -0
- wisent/examples/scripts/1/test_careqa_evaluation.json +30 -0
- wisent/examples/scripts/1/test_careqa_pairs.json +8 -0
- wisent/examples/scripts/1/test_catalanqa_evaluation.json +30 -0
- wisent/examples/scripts/1/test_catalanqa_pairs.json +8 -0
- wisent/examples/scripts/1/test_catcola_evaluation.json +30 -0
- wisent/examples/scripts/1/test_catcola_pairs.json +8 -0
- wisent/examples/scripts/1/test_chartqa_evaluation.json +30 -0
- wisent/examples/scripts/1/test_chartqa_pairs.json +8 -0
- wisent/examples/scripts/1/test_claim_stance_topic_evaluation.json +30 -0
- wisent/examples/scripts/1/test_claim_stance_topic_pairs.json +8 -0
- wisent/examples/scripts/1/test_cnn_dailymail_evaluation.json +30 -0
- wisent/examples/scripts/1/test_cnn_dailymail_pairs.json +8 -0
- wisent/examples/scripts/1/test_cocoteros_es_evaluation.json +30 -0
- wisent/examples/scripts/1/test_cocoteros_es_pairs.json +8 -0
- wisent/examples/scripts/1/test_coedit_gec_evaluation.json +30 -0
- wisent/examples/scripts/1/test_coedit_gec_pairs.json +8 -0
- wisent/examples/scripts/1/test_cola_evaluation.json +30 -0
- wisent/examples/scripts/1/test_cola_pairs.json +8 -0
- wisent/examples/scripts/1/test_coqcat_evaluation.json +30 -0
- wisent/examples/scripts/1/test_coqcat_pairs.json +8 -0
- wisent/examples/scripts/1/test_dbpedia_14_evaluation.json +30 -0
- wisent/examples/scripts/1/test_dbpedia_14_pairs.json +8 -0
- wisent/examples/scripts/1/test_epec_koref_bin_evaluation.json +30 -0
- wisent/examples/scripts/1/test_epec_koref_bin_pairs.json +8 -0
- wisent/examples/scripts/1/test_ethos_binary_evaluation.json +30 -0
- wisent/examples/scripts/1/test_ethos_binary_pairs.json +8 -0
- wisent/examples/scripts/2/test_afrimgsm_direct_amh_evaluation.json +30 -0
- wisent/examples/scripts/2/test_afrimgsm_direct_amh_pairs.json +8 -0
- wisent/examples/scripts/2/test_afrimmlu_direct_amh_evaluation.json +30 -0
- wisent/examples/scripts/2/test_afrimmlu_direct_amh_pairs.json +8 -0
- wisent/examples/scripts/2/test_afrixnli_en_direct_amh_evaluation.json +30 -0
- wisent/examples/scripts/2/test_afrixnli_en_direct_amh_pairs.json +8 -0
- wisent/examples/scripts/2/test_arc_ar_evaluation.json +30 -0
- wisent/examples/scripts/2/test_arc_ar_pairs.json +8 -0
- wisent/examples/scripts/2/test_atis_evaluation.json +30 -0
- wisent/examples/scripts/2/test_atis_pairs.json +8 -0
- wisent/examples/scripts/2/test_babi_evaluation.json +30 -0
- wisent/examples/scripts/2/test_babi_pairs.json +8 -0
- wisent/examples/scripts/2/test_babilong_evaluation.json +30 -0
- wisent/examples/scripts/2/test_babilong_pairs.json +8 -0
- wisent/examples/scripts/2/test_bangla_mmlu_evaluation.json +30 -0
- wisent/examples/scripts/2/test_bangla_mmlu_pairs.json +8 -0
- wisent/examples/scripts/2/test_basque-glue_pairs.json +14 -0
- wisent/examples/scripts/benchmark_tags.json +2140 -0
- wisent/examples/scripts/lm_eval_readme.json +4 -0
- wisent/examples/scripts/results/benchmark_descriptions.json +1244 -0
- wisent/examples/scripts/results/benchmark_evaluation_methods.json +66 -0
- wisent/examples/scripts/results/benchmark_evaluator_mapping.json +2781 -0
- wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +30536 -0
- wisent/examples/scripts/results/benchmark_evaluators_clean.json +469 -0
- wisent/examples/scripts/results/benchmark_methods_summary.json +260 -0
- wisent/examples/scripts/results/benchmark_pair_creation_methods.json +66 -0
- wisent/examples/scripts/results/benchmark_pair_totals.json +269 -0
- wisent/examples/scripts/results/benchmark_tags.json +917 -0
- wisent/examples/scripts/results/benchmark_test_summary_nov4.json +71 -0
- wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +150 -0
- wisent/examples/scripts/results/failing_benchmarks.json +946 -0
- wisent/examples/scripts/results/failing_benchmarks_list.json +41 -0
- wisent/examples/scripts/results/failing_benchmarks_test_results.json +945 -0
- wisent/examples/scripts/results/missing_benchmark_tags.json +341 -0
- wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +30 -0
- wisent/examples/scripts/results/test_20_newsgroups_pairs.json +8 -0
- wisent/examples/scripts/results/test_AraDICE_evaluation.json +51 -0
- wisent/examples/scripts/results/test_AraDICE_pairs.json +14 -0
- wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +30 -0
- wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +8 -0
- wisent/examples/scripts/results/test_ArabCulture_evaluation.json +51 -0
- wisent/examples/scripts/results/test_ArabCulture_pairs.json +14 -0
- wisent/examples/scripts/results/test_Tag_evaluation.json +30 -0
- wisent/examples/scripts/results/test_Tag_pairs.json +8 -0
- wisent/examples/scripts/results/test_aclue_evaluation.json +51 -0
- wisent/examples/scripts/results/test_aclue_pairs.json +14 -0
- wisent/examples/scripts/results/test_acp_bench_evaluation.json +51 -0
- wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +51 -0
- wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +14 -0
- wisent/examples/scripts/results/test_acp_bench_pairs.json +14 -0
- wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +51 -0
- wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +14 -0
- wisent/examples/scripts/results/test_aexams_evaluation.json +51 -0
- wisent/examples/scripts/results/test_aexams_pairs.json +14 -0
- wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +30 -0
- wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +8 -0
- wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +30 -0
- wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +8 -0
- wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +30 -0
- wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +8 -0
- wisent/examples/scripts/results/test_ag_news_evaluation.json +30 -0
- wisent/examples/scripts/results/test_ag_news_pairs.json +8 -0
- wisent/examples/scripts/results/test_agieval_evaluation.json +51 -0
- wisent/examples/scripts/results/test_agieval_pairs.json +14 -0
- wisent/examples/scripts/results/test_aime2024_evaluation.json +30 -0
- wisent/examples/scripts/results/test_aime2024_pairs.json +8 -0
- wisent/examples/scripts/results/test_aime2025_evaluation.json +30 -0
- wisent/examples/scripts/results/test_aime2025_pairs.json +8 -0
- wisent/examples/scripts/results/test_aime_evaluation.json +30 -0
- wisent/examples/scripts/results/test_aime_pairs.json +8 -0
- wisent/examples/scripts/results/test_anagrams1_evaluation.json +30 -0
- wisent/examples/scripts/results/test_anagrams1_pairs.json +8 -0
- wisent/examples/scripts/results/test_anagrams2_evaluation.json +30 -0
- wisent/examples/scripts/results/test_anagrams2_pairs.json +8 -0
- wisent/examples/scripts/results/test_anli_evaluation.json +30 -0
- wisent/examples/scripts/results/test_anli_pairs.json +8 -0
- wisent/examples/scripts/results/test_apps_evaluation.json +30 -0
- wisent/examples/scripts/results/test_apps_pairs.json +8 -0
- wisent/examples/scripts/results/test_arabic_exams_evaluation.json +30 -0
- wisent/examples/scripts/results/test_arabic_exams_pairs.json +8 -0
- wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +51 -0
- wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +14 -0
- wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +51 -0
- wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +14 -0
- wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +51 -0
- wisent/examples/scripts/results/test_arabicmmlu_pairs.json +14 -0
- wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +51 -0
- wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +14 -0
- wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +51 -0
- wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +14 -0
- wisent/examples/scripts/results/test_arc_ar_evaluation.json +30 -0
- wisent/examples/scripts/results/test_arc_ar_pairs.json +8 -0
- wisent/examples/scripts/results/test_arc_challenge_evaluation.json +30 -0
- wisent/examples/scripts/results/test_arc_challenge_pairs.json +8 -0
- wisent/examples/scripts/results/test_arc_easy_evaluation.json +30 -0
- wisent/examples/scripts/results/test_arc_easy_pairs.json +8 -0
- wisent/examples/scripts/results/test_argument_topic_evaluation.json +30 -0
- wisent/examples/scripts/results/test_argument_topic_pairs.json +8 -0
- wisent/examples/scripts/results/test_arithmetic_evaluation.json +51 -0
- wisent/examples/scripts/results/test_arithmetic_pairs.json +14 -0
- wisent/examples/scripts/results/test_asdiv_evaluation.json +30 -0
- wisent/examples/scripts/results/test_asdiv_pairs.json +8 -0
- wisent/examples/scripts/results/test_assin_entailment_evaluation.json +30 -0
- wisent/examples/scripts/results/test_assin_entailment_pairs.json +8 -0
- wisent/examples/scripts/results/test_atis_evaluation.json +30 -0
- wisent/examples/scripts/results/test_atis_pairs.json +8 -0
- wisent/examples/scripts/results/test_babi_evaluation.json +30 -0
- wisent/examples/scripts/results/test_babi_pairs.json +8 -0
- wisent/examples/scripts/results/test_babilong_evaluation.json +30 -0
- wisent/examples/scripts/results/test_babilong_pairs.json +8 -0
- wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +30 -0
- wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +8 -0
- wisent/examples/scripts/results/test_banking77_evaluation.json +30 -0
- wisent/examples/scripts/results/test_banking77_pairs.json +8 -0
- wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +14 -0
- wisent/examples/scripts/results/test_basque-glue_evaluation.json +51 -0
- wisent/examples/scripts/results/test_basque-glue_pairs.json +14 -0
- wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +51 -0
- wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +14 -0
- wisent/examples/scripts/results/test_basque_bench_evaluation.json +51 -0
- wisent/examples/scripts/results/test_basque_bench_pairs.json +14 -0
- wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +51 -0
- wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +14 -0
- wisent/examples/scripts/results/test_basqueglue_evaluation.json +51 -0
- wisent/examples/scripts/results/test_basqueglue_pairs.json +14 -0
- wisent/examples/scripts/results/test_bbh_evaluation.json +51 -0
- wisent/examples/scripts/results/test_bbh_pairs.json +14 -0
- wisent/examples/scripts/results/test_bbq_evaluation.json +30 -0
- wisent/examples/scripts/results/test_bbq_pairs.json +8 -0
- wisent/examples/scripts/results/test_bec2016eu_evaluation.json +51 -0
- wisent/examples/scripts/results/test_bec2016eu_pairs.json +14 -0
- wisent/examples/scripts/results/test_belebele_evaluation.json +51 -0
- wisent/examples/scripts/results/test_belebele_pairs.json +14 -0
- wisent/examples/scripts/results/test_benchmarks_evaluation.json +51 -0
- wisent/examples/scripts/results/test_benchmarks_pairs.json +14 -0
- wisent/examples/scripts/results/test_bertaqa_evaluation.json +51 -0
- wisent/examples/scripts/results/test_bertaqa_pairs.json +14 -0
- wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +30 -0
- wisent/examples/scripts/results/test_bhtc_v2_pairs.json +8 -0
- wisent/examples/scripts/results/test_bigbench_evaluation.json +51 -0
- wisent/examples/scripts/results/test_bigbench_pairs.json +14 -0
- wisent/examples/scripts/results/test_blimp_evaluation.json +51 -0
- wisent/examples/scripts/results/test_blimp_pairs.json +14 -0
- wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +30 -0
- wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +8 -0
- wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +30 -0
- wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +8 -0
- wisent/examples/scripts/results/test_boolq_evaluation.json +30 -0
- wisent/examples/scripts/results/test_boolq_pairs.json +8 -0
- wisent/examples/scripts/results/test_c4_evaluation.json +30 -0
- wisent/examples/scripts/results/test_c4_pairs.json +8 -0
- wisent/examples/scripts/results/test_cabreu_evaluation.json +30 -0
- wisent/examples/scripts/results/test_cabreu_pairs.json +8 -0
- wisent/examples/scripts/results/test_careqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_careqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_catalan_bench_evaluation.json +51 -0
- wisent/examples/scripts/results/test_catalan_bench_pairs.json +14 -0
- wisent/examples/scripts/results/test_catalanqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_catalanqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_catcola_evaluation.json +30 -0
- wisent/examples/scripts/results/test_catcola_pairs.json +8 -0
- wisent/examples/scripts/results/test_cb_evaluation.json +30 -0
- wisent/examples/scripts/results/test_cb_pairs.json +8 -0
- wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +51 -0
- wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +14 -0
- wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +30 -0
- wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +8 -0
- wisent/examples/scripts/results/test_ceval_evaluation.json +51 -0
- wisent/examples/scripts/results/test_ceval_pairs.json +14 -0
- wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +51 -0
- wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +14 -0
- wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +51 -0
- wisent/examples/scripts/results/test_chain_of_thought_pairs.json +14 -0
- wisent/examples/scripts/results/test_chartqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_chartqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +30 -0
- wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +8 -0
- wisent/examples/scripts/results/test_cmmlu_evaluation.json +51 -0
- wisent/examples/scripts/results/test_cmmlu_pairs.json +14 -0
- wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +30 -0
- wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +8 -0
- wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +30 -0
- wisent/examples/scripts/results/test_cocoteros_es_pairs.json +8 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +30 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +8 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +30 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +8 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +30 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +8 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +30 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +8 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +30 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +8 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +30 -0
- wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +8 -0
- wisent/examples/scripts/results/test_coedit_gec_evaluation.json +30 -0
- wisent/examples/scripts/results/test_coedit_gec_pairs.json +8 -0
- wisent/examples/scripts/results/test_cola_evaluation.json +30 -0
- wisent/examples/scripts/results/test_cola_pairs.json +8 -0
- wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_commonsense_qa_pairs.json +8 -0
- wisent/examples/scripts/results/test_conala_evaluation.json +30 -0
- wisent/examples/scripts/results/test_conala_pairs.json +8 -0
- wisent/examples/scripts/results/test_concode_evaluation.json +30 -0
- wisent/examples/scripts/results/test_concode_pairs.json +8 -0
- wisent/examples/scripts/results/test_copa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_copa_pairs.json +8 -0
- wisent/examples/scripts/results/test_copal_id_evaluation.json +30 -0
- wisent/examples/scripts/results/test_copal_id_pairs.json +8 -0
- wisent/examples/scripts/results/test_coqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_coqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_coqcat_evaluation.json +30 -0
- wisent/examples/scripts/results/test_coqcat_pairs.json +8 -0
- wisent/examples/scripts/results/test_crows_pairs_evaluation.json +51 -0
- wisent/examples/scripts/results/test_crows_pairs_pairs.json +14 -0
- wisent/examples/scripts/results/test_csatqa_evaluation.json +51 -0
- wisent/examples/scripts/results/test_csatqa_pairs.json +14 -0
- wisent/examples/scripts/results/test_cycle_letters_evaluation.json +30 -0
- wisent/examples/scripts/results/test_cycle_letters_pairs.json +8 -0
- wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +51 -0
- wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +14 -0
- wisent/examples/scripts/results/test_darija_bench_evaluation.json +51 -0
- wisent/examples/scripts/results/test_darija_bench_pairs.json +14 -0
- wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +30 -0
- wisent/examples/scripts/results/test_darijahellaswag_pairs.json +8 -0
- wisent/examples/scripts/results/test_darijammlu_evaluation.json +51 -0
- wisent/examples/scripts/results/test_darijammlu_pairs.json +14 -0
- wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +30 -0
- wisent/examples/scripts/results/test_dbpedia_14_pairs.json +8 -0
- wisent/examples/scripts/results/test_drop_evaluation.json +30 -0
- wisent/examples/scripts/results/test_drop_pairs.json +8 -0
- wisent/examples/scripts/results/test_ds1000_evaluation.json +30 -0
- wisent/examples/scripts/results/test_ds1000_pairs.json +8 -0
- wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +30 -0
- wisent/examples/scripts/results/test_egyhellaswag_pairs.json +8 -0
- wisent/examples/scripts/results/test_egymmlu_evaluation.json +51 -0
- wisent/examples/scripts/results/test_egymmlu_pairs.json +14 -0
- wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +30 -0
- wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +8 -0
- wisent/examples/scripts/results/test_eq_bench_evaluation.json +30 -0
- wisent/examples/scripts/results/test_eq_bench_pairs.json +8 -0
- wisent/examples/scripts/results/test_escola_evaluation.json +30 -0
- wisent/examples/scripts/results/test_escola_pairs.json +8 -0
- wisent/examples/scripts/results/test_ethics_cm_evaluation.json +30 -0
- wisent/examples/scripts/results/test_ethics_cm_pairs.json +8 -0
- wisent/examples/scripts/results/test_ethos_binary_evaluation.json +30 -0
- wisent/examples/scripts/results/test_ethos_binary_pairs.json +8 -0
- wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +51 -0
- wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +14 -0
- wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +51 -0
- wisent/examples/scripts/results/test_eus_exams_es_pairs.json +14 -0
- wisent/examples/scripts/results/test_eus_exams_evaluation.json +51 -0
- wisent/examples/scripts/results/test_eus_exams_pairs.json +14 -0
- wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +30 -0
- wisent/examples/scripts/results/test_eus_proficiency_pairs.json +8 -0
- wisent/examples/scripts/results/test_eus_reading_evaluation.json +30 -0
- wisent/examples/scripts/results/test_eus_reading_pairs.json +8 -0
- wisent/examples/scripts/results/test_eus_trivia_evaluation.json +30 -0
- wisent/examples/scripts/results/test_eus_trivia_pairs.json +8 -0
- wisent/examples/scripts/results/test_evalita-mp_evaluation.json +51 -0
- wisent/examples/scripts/results/test_evalita-mp_pairs.json +14 -0
- wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +30 -0
- wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +8 -0
- wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +51 -0
- wisent/examples/scripts/results/test_evalita_LLM_pairs.json +14 -0
- wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +51 -0
- wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +14 -0
- wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +30 -0
- wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +8 -0
- wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +51 -0
- wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +14 -0
- wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +30 -0
- wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +8 -0
- wisent/examples/scripts/results/test_fda_evaluation.json +30 -0
- wisent/examples/scripts/results/test_fda_pairs.json +8 -0
- wisent/examples/scripts/results/test_financial_tweets_evaluation.json +30 -0
- wisent/examples/scripts/results/test_financial_tweets_pairs.json +8 -0
- wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +30 -0
- wisent/examples/scripts/results/test_fld/test_fld_pairs.json +8 -0
- wisent/examples/scripts/results/test_fld_evaluation.json +30 -0
- wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +30 -0
- wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +8 -0
- wisent/examples/scripts/results/test_fld_pairs.json +8 -0
- wisent/examples/scripts/results/test_flores_evaluation.json +51 -0
- wisent/examples/scripts/results/test_flores_pairs.json +14 -0
- wisent/examples/scripts/results/test_freebase_evaluation.json +30 -0
- wisent/examples/scripts/results/test_freebase_pairs.json +8 -0
- wisent/examples/scripts/results/test_french_bench_evaluation.json +51 -0
- wisent/examples/scripts/results/test_french_bench_pairs.json +14 -0
- wisent/examples/scripts/results/test_galcola_evaluation.json +30 -0
- wisent/examples/scripts/results/test_galcola_pairs.json +8 -0
- wisent/examples/scripts/results/test_galician_bench_evaluation.json +51 -0
- wisent/examples/scripts/results/test_galician_bench_pairs.json +14 -0
- wisent/examples/scripts/results/test_glianorex_evaluation.json +30 -0
- wisent/examples/scripts/results/test_glianorex_pairs.json +8 -0
- wisent/examples/scripts/results/test_global_mmlu_evaluation.json +51 -0
- wisent/examples/scripts/results/test_global_mmlu_pairs.json +14 -0
- wisent/examples/scripts/results/test_glue_evaluation.json +51 -0
- wisent/examples/scripts/results/test_glue_pairs.json +14 -0
- wisent/examples/scripts/results/test_gpqa_evaluation.json +51 -0
- wisent/examples/scripts/results/test_gpqa_pairs.json +14 -0
- wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +51 -0
- wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +14 -0
- wisent/examples/scripts/results/test_groundcocoa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_groundcocoa_pairs.json +8 -0
- wisent/examples/scripts/results/test_gsm8k_evaluation.json +30 -0
- wisent/examples/scripts/results/test_gsm8k_pairs.json +8 -0
- wisent/examples/scripts/results/test_haerae_evaluation.json +51 -0
- wisent/examples/scripts/results/test_haerae_pairs.json +14 -0
- wisent/examples/scripts/results/test_headqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_headqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_hellaswag_evaluation.json +30 -0
- wisent/examples/scripts/results/test_hellaswag_pairs.json +8 -0
- wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +51 -0
- wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +14 -0
- wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +51 -0
- wisent/examples/scripts/results/test_hendrycks_math_pairs.json +14 -0
- wisent/examples/scripts/results/test_histoires_morales_evaluation.json +30 -0
- wisent/examples/scripts/results/test_histoires_morales_pairs.json +8 -0
- wisent/examples/scripts/results/test_hmmt_evaluation.json +30 -0
- wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +30 -0
- wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +8 -0
- wisent/examples/scripts/results/test_hmmt_pairs.json +8 -0
- wisent/examples/scripts/results/test_hrm8k_evaluation.json +51 -0
- wisent/examples/scripts/results/test_hrm8k_pairs.json +14 -0
- wisent/examples/scripts/results/test_humaneval_evaluation.json +30 -0
- wisent/examples/scripts/results/test_humaneval_pairs.json +8 -0
- wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +30 -0
- wisent/examples/scripts/results/test_humaneval_plus_pairs.json +8 -0
- wisent/examples/scripts/results/test_ifeval_evaluation.json +30 -0
- wisent/examples/scripts/results/test_ifeval_pairs.json +8 -0
- wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +30 -0
- wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +8 -0
- wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +30 -0
- wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +8 -0
- wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +51 -0
- wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +30 -0
- wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +8 -0
- wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +51 -0
- wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +14 -0
- wisent/examples/scripts/results/test_inverse_scaling_pairs.json +14 -0
- wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +30 -0
- wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +8 -0
- wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +30 -0
- wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +8 -0
- wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +30 -0
- wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +8 -0
- wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +30 -0
- wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +8 -0
- wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +30 -0
- wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +8 -0
- wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +51 -0
- wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +14 -0
- wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +30 -0
- wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +8 -0
- wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +30 -0
- wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +30 -0
- wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +8 -0
- wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +8 -0
- wisent/examples/scripts/results/test_kbl_evaluation.json +51 -0
- wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +51 -0
- wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +14 -0
- wisent/examples/scripts/results/test_kbl_pairs.json +14 -0
- wisent/examples/scripts/results/test_kmmlu_evaluation.json +51 -0
- wisent/examples/scripts/results/test_kmmlu_pairs.json +14 -0
- wisent/examples/scripts/results/test_kobest_evaluation.json +51 -0
- wisent/examples/scripts/results/test_kobest_pairs.json +14 -0
- wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +30 -0
- wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +8 -0
- wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_kormedmcqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +30 -0
- wisent/examples/scripts/results/test_lambada_cloze_pairs.json +8 -0
- wisent/examples/scripts/results/test_lambada_evaluation.json +30 -0
- wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
- wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
- wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +51 -0
- wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +14 -0
- wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +51 -0
- wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +14 -0
- wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +51 -0
- wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +14 -0
- wisent/examples/scripts/results/test_lambada_openai_evaluation.json +30 -0
- wisent/examples/scripts/results/test_lambada_openai_pairs.json +8 -0
- wisent/examples/scripts/results/test_lambada_pairs.json +8 -0
- wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
- wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
- wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
- wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
- wisent/examples/scripts/results/test_lambada_standard_evaluation.json +30 -0
- wisent/examples/scripts/results/test_lambada_standard_pairs.json +8 -0
- wisent/examples/scripts/results/test_leaderboard_evaluation.json +51 -0
- wisent/examples/scripts/results/test_leaderboard_pairs.json +14 -0
- wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +51 -0
- wisent/examples/scripts/results/test_libra/test_libra_pairs.json +14 -0
- wisent/examples/scripts/results/test_libra_evaluation.json +51 -0
- wisent/examples/scripts/results/test_libra_pairs.json +14 -0
- wisent/examples/scripts/results/test_lingoly_evaluation.json +30 -0
- wisent/examples/scripts/results/test_lingoly_pairs.json +8 -0
- wisent/examples/scripts/results/test_livecodebench_evaluation.json +30 -0
- wisent/examples/scripts/results/test_livecodebench_pairs.json +8 -0
- wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +30 -0
- wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +8 -0
- wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +30 -0
- wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +8 -0
- wisent/examples/scripts/results/test_llama_evaluation.json +30 -0
- wisent/examples/scripts/results/test_llama_pairs.json +8 -0
- wisent/examples/scripts/results/test_logiqa2_evaluation.json +30 -0
- wisent/examples/scripts/results/test_logiqa2_pairs.json +8 -0
- wisent/examples/scripts/results/test_logiqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_logiqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_m_mmlu_evaluation.json +51 -0
- wisent/examples/scripts/results/test_m_mmlu_pairs.json +14 -0
- wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +51 -0
- wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +14 -0
- wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +8 -0
- wisent/examples/scripts/results/test_mastermind_evaluation.json +51 -0
- wisent/examples/scripts/results/test_mastermind_pairs.json +14 -0
- wisent/examples/scripts/results/test_math500_evaluation.json +30 -0
- wisent/examples/scripts/results/test_math500_pairs.json +8 -0
- wisent/examples/scripts/results/test_math_evaluation.json +30 -0
- wisent/examples/scripts/results/test_math_pairs.json +8 -0
- wisent/examples/scripts/results/test_mathqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mathqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_mbpp_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mbpp_pairs.json +8 -0
- wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mbpp_plus_pairs.json +8 -0
- wisent/examples/scripts/results/test_mc_taco_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mc_taco_pairs.json +8 -0
- wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +51 -0
- wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +14 -0
- wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +30 -0
- wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +8 -0
- wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +51 -0
- wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +14 -0
- wisent/examples/scripts/results/test_meddialog_evaluation.json +30 -0
- wisent/examples/scripts/results/test_meddialog_pairs.json +8 -0
- wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +30 -0
- wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +8 -0
- wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +8 -0
- wisent/examples/scripts/results/test_medmcqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_medmcqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_medqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_medqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_medtext_evaluation.json +30 -0
- wisent/examples/scripts/results/test_medtext_pairs.json +8 -0
- wisent/examples/scripts/results/test_mela_evaluation.json +51 -0
- wisent/examples/scripts/results/test_mela_pairs.json +14 -0
- wisent/examples/scripts/results/test_meqsum_evaluation.json +30 -0
- wisent/examples/scripts/results/test_meqsum_pairs.json +8 -0
- wisent/examples/scripts/results/test_mercury_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mercury_pairs.json +8 -0
- wisent/examples/scripts/results/test_metabench_evaluation.json +51 -0
- wisent/examples/scripts/results/test_metabench_pairs.json +14 -0
- wisent/examples/scripts/results/test_mgsm_evaluation.json +51 -0
- wisent/examples/scripts/results/test_mgsm_pairs.json +14 -0
- wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mimic_repsum_pairs.json +8 -0
- wisent/examples/scripts/results/test_minerva_math_evaluation.json +51 -0
- wisent/examples/scripts/results/test_minerva_math_pairs.json +14 -0
- wisent/examples/scripts/results/test_mlqa_evaluation.json +51 -0
- wisent/examples/scripts/results/test_mlqa_pairs.json +14 -0
- wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +51 -0
- wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +14 -0
- wisent/examples/scripts/results/test_mmlu_evaluation.json +51 -0
- wisent/examples/scripts/results/test_mmlu_pairs.json +14 -0
- wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +51 -0
- wisent/examples/scripts/results/test_mmlu_pro_pairs.json +14 -0
- wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +51 -0
- wisent/examples/scripts/results/test_mmlu_prox_pairs.json +14 -0
- wisent/examples/scripts/results/test_mmlusr_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mmlusr_pairs.json +8 -0
- wisent/examples/scripts/results/test_mmmu_evaluation.json +51 -0
- wisent/examples/scripts/results/test_mmmu_pairs.json +14 -0
- wisent/examples/scripts/results/test_mnli_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mnli_pairs.json +8 -0
- wisent/examples/scripts/results/test_model_written_evals_evaluation.json +51 -0
- wisent/examples/scripts/results/test_model_written_evals_pairs.json +14 -0
- wisent/examples/scripts/results/test_moral_stories_evaluation.json +30 -0
- wisent/examples/scripts/results/test_moral_stories_pairs.json +8 -0
- wisent/examples/scripts/results/test_mts_dialog_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mts_dialog_pairs.json +8 -0
- wisent/examples/scripts/results/test_multiblimp_evaluation.json +51 -0
- wisent/examples/scripts/results/test_multiblimp_pairs.json +14 -0
- wisent/examples/scripts/results/test_multimedqa_evaluation.json +51 -0
- wisent/examples/scripts/results/test_multimedqa_pairs.json +14 -0
- wisent/examples/scripts/results/test_multipl_e_evaluation.json +30 -0
- wisent/examples/scripts/results/test_multipl_e_pairs.json +8 -0
- wisent/examples/scripts/results/test_mutual_evaluation.json +30 -0
- wisent/examples/scripts/results/test_mutual_pairs.json +8 -0
- wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +30 -0
- wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +8 -0
- wisent/examples/scripts/results/test_noreval_evaluation.json +51 -0
- wisent/examples/scripts/results/test_noreval_pairs.json +14 -0
- wisent/examples/scripts/results/test_noticia_evaluation.json +30 -0
- wisent/examples/scripts/results/test_noticia_pairs.json +8 -0
- wisent/examples/scripts/results/test_nq_open_evaluation.json +30 -0
- wisent/examples/scripts/results/test_nq_open_pairs.json +8 -0
- wisent/examples/scripts/results/test_olaph_evaluation.json +30 -0
- wisent/examples/scripts/results/test_olaph_pairs.json +8 -0
- wisent/examples/scripts/results/test_openbookqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_openbookqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_openllm_evaluation.json +51 -0
- wisent/examples/scripts/results/test_openllm_pairs.json +14 -0
- wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +30 -0
- wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +8 -0
- wisent/examples/scripts/results/test_paloma_evaluation.json +51 -0
- wisent/examples/scripts/results/test_paloma_pairs.json +14 -0
- wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +30 -0
- wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +8 -0
- wisent/examples/scripts/results/test_paws-x_evaluation.json +51 -0
- wisent/examples/scripts/results/test_paws-x_pairs.json +14 -0
- wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +30 -0
- wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +8 -0
- wisent/examples/scripts/results/test_penn_treebank_evaluation.json +30 -0
- wisent/examples/scripts/results/test_penn_treebank_pairs.json +8 -0
- wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +30 -0
- wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +8 -0
- wisent/examples/scripts/results/test_piqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_piqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_polemo2_evaluation.json +30 -0
- wisent/examples/scripts/results/test_polemo2_pairs.json +8 -0
- wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +30 -0
- wisent/examples/scripts/results/test_polymath_en_high_pairs.json +8 -0
- wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +30 -0
- wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +8 -0
- wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +30 -0
- wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +8 -0
- wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +30 -0
- wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +8 -0
- wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +51 -0
- wisent/examples/scripts/results/test_portuguese_bench_pairs.json +14 -0
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +30 -0
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +8 -0
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +30 -0
- wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +8 -0
- wisent/examples/scripts/results/test_prost_evaluation.json +30 -0
- wisent/examples/scripts/results/test_prost_pairs.json +8 -0
- wisent/examples/scripts/results/test_ptb_evaluation.json +30 -0
- wisent/examples/scripts/results/test_ptb_pairs.json +8 -0
- wisent/examples/scripts/results/test_pubmedqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_pubmedqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_pythia_evaluation.json +51 -0
- wisent/examples/scripts/results/test_pythia_pairs.json +14 -0
- wisent/examples/scripts/results/test_qa4mre_evaluation.json +30 -0
- wisent/examples/scripts/results/test_qa4mre_pairs.json +8 -0
- wisent/examples/scripts/results/test_qasper_evaluation.json +30 -0
- wisent/examples/scripts/results/test_qasper_pairs.json +8 -0
- wisent/examples/scripts/results/test_race_evaluation.json +30 -0
- wisent/examples/scripts/results/test_race_pairs.json +8 -0
- wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +30 -0
- wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +8 -0
- wisent/examples/scripts/results/test_recode_evaluation.json +30 -0
- wisent/examples/scripts/results/test_recode_pairs.json +8 -0
- wisent/examples/scripts/results/test_record_evaluation.json +30 -0
- wisent/examples/scripts/results/test_record_pairs.json +8 -0
- wisent/examples/scripts/results/test_ruler_evaluation.json +51 -0
- wisent/examples/scripts/results/test_ruler_pairs.json +14 -0
- wisent/examples/scripts/results/test_sciq_evaluation.json +30 -0
- wisent/examples/scripts/results/test_sciq_pairs.json +8 -0
- wisent/examples/scripts/results/test_score_evaluation.json +51 -0
- wisent/examples/scripts/results/test_score_pairs.json +14 -0
- wisent/examples/scripts/results/test_self_consistency_evaluation.json +30 -0
- wisent/examples/scripts/results/test_self_consistency_pairs.json +8 -0
- wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_siqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_siqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_spanish_bench_evaluation.json +51 -0
- wisent/examples/scripts/results/test_spanish_bench_pairs.json +14 -0
- wisent/examples/scripts/results/test_squad2_evaluation.json +30 -0
- wisent/examples/scripts/results/test_squad2_pairs.json +8 -0
- wisent/examples/scripts/results/test_squadv2_evaluation.json +30 -0
- wisent/examples/scripts/results/test_squadv2_pairs.json +8 -0
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +30 -0
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +8 -0
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +51 -0
- wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +14 -0
- wisent/examples/scripts/results/test_swag_evaluation.json +30 -0
- wisent/examples/scripts/results/test_swag_pairs.json +8 -0
- wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +51 -0
- wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +14 -0
- wisent/examples/scripts/results/test_tmmluplus_evaluation.json +51 -0
- wisent/examples/scripts/results/test_tmmluplus_pairs.json +14 -0
- wisent/examples/scripts/results/test_translation_evaluation.json +51 -0
- wisent/examples/scripts/results/test_translation_pairs.json +14 -0
- wisent/examples/scripts/results/test_triviaqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_triviaqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +51 -0
- wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +14 -0
- wisent/examples/scripts/results/test_truthfulqa_evaluation.json +30 -0
- wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +30 -0
- wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +8 -0
- wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +30 -0
- wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +8 -0
- wisent/examples/scripts/results/test_truthfulqa_pairs.json +8 -0
- wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +51 -0
- wisent/examples/scripts/results/test_turkishmmlu_pairs.json +14 -0
- wisent/examples/scripts/results/test_unfair_tos_evaluation.json +30 -0
- wisent/examples/scripts/results/test_unfair_tos_pairs.json +8 -0
- wisent/examples/scripts/results/test_unscramble_evaluation.json +51 -0
- wisent/examples/scripts/results/test_unscramble_pairs.json +14 -0
- wisent/examples/scripts/results/test_webqs_evaluation.json +30 -0
- wisent/examples/scripts/results/test_webqs_pairs.json +8 -0
- wisent/examples/scripts/results/test_wikitext103_evaluation.json +30 -0
- wisent/examples/scripts/results/test_wikitext103_pairs.json +8 -0
- wisent/examples/scripts/results/test_wikitext_evaluation.json +30 -0
- wisent/examples/scripts/results/test_wikitext_pairs.json +8 -0
- wisent/examples/scripts/results/test_winogender_evaluation.json +51 -0
- wisent/examples/scripts/results/test_winogender_pairs.json +14 -0
- wisent/examples/scripts/results/test_winogrande_evaluation.json +30 -0
- wisent/examples/scripts/results/test_winogrande_pairs.json +8 -0
- wisent/examples/scripts/results/test_wmdp_evaluation.json +30 -0
- wisent/examples/scripts/results/test_wmdp_pairs.json +8 -0
- wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +30 -0
- wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +8 -0
- wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +30 -0
- wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +8 -0
- wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +30 -0
- wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +8 -0
- wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +30 -0
- wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +8 -0
- wisent/examples/scripts/results/test_wsc273_evaluation.json +30 -0
- wisent/examples/scripts/results/test_wsc273_pairs.json +8 -0
- wisent/examples/scripts/results/test_xcopa_evaluation.json +51 -0
- wisent/examples/scripts/results/test_xcopa_pairs.json +14 -0
- wisent/examples/scripts/results/test_xnli_eu_evaluation.json +30 -0
- wisent/examples/scripts/results/test_xnli_eu_pairs.json +8 -0
- wisent/examples/scripts/results/test_xnli_evaluation.json +51 -0
- wisent/examples/scripts/results/test_xnli_pairs.json +14 -0
- wisent/examples/scripts/results/test_xquad_evaluation.json +51 -0
- wisent/examples/scripts/results/test_xquad_pairs.json +14 -0
- wisent/examples/scripts/results/test_xstorycloze_evaluation.json +51 -0
- wisent/examples/scripts/results/test_xstorycloze_pairs.json +14 -0
- wisent/examples/scripts/results/test_xsum_evaluation.json +30 -0
- wisent/examples/scripts/results/test_xsum_pairs.json +8 -0
- wisent/examples/scripts/results/test_xwinograd_evaluation.json +51 -0
- wisent/examples/scripts/results/test_xwinograd_pairs.json +14 -0
- wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +30 -0
- wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +8 -0
- wisent/parameters/__init__.py +1 -0
- wisent/parameters/lm_eval/all_lm_eval_task_families.json +169 -0
- wisent/parameters/lm_eval/broken_in_lm_eval.json +10 -0
- wisent/parameters/lm_eval/evaluations_not_lm_eval_tasks.json +0 -0
- wisent/parameters/lm_eval/evaluator_check.json +3476 -0
- wisent/parameters/lm_eval/final_verification.json +24782 -0
- wisent/parameters/lm_eval/group_task_evaluators.json +1833 -0
- wisent/parameters/lm_eval/group_tasks.json +150 -0
- wisent/parameters/lm_eval/individual_tasks.json +402 -0
- wisent/parameters/lm_eval/no_readmes.json +1 -0
- wisent/parameters/lm_eval/not_lm_eval_tasks.json +110 -0
- wisent/parameters/lm_eval/read_tasks.json +208 -0
- wisent/parameters/lm_eval/readme_files.json +208 -0
- wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +128 -0
- wisent/parameters/tasks/missing_task_families.json +2963 -0
- wisent/parameters/tasks/remaining_tasks_to_implement.json +199 -0
- wisent/parameters/tasks/risks.json +10 -0
- wisent/parameters/tasks/skills.json +14 -0
- wisent/parameters/tasks/tasks.json +56031 -0
- wisent/scripts/run_quality_metrics_sweep.sh +315 -0
- wisent/tests/__init__.py +0 -0
- wisent/tests/examples/__init__.py +0 -0
- wisent/tests/examples/cli/__init__.py +0 -0
- wisent/tests/examples/cli/activations/__init__.py +0 -0
- wisent/tests/examples/cli/activations/test_get_activations.py +127 -0
- wisent/tests/examples/cli/classifier/__init__.py +0 -0
- wisent/tests/examples/cli/classifier/test_classifier_examples.py +141 -0
- wisent/tests/examples/cli/contrastive_pairs/__init__.py +0 -0
- wisent/tests/examples/cli/contrastive_pairs/test_generate_pairs.py +89 -0
- wisent/tests/examples/cli/evaluation/__init__.py +0 -0
- wisent/tests/examples/cli/evaluation/test_evaluation_examples.py +117 -0
- wisent/tests/examples/cli/generate/__init__.py +0 -0
- wisent/tests/examples/cli/generate/test_generate_with_classifier.py +146 -0
- wisent/tests/examples/cli/generate/test_generate_with_steering.py +149 -0
- wisent/tests/examples/cli/generate/test_only_generate.py +110 -0
- wisent/tests/examples/cli/multi_steering/__init__.py +0 -0
- wisent/tests/examples/cli/multi_steering/test_multi_steer_from_trained_vectors.py +210 -0
- wisent/tests/examples/cli/multi_steering/test_multi_steer_with_different_parameters.py +205 -0
- wisent/tests/examples/cli/multi_steering/test_train_and_multi_steer.py +174 -0
- wisent/tests/examples/cli/optimizer/__init__.py +0 -0
- wisent/tests/examples/cli/optimizer/test_optimize_sample_size.py +102 -0
- wisent/tests/examples/cli/optimizer/test_optimizer_examples.py +59 -0
- wisent/tests/examples/cli/steering/__init__.py +0 -0
- wisent/tests/examples/cli/steering/test_create_steering_vectors.py +135 -0
- wisent/tests/examples/cli/synthetic/__init__.py +0 -0
- wisent/tests/examples/cli/synthetic/test_synthetic_pairs.py +45 -0
- wisent/tests/nosense/__init__.py +6 -0
- wisent/tests/nosense/base_nosense.py +81 -0
- wisent/tests/nosense/math500_nosense.py +72 -0
- wisent/tests/nosense/test_robustness.py +336 -0
- wisent/tests/test_all_cli_commands.py +674 -0
- wisent/tests/test_geometry_comprehensive.py +327 -0
- wisent/tests/test_titan_geometry.py +257 -0
- wisent/tests/visualize_geometry.py +148 -0
- wisent-0.7.379.dist-info/METADATA +64 -0
- wisent-0.7.379.dist-info/RECORD +1720 -0
- wisent-0.7.379.dist-info/WHEEL +5 -0
- wisent-0.7.379.dist-info/entry_points.txt +2 -0
- wisent-0.7.379.dist-info/licenses/LICENSE +21 -0
- wisent-0.7.379.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1453 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Task Manager for lm-evaluation-harness integration.
|
|
3
|
+
|
|
4
|
+
This module handles discovery, validation, and loading of tasks from the
|
|
5
|
+
lm-evaluation-harness library.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
import os
|
|
10
|
+
import re
|
|
11
|
+
import random
|
|
12
|
+
import yaml
|
|
13
|
+
import tempfile
|
|
14
|
+
import glob
|
|
15
|
+
from typing import List, Dict, Any, Optional, Tuple
|
|
16
|
+
from difflib import SequenceMatcher
|
|
17
|
+
|
|
18
|
+
from wisent.core.errors import TaskLoadError, TaskNotFoundError, NoDocsAvailableError
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _load_tasks_from_local_json() -> Optional[List[str]]:
    """Return task names from a bundled tasks.json, or None if no usable file exists."""
    base_dir = os.path.dirname(__file__)
    tasks_json_path = os.path.join(base_dir, "..", "..", "..", "..", "parameters", "tasks", "tasks.json")
    if not os.path.exists(tasks_json_path):
        # Try alternative path
        tasks_json_path = os.path.join(base_dir, "..", "..", "tasks.json")
    if not os.path.exists(tasks_json_path):
        return None

    with open(tasks_json_path, 'r', encoding='utf-8') as f:
        tasks_data = json.load(f)

    if 'task_list' in tasks_data and tasks_data['task_list']:
        print(f"Loaded {len(tasks_data['task_list'])} tasks from local tasks.json")
        return tasks_data['task_list']
    if 'tasks' in tasks_data:
        task_names = list(tasks_data['tasks'].keys())
        print(f"Loaded {len(task_names)} tasks from local tasks.json")
        return task_names
    return None


def _parse_lm_eval_task_table(stdout: str) -> List[str]:
    """Extract the first-column task names from the table printed by `lm_eval --tasks list`."""
    task_names = []
    for line in stdout.split('\n'):
        # Keep only table rows; skip separators and the header row.
        if '|' not in line or line.startswith('|---'):
            continue
        if 'Group' in line or 'Config Location' in line:
            continue
        parts = line.split('|')
        if len(parts) < 2:
            continue
        task_name = parts[1].strip()
        if task_name and not task_name.startswith('-') and task_name != 'Group':
            task_names.append(task_name)
    return task_names


def load_available_tasks() -> List[str]:
    """
    Load available task names from the local tasks.json file or from lm-eval.

    Sources are tried in order, and the first one that yields names wins:
      1. a tasks.json file located relative to this module,
      2. ``lm_eval.api.registry.ALL_TASKS``,
      3. the output of the ``lm_eval --tasks list`` command,
      4. ``lm_eval.api.registry.TASK_REGISTRY``, then a scan of the
         ``lm_eval.tasks`` package for non-private, non-package modules.

    Returns:
        List of task names.

    Raises:
        TaskLoadError: If the last-resort lm_eval discovery fails
            (for example because lm_eval cannot be imported).
    """
    # First try to load from local tasks.json file
    try:
        local_tasks = _load_tasks_from_local_json()
        if local_tasks is not None:
            return local_tasks
    except Exception as e:
        print(f"Warning: Could not load from local tasks.json: {e}")

    # Fallback to dynamic loading from the lm-eval registry
    try:
        from lm_eval.api.registry import ALL_TASKS
        return list(ALL_TASKS)
    except ImportError:
        pass

    # Registry symbol not importable: ask the lm_eval command line tool instead
    try:
        import subprocess
        result = subprocess.run(['lm_eval', '--tasks', 'list'],
                                capture_output=True, text=True, timeout=30)
        task_names = _parse_lm_eval_task_table(result.stdout)
        if task_names:
            return task_names
        # Nothing parsed (command failed or printed no table): keep falling back.
    except Exception:
        # Best effort only - a missing or broken CLI just moves us to the next source.
        pass

    # Final fallback - try to discover from lm_eval module internals
    try:
        try:
            import lm_eval.tasks.openbookqa  # noqa: F401 - imported to trigger task registration
            from lm_eval.api.registry import TASK_REGISTRY
            return list(TASK_REGISTRY.keys())
        except Exception:
            pass

        # Last resort - scan lm_eval.tasks for modules
        import pkgutil
        import lm_eval.tasks as tasks_pkg

        return [
            modname
            for _, modname, ispkg in pkgutil.iter_modules(tasks_pkg.__path__)
            if not ispkg and not modname.startswith('_')
        ]
    except Exception as e:
        raise TaskLoadError(
            task_name="lm-eval task discovery",
            cause=e
        ) from e
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def load_docs(task, limit: Optional[int] = None) -> List[Dict[str, Any]]:
    """
    Load documents from the most appropriate split (validation → test → train → fewshot).

    The first split whose ``has_*_docs()`` check returns True is used; later
    splits are not consulted even if that split turns out to be empty. When no
    split is reported and the task exposes both ``dataset`` and
    ``fewshot_split`` attributes, the split is loaded with
    ``datasets.load_dataset``.

    Args:
        task: Task object from lm_eval
        limit: Optional limit on number of documents to load. ``None`` or a
            non-positive value means "no limit".

    Returns:
        List of documents from the most appropriate split

    Raises:
        NoDocsAvailableError: If no split is available or the direct dataset
            load fails.
    """
    from itertools import islice

    def bounded(doc_iterable):
        # Stop reading after `limit` documents instead of materializing the whole split.
        if limit is not None and limit > 0:
            return islice(doc_iterable, limit)
        return doc_iterable

    # Try different doc sources in order of preference
    if task.has_validation_docs():
        return list(bounded(task.validation_docs()))
    if task.has_test_docs():
        return list(bounded(task.test_docs()))
    if task.has_training_docs():
        return list(bounded(task.training_docs()))
    if hasattr(task, 'has_fewshot_docs') and task.has_fewshot_docs():
        return list(bounded(task.fewshot_docs()))

    # NAME may be missing on some task objects; fall back to the class name so
    # the error path itself cannot fail with AttributeError.
    task_label = getattr(task, 'NAME', None) or type(task).__name__

    # For tasks that use fewshot_split (like MMMLU), try to load from dataset directly
    if not (hasattr(task, 'dataset') and hasattr(task, 'fewshot_split')):
        raise NoDocsAvailableError(task_name=task_label)

    try:
        from datasets import load_dataset
        dataset = load_dataset(
            task.dataset_path if hasattr(task, 'dataset_path') else task.dataset_name,
            task.dataset_config_name if hasattr(task, 'dataset_config_name') else None,
            split=task.fewshot_split
        )
        return [dict(item) for item in bounded(dataset)]
    except Exception as e:
        # Keep the original failure attached for debugging.
        raise NoDocsAvailableError(task_name=task_label) from e
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _task_yields_docs(task) -> bool:
    """Return True if the first split the task reports as available yields at least one document."""
    split_accessors = (
        ('has_validation_docs', 'validation_docs'),
        ('has_test_docs', 'test_docs'),
        ('has_training_docs', 'training_docs'),
    )
    for has_name, docs_name in split_accessors:
        has_split = getattr(task, has_name, None)
        get_docs = getattr(task, docs_name, None)
        if has_split is None or get_docs is None or not has_split():
            continue
        # Only the first document is needed to prove the split is usable,
        # so avoid loading the whole split into memory.
        for _ in get_docs():
            return True
        return False
    return False


def find_working_task_from_group(group_dict, max_depth=3, current_depth=0):
    """
    Recursively search through nested ConfigurableGroup structures to find a working individual task.

    Direct task entries are checked first. Nested groups (entries whose key type
    name contains ``ConfigurableGroup`` and whose value has ``items``) are only
    explored when no direct entry yields documents.

    Args:
        group_dict: Dictionary-like ConfigurableGroup object or regular dict
        max_depth: Maximum recursion depth to prevent infinite loops
        current_depth: Current recursion depth

    Returns:
        Tuple of (task_object, task_name) or (None, None) if no working task found
    """
    if current_depth >= max_depth:
        return None, None

    try:
        # Materialize once so both passes see the same entries even if items()
        # returns a one-shot iterator.
        entries = list(group_dict.items()) if hasattr(group_dict, 'items') else []

        nested_groups = []
        for key, value in entries:
            # Defer nested ConfigurableGroup objects to the second pass
            if hasattr(value, 'items') and 'ConfigurableGroup' in str(type(key)):
                nested_groups.append(value)
                continue

            # Check if this looks like an individual task
            looks_like_task = (
                hasattr(value, 'has_validation_docs')
                or hasattr(value, 'has_test_docs')
                or hasattr(value, 'has_training_docs')
            )
            if not looks_like_task:
                continue

            try:
                if _task_yields_docs(value):
                    return value, str(key)
            except Exception:
                # This task doesn't work, try next one
                continue

        # If no individual tasks worked, try nested groups
        for nested in nested_groups:
            result_task, result_name = find_working_task_from_group(nested, max_depth, current_depth + 1)
            if result_task is not None:
                return result_task, result_name

        return None, None

    except Exception as e:
        print(f"Error exploring group: {e}")
        return None, None
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def handle_configurable_group_task(task_name: str):
    """
    Consolidated function to handle ConfigurableGroup tasks for both CLI and processing scripts.

    The task is first loaded directly through lm-eval. If that does not yield
    the task, the lm-eval registry is inspected:

    * a name registered as a group is resolved to one working member task via
      ``try_extract_working_tasks_from_group``;
    * any other name (registered individually or unknown) is handed to
      ``try_find_related_working_task`` to look for a loadable alternative.

    NOTE(review): earlier revisions carried a custom-YAML / ``flan_held_in``
    loading path and a second ConfigurableGroup fallback after the registry
    check. Every branch of the registry check returns, so that code could never
    run and has been removed. Reintroduce it *before* the registry check if
    custom YAML loading is actually wanted.

    Args:
        task_name: Name of the potentially problematic group task

    Returns:
        Tuple of (working_task_object, actual_task_name), or None when no
        working task could be found.

    Raises:
        ImportError: If lm-evaluation-harness is not installed.
    """
    try:
        from lm_eval.tasks import get_task_dict
    except ImportError as e:
        raise ImportError("lm-evaluation-harness is required. Install with: pip install lm-eval") from e

    print(f"🔍 Loading task: {task_name}")

    # Set only once the manager is fully initialized, so the registry check
    # below can reuse it instead of re-indexing every task a second time.
    ready_manager = None

    # First, try to load the task normally from the registry
    try:
        from lm_eval.tasks import TaskManager as LMTaskManager
        task_manager = LMTaskManager()
        task_manager.initialize_tasks()
        ready_manager = task_manager

        task_dict = get_task_dict([task_name], task_manager=task_manager)
        if task_name in task_dict:
            task = task_dict[task_name]
            print(f" ✅ Found {task_name} in registry")
            return task, task_name
    except Exception as e:
        print(f" ⚠️ Registry loading failed: {e}")

    # Check if the task exists in the registry but has loading issues
    try:
        if ready_manager is None:
            from lm_eval.tasks import TaskManager as LMTaskManager
            ready_manager = LMTaskManager()
            ready_manager.initialize_tasks()
        task_manager = ready_manager

        # Check in both individual tasks and groups
        all_tasks = getattr(task_manager, 'all_tasks', set())
        all_groups = getattr(task_manager, 'all_groups', set())

        print(f" 📊 Registry check: {len(all_tasks)} tasks, {len(all_groups)} groups available")
        print(f" 🔍 Is '{task_name}' in groups? {task_name in all_groups}")
        print(f" 🔍 Is '{task_name}' in tasks? {task_name in all_tasks}")

        if task_name in all_tasks or task_name in all_groups:
            print(f" 🔍 Task {task_name} exists in registry but has loading issues")

            # For group tasks, try to extract individual working tasks
            if task_name in all_groups:
                print(f" 💡 Found {task_name} as a ConfigurableGroup - extracting individual tasks...")
                result = try_extract_working_tasks_from_group(task_name, task_manager)
                if result:
                    return result
                print(f" 💥 FAILED: Group {task_name} exists but no working tasks found!")
                return None

            # For individual tasks that fail loading, try aggressive search
            print(f" 💡 Found {task_name} as individual task - trying alternatives...")
            return try_find_related_working_task(task_name)

        # If not found in registry at all, try aggressive search
        print(f" 🔄 Task {task_name} not found in registry, trying alternatives...")
        return try_find_related_working_task(task_name)

    except Exception as registry_error:
        print(f" ⚠️ Registry check failed: {registry_error}")
        # Still try aggressive search as fallback
        return try_find_related_working_task(task_name)
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def extract_individual_tasks_from_yaml(yaml_file: str, group_name: str, _visited_files=None) -> List[str]:
    """
    Extract individual task names from a YAML configuration file.
    This function handles nested groups by recursively resolving group names.

    Only strings found under a ``task`` key (directly, or as items of a list
    under it) are treated as task names; other string values such as aliases,
    group names or prompt templates are ignored. Discovery order is preserved,
    so the result is deterministic for a given file.

    Args:
        yaml_file: Path to the YAML file
        group_name: Name of the group we're looking for (currently not used
            for filtering; kept for interface compatibility)
        _visited_files: Set of already visited files to prevent infinite recursion

    Returns:
        List of individual task names found in the YAML (at most 10). An empty
        list is returned on any error, including a missing file or a missing
        PyYAML installation.
    """
    try:
        import yaml
        import os

        # Initialize visited files set to prevent infinite recursion
        if _visited_files is None:
            _visited_files = set()

        # Check if we've already processed this file
        yaml_path_normalized = os.path.abspath(yaml_file)
        if yaml_path_normalized in _visited_files:
            print(f" 🔄 Cycle detected: {yaml_file} - skipping to prevent infinite recursion")
            return []

        _visited_files.add(yaml_path_normalized)

        with open(yaml_file, 'r', encoding='utf-8') as f:
            yaml_content = yaml.safe_load(f)

        individual_tasks = []

        def extract_tasks_recursive(obj, depth=0, in_task_list=False):
            if depth > 5:  # Prevent runaway recursion on deeply nested configs
                return

            if isinstance(obj, dict):
                # Look for 'task' key which usually contains individual tasks
                if 'task' in obj:
                    task_value = obj['task']
                    if isinstance(task_value, str):
                        # Single task name - could be individual or group
                        individual_tasks.append(task_value)
                    elif isinstance(task_value, list):
                        # List of tasks or nested groups
                        for item in task_value:
                            extract_tasks_recursive(item, depth + 1, True)
                    elif isinstance(task_value, dict):
                        # Nested task definition
                        extract_tasks_recursive(task_value, depth + 1)

                # Also descend into other keys, but without task context so
                # that their plain string values are not mistaken for tasks.
                for key, value in obj.items():
                    if key != 'task':  # Already processed above
                        extract_tasks_recursive(value, depth + 1)

            elif isinstance(obj, list):
                for item in obj:
                    extract_tasks_recursive(item, depth + 1, in_task_list)
            elif isinstance(obj, str) and in_task_list:
                # A bare string inside a task list is a task (or group) name
                individual_tasks.append(obj)

        extract_tasks_recursive(yaml_content)

        # Remove duplicates and empty strings while keeping discovery order
        potential_tasks = list(dict.fromkeys(task for task in individual_tasks if task))

        print(f" 📋 Found potential tasks/groups: {potential_tasks[:5]}...")  # Limit output

        # Now we need to resolve any groups to their individual tasks
        resolved_tasks = []

        # Get the base directory for this YAML file to find related group files
        yaml_dir = os.path.dirname(yaml_file)

        # Limit to prevent excessive processing
        max_tasks_to_process = 5

        for task_name in potential_tasks[:max_tasks_to_process]:
            # First check if this looks like an individual task (has specific suffixes)
            if any(suffix in task_name for suffix in ['_zeroshot_', '_fewshot_', '_cot_', '_prompt-', '_task_']):
                resolved_tasks.append(task_name)
                continue

            # The visited-set size doubles as the recursion depth limit
            if len(_visited_files) >= 3:
                # Max recursion depth reached, treat as individual task
                resolved_tasks.append(task_name)
                continue

            potential_group_file = os.path.join(yaml_dir, f"{task_name}.yaml")
            if os.path.exists(potential_group_file):
                print(f" 🔍 Found nested group file: {os.path.basename(potential_group_file)}")
                # Recursively extract from this group
                nested_tasks = extract_individual_tasks_from_yaml(potential_group_file, task_name, _visited_files.copy())
                resolved_tasks.extend(nested_tasks[:3])  # Limit results
                continue

            # Check in subdirectories (common pattern)
            for subdir in ['zeroshot', 'fewshot', 'cot']:
                subdir_path = os.path.join(yaml_dir, task_name, subdir)
                if os.path.isdir(subdir_path):
                    subdir_yaml = os.path.join(subdir_path, f"_{task_name}_{subdir}.yaml")
                    if os.path.exists(subdir_yaml):
                        print(f" 🔍 Found nested group in subdir: {subdir}")
                        nested_tasks = extract_individual_tasks_from_yaml(subdir_yaml, f"{task_name}_{subdir}", _visited_files.copy())
                        resolved_tasks.extend(nested_tasks[:3])  # Limit results
                        break
            else:
                # Treat as individual task if we can't find a group file
                resolved_tasks.append(task_name)

        # Final cleanup - remove duplicates (order preserved) and limit results
        final_tasks = list(dict.fromkeys(resolved_tasks))[:10]  # Limit to 10 tasks max

        print(f" 📋 Extracted individual tasks from YAML: {final_tasks}")
        return final_tasks

    except Exception as e:
        print(f" ❌ Error extracting tasks from YAML {yaml_file}: {e}")
        return []
|
|
593
|
+
|
|
594
|
+
|
|
595
|
+
# Names already examined by the alternative search that is currently running.
# handle_configurable_group_task() calls back into try_find_related_working_task()
# on failure, so without this record the two functions can recurse into each
# other indefinitely (e.g. "foo" -> "foo_bar" -> parent "foo" -> ...).
# NOTE: module-level state; the search is not safe to run from several threads at once.
_RELATED_SEARCH_SEEN = set()


def try_find_related_working_task(task_name: str):
    """
    AGGRESSIVELY find related tasks that work when the main task has issues.

    Strategies are tried in order until one yields a loadable task:
      1. the name with '_group' removed,
      2. progressively shorter '_'-separated prefixes,
      3. up to 10 registered tasks starting with the first name part + '_',
      4. the first name part itself, if registered,
      5. up to 5 registered tasks per keyword (name parts longer than 2 chars).

    A candidate only counts as working when handle_configurable_group_task()
    returns a result for it; a ``None`` result moves on to the next candidate.
    Registry candidates are tried in sorted order so runs are reproducible.
    Names that were already searched during the current top-level call are
    not searched again.

    Args:
        task_name: The problematic task name

    Returns:
        Tuple of (task_object, task_name) or None if absolutely no alternatives found
    """
    if task_name in _RELATED_SEARCH_SEEN:
        # Already being searched (or already exhausted) further up the call chain.
        return None

    is_outermost = not _RELATED_SEARCH_SEEN
    _RELATED_SEARCH_SEEN.add(task_name)

    try:
        from lm_eval.tasks import TaskManager as LMTaskManager

        # Ensure TaskManager is properly initialized
        task_manager = LMTaskManager()
        task_manager.initialize_tasks()

        # Get all available tasks from the initialized manager
        all_tasks = set(getattr(task_manager, 'all_tasks', set()))
        all_groups = set(getattr(task_manager, 'all_groups', set()))
        all_available_tasks = all_tasks | all_groups

        print(f" 📊 TaskManager has {len(all_tasks)} tasks, {len(all_groups)} groups")

        print(f" 🔄 AGGRESSIVE SEARCH for working alternatives to '{task_name}' ({len(all_available_tasks)} tasks available)...")

        attempted_here = set()

        def already_tried(candidate):
            return (not candidate) or candidate in attempted_here or candidate in _RELATED_SEARCH_SEEN

        def attempt(candidate):
            # Returns the loaded (task, name) tuple, or None when the candidate fails.
            attempted_here.add(candidate)
            try:
                return handle_configurable_group_task(candidate)
            except Exception:
                return None

        # Strategy 1: Remove '_group' suffix
        if '_group' in task_name:
            base_name = task_name.replace('_group', '')
            if not already_tried(base_name):
                print(f" 🎯 Trying base name: {base_name}")
                result = attempt(base_name)
                if result is not None:
                    return result

        # Strategy 2: Try progressively shorter prefixes
        parts = task_name.split('_')
        for i in range(len(parts) - 1, 0, -1):
            parent_name = '_'.join(parts[:i])
            if already_tried(parent_name):
                continue
            print(f" 🎯 Trying parent: {parent_name}")
            result = attempt(parent_name)
            if result is not None:
                return result

        # Strategy 3: Find ANY task with the same prefix (e.g., flan_held_in -> any flan_* task)
        prefix = parts[0] if parts else task_name
        print(f" 🎯 Searching for ANY task starting with '{prefix}_'...")

        matching_tasks = sorted(
            t for t in all_available_tasks if t.startswith(prefix + '_') and t != task_name
        )

        # Try up to 10 matching tasks until we find one that works
        for candidate in matching_tasks[:10]:
            if already_tried(candidate):
                continue
            print(f" 🎯 Trying candidate: {candidate}")
            result = attempt(candidate)
            if result is not None:
                print(f" ✅ SUCCESS! Found working alternative: {candidate}")
                return result

        # Strategy 4: Try exact prefix match (e.g., flan_held_in -> flan)
        if prefix in all_available_tasks and not already_tried(prefix):
            print(f" 🎯 Trying exact prefix: {prefix}")
            result = attempt(prefix)
            if result is not None:
                return result

        # Strategy 5: Find tasks with similar keywords
        keywords = [part for part in parts if len(part) > 2]  # Skip short parts
        for keyword in keywords:
            print(f" 🎯 Searching for tasks containing '{keyword}'...")
            keyword_tasks = sorted(
                t for t in all_available_tasks if keyword in t and t != task_name
            )

            for candidate in keyword_tasks[:5]:  # Try up to 5 per keyword
                if already_tried(candidate):
                    continue
                print(f" 🎯 Trying keyword match: {candidate}")
                result = attempt(candidate)
                if result is not None:
                    print(f" ✅ SUCCESS! Found working keyword match: {candidate}")
                    return result

        # NO MORE STUPID FALLBACKS - FIX THE REAL ISSUE
        print(f" 💥 FAILED TO FIND CORRECT TASK: {task_name} - NO RANDOM FALLBACKS ALLOWED!")
        return None

    except Exception as e:
        print(f" ❌ Search failed: {e}")
        return None

    finally:
        # The record only lives for one top-level search.
        if is_outermost:
            _RELATED_SEARCH_SEEN.clear()
|
|
697
|
+
|
|
698
|
+
|
|
699
|
+
def try_extract_working_tasks_from_group(group_name: str, task_manager):
    """
    Try to extract and load one individual working task from a problematic group.

    This handles cases like flan_held_in where the group exists in the registry
    but has loading issues (like yaml_path becoming None during include processing).

    The search proceeds in order and stops at the first task that loads:
      1. task names declared in the group's main YAML (first 15),
      2. base names / individual names from ``extract_individual_tasks_from_yaml``,
      3. registry tasks whose names resemble the group name (first 20).

    NOTE(review): the nesting of step 3 was reconstructed from a listing that
    had lost its indentation; here it only runs when the group's YAML file
    exists. Confirm against the upstream file.

    Args:
        group_name: Name of the group (e.g., 'flan_held_in')
        task_manager: Initialized LM TaskManager instance

    Returns:
        Tuple of (task_object, task_name) or None if no working tasks found
    """
    try:
        from lm_eval.tasks import get_task_dict

        print(f" 🔍 Extracting working tasks from group: {group_name}")

        def load_first(names, label):
            return _load_first_working_task(names, label, get_task_dict, task_manager)

        # Get the group configuration from the task manager
        if hasattr(task_manager, 'task_index') and group_name in task_manager.task_index:
            group_info = task_manager.task_index[group_name]
            yaml_path = group_info.get('yaml_path')

            if yaml_path and os.path.exists(yaml_path):
                print(f" 📁 Found group YAML: {yaml_path}")

                # STEP 1: task names found directly in the main group YAML
                initial_tasks = _task_names_from_group_yaml(yaml_path)
                found = load_first(initial_tasks[:15], "initial task")
                if found:
                    return found

                # STEP 2: fall back to the recursive extraction helper
                try:
                    individual_tasks = extract_individual_tasks_from_yaml(yaml_path, group_name)

                    if individual_tasks:
                        print(f" 📋 Found {len(individual_tasks)} individual tasks in group")

                        # Base task names: strip the "_prompt-N" suffix, keep first-seen order
                        base_tasks_to_try = list(dict.fromkeys(
                            task.split('_prompt-')[0]
                            for task in individual_tasks
                            if '_prompt-' in task
                        ))
                        found = load_first(base_tasks_to_try, "base task")
                        if found:
                            return found

                        # Then a few individual tasks, skipping templates/variables
                        skip_markers = ('{{', '}}', '_common_yaml', 'sentence:')
                        valid_tasks = [
                            t for t in individual_tasks
                            if not any(marker in t for marker in skip_markers)
                        ]
                        found = load_first(valid_tasks[:5], "individual task")
                        if found:
                            return found

                except Exception as yaml_error:
                    print(f" ⚠️ YAML extraction failed (likely !function constructor): {str(yaml_error)[:100]}")
                    # Fall through to generic catch-all approach below

                # STEP 3: search the registry for names resembling the group name
                print(f" 🔍 FINAL CATCH-ALL: Searching registry for tasks matching group pattern...")
                unique_candidates = _registry_candidates(
                    group_name, getattr(task_manager, 'all_tasks', set())
                )
                print(f" 📋 Found {len(unique_candidates)} candidate tasks to try...")

                found = load_first(unique_candidates[:20], "candidate")  # Limit total attempts
                if found:
                    return found

                # If still no success, this group truly has no working tasks
                print(f" 💥 FAILED: Group {group_name} has no working tasks - exhausted all generic approaches")

        print(f" ❌ No working tasks found in group {group_name}")
        return None

    except Exception as e:
        print(f" ❌ Group extraction failed: {e}")
        return None


def _load_first_working_task(names, label, get_task_dict, task_manager):
    """Return (task, name) for the first name that get_task_dict can load, else None."""
    for name in names:
        try:
            print(f" 🎯 Trying {label}: {name}")
            result = get_task_dict([name], task_manager=task_manager)
            if name in result:
                print(f" ✅ SUCCESS: Found working {label} {name}")
                return result[name], name
        except Exception as e:
            # A failing candidate is expected here; report it and move on.
            print(f" ❌ {label.capitalize()} {name} failed: {str(e)[:50]}")
    return None


def _task_names_from_group_yaml(yaml_path):
    """Return the task-like string names found in a group YAML ([] if unparseable)."""
    import yaml

    try:
        with open(yaml_path, 'r', encoding='utf-8') as f:
            yaml_content = yaml.safe_load(f)
    except Exception as yaml_parse_error:
        print(f" ⚠️ Main YAML parsing failed: {str(yaml_parse_error)[:100]}")
        return []

    initial_tasks = []
    if isinstance(yaml_content, dict):
        # Method 1: direct 'task' field. Only strings are usable as task names;
        # nested dict entries (inline sub-groups) cannot be looked up by name.
        declared = yaml_content.get('task')
        if isinstance(declared, str):
            initial_tasks.append(declared)
        elif isinstance(declared, list):
            initial_tasks.extend(item for item in declared if isinstance(item, str))

        # Method 2: any other list that might contain task names
        ignored_keys = ('metric_list', 'generation_kwargs', 'metadata')
        for key, value in yaml_content.items():
            if not isinstance(value, list) or key in ignored_keys:
                continue
            for item in value:
                # Filter for task-like names (avoid metrics and config values)
                looks_like_task = isinstance(item, str) and ('_' in item or item.isalpha())
                if looks_like_task and item not in initial_tasks:
                    initial_tasks.append(item)

    if initial_tasks:
        print(f" 📋 Found {len(initial_tasks)} initial tasks from main YAML: {initial_tasks[:5]}...")
    else:
        print(f" ⚠️ No task names found in main YAML structure")
    return initial_tasks


def _registry_candidates(group_name, all_tasks):
    """Return de-duplicated registry task names resembling group_name, best guesses first."""
    # Sort once so the slices below pick the same tasks on every run; the
    # registry may be a set, whose iteration order is arbitrary.
    ordered = sorted(set(all_tasks))
    known = set(ordered)
    candidates = []

    # Strategy 1: exact group name
    if group_name in known:
        candidates.append(group_name)

    # Strategy 2: tasks that start with the group name (first 10)
    prefix = group_name + '_'
    candidates.extend([t for t in ordered if t.startswith(prefix)][:10])

    # Strategy 3: tasks containing a major part of the group name (top 3 per part),
    # preferring whole-word matches and then longer names
    for part in (p for p in group_name.split('_') if len(p) > 2):
        matching = [t for t in ordered if part in t and t not in candidates]
        matching.sort(key=lambda x: (part in x.split('_'), len(x)), reverse=True)
        candidates.extend(matching[:3])

    # Remove duplicates while preserving order
    return list(dict.fromkeys(candidates))
|
|
880
|
+
|
|
881
|
+
|
|
882
|
+
def save_custom_task_yaml(task_name: str, yaml_content: str) -> Optional[str]:
    """
    Save custom YAML task configuration to the tasks directory for future loading.

    The file is written to ``wisent/parameters/tasks/<task_name>.yaml``, relative
    to the current working directory; the directory is created if missing.

    Args:
        task_name: Name of the task (used verbatim as the file stem)
        yaml_content: YAML content to save

    Returns:
        Path to the saved file, or None if failed
    """
    try:
        # Create the tasks directory if it doesn't exist
        tasks_dir = os.path.join("wisent", "parameters", "tasks")
        os.makedirs(tasks_dir, exist_ok=True)

        # Write as UTF-8 explicitly: the platform default encoding may be unable
        # to represent non-ASCII prompt text.
        yaml_file_path = os.path.join(tasks_dir, f"{task_name}.yaml")
        with open(yaml_file_path, 'w', encoding='utf-8') as f:
            f.write(yaml_content)

        print(f" 💾 Saved custom task configuration to: {yaml_file_path}")
        return yaml_file_path

    except Exception as e:
        # Best-effort API: report the problem and signal failure with None.
        print(f" ❌ Failed to save custom task configuration: {e}")
        return None
|
|
909
|
+
|
|
910
|
+
|
|
911
|
+
|
|
912
|
+
|
|
913
|
+
|
|
914
|
+
def create_task_yaml_from_user_content(task_name: str, user_yaml_content: str) -> Optional[str]:
    """
    Create a task YAML file from user-provided YAML content.
    This function can be called when users provide their own YAML configurations.

    The content is first parsed with ``yaml.safe_load`` purely as a validity
    check; the original text (not a re-serialised form) is then saved under the
    name ``<task_name>_user``.

    Args:
        task_name: Name of the task
        user_yaml_content: YAML content provided by the user

    Returns:
        Path to the saved file, or None if failed
    """
    try:
        # Imported here so the function does not depend on a module-level
        # ``yaml`` binding (NOTE(review): a sibling function imports yaml
        # locally, so a top-level import may not exist - confirm).
        import yaml

        # Validate that the YAML is parseable; the parsed value itself is unused
        yaml.safe_load(user_yaml_content)

        # Save the user's YAML content
        yaml_file_path = save_custom_task_yaml(f"{task_name}_user", user_yaml_content)

        if yaml_file_path:
            print(f" 💾 Saved user-provided YAML for {task_name}")
            return yaml_file_path

        return None

    except Exception as e:
        print(f" ❌ Failed to process user YAML content: {e}")
        return None
|
|
942
|
+
|
|
943
|
+
|
|
944
|
+
def load_with_env_config(task_name: str, yaml_file: str):
    """
    Try to load a task by setting environment variables for lm_eval configuration.

    Several candidate variable names are pointed at ``yaml_file`` for the
    duration of the load and restored to their previous state afterwards.
    NOTE(review): whether lm_eval reads any of these variables is not
    established by this file.

    Args:
        task_name: Name of the task to load
        yaml_file: Path to the YAML configuration file

    Returns:
        Task dictionary from get_task_dict

    Raises:
        Exception: If importing lm_eval, setting the environment, or loading
            the task fails; the original error is chained as the cause.
    """
    try:
        from lm_eval.tasks import get_task_dict

        # Try setting various environment variables that lm_eval might use
        env_vars_to_set = (
            'LM_EVAL_CONFIG_PATH',
            'LM_EVAL_TASKS_PATH',
            'LMEVAL_CONFIG_PATH',
            'TASK_CONFIG_PATH',
        )
        original_env = {}

        try:
            # Saving and overriding happen inside the try so that a failure part
            # way through (e.g. a non-string yaml_file) still restores the
            # variables that were already touched.
            for env_var in env_vars_to_set:
                original_env[env_var] = os.environ.get(env_var)
                os.environ[env_var] = yaml_file

            # Try to load the task with environment variables set
            return get_task_dict([task_name])
        finally:
            # Restore original environment
            for env_var, previous in original_env.items():
                if previous is None:
                    os.environ.pop(env_var, None)
                else:
                    os.environ[env_var] = previous

    except Exception as e:
        raise Exception(f"Environment config loading failed: {e}") from e
|
|
985
|
+
|
|
986
|
+
|
|
987
|
+
def create_flan_held_in_files() -> Optional[str]:
    """
    Create the actual flan_held_in YAML files as provided by the user.
    This creates both the main file and the template file with proper include directives.

    Files are written under ``wisent/parameters/tasks`` relative to the current
    working directory. The template is written twice: as
    ``_held_in_template_yaml.yaml`` (the historical name) and as
    ``_held_in_template_yaml``, which is the exact name the ``include:``
    entries in the main file refer to.

    NOTE(review): the indentation inside the YAML literals was reconstructed
    from a listing that had lost leading whitespace; confirm against the
    upstream file before relying on it.

    Returns:
        Path to the main flan_held_in.yaml file, or None if failed
    """
    try:
        # Create the tasks directory
        tasks_dir = os.path.join("wisent", "parameters", "tasks")
        os.makedirs(tasks_dir, exist_ok=True)

        # Create the template file first
        template_content = """output_type: generate_until
test_split: null
doc_to_choice: null
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
generation_kwargs:
  until:
    - "</s>"
  do_sample: false
  temperature: 0.0
metadata:
  version: 1.0
"""

        template_path = os.path.join(tasks_dir, "_held_in_template_yaml.yaml")
        # The extensionless copy is the name the include directives resolve to.
        include_target_path = os.path.join(tasks_dir, "_held_in_template_yaml")
        for path in (template_path, include_target_path):
            with open(path, 'w', encoding='utf-8') as f:
                f.write(template_content)

        # Create the main flan_held_in.yaml file with the exact content from the user
        main_content = """group: flan_held_in
group_alias: Flan (Held-In)
task:
  # ANLI R1
  - group: anli_r1_flan
    group_alias: ANLI R1
    aggregate_metric_list:
      - metric: acc
        weight_by_size: True
    task:
      - task: anli_r1_prompt-0
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\\n\\nChoose your answer: based on the paragraph above can we conclude that \\"{{hypothesis}}\\"?\\n\\nOPTIONS:\\n- Yes\\n- It's impossible to say\\n- No\\nI think the answer is"
        doc_to_target: "{{[\\"Yes\\", \\"It's impossible to say\\", \\"No\\"][label]}}"
      - task: anli_r1_prompt-1
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\\n\\nBased on that paragraph can we conclude that this sentence is true?\\n{{hypothesis}}\\n\\nOPTIONS:\\n- Yes\\n- It's impossible to say\\n- No"
        doc_to_target: "{{[\\"Yes\\", \\"It's impossible to say\\", \\"No\\"][label]}}"
      - task: anli_r1_prompt-2
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\\n\\nCan we draw the following conclusion?\\n{{hypothesis}}\\n\\nOPTIONS:\\n- Yes\\n- It's impossible to say\\n- No"
        doc_to_target: "{{[\\"Yes\\", \\"It's impossible to say\\", \\"No\\"][label]}}"
  # Arc Easy
  - group: arc_easy_flan
    group_alias: Arc Easy
    aggregate_metric_list:
      - metric: acc
        weight_by_size: True
    task:
      - task: arc_easy_prompt-0
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{question}}\\n\\nOPTIONS:\\n- {{choices.text|join('\\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy_prompt-1
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "Question: {{question}}\\nOPTIONS:\\n- {{choices.text|join('\\n- ')}}\\nAnswer:"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  # BoolQ
  - group: boolq_flan
    group_alias: BoolQ
    aggregate_metric_list:
      - metric: acc
        weight_by_size: True
    task:
      - task: boolq_prompt-0
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\\n\\nCan we conclude that {{question}}?\\n\\nOPTIONS:\\n- no\\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq_prompt-1
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\\n\\nIs it true that {{question}}?\\n\\nOPTIONS:\\n- no\\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
"""

        main_path = os.path.join(tasks_dir, "flan_held_in.yaml")
        with open(main_path, 'w', encoding='utf-8') as f:
            f.write(main_content)

        print(f" 💾 Created flan_held_in YAML files:")
        print(f" 📄 Template: {template_path}")
        print(f" 📄 Main: {main_path}")

        return main_path

    except Exception as e:
        print(f" ❌ Failed to create flan_held_in files: {e}")
        return None
|
|
1096
|
+
|
|
1097
|
+
|
|
1098
|
+
def load_task_with_config_dir(task_name: str, config_dir: str):
    """
    Load a task by setting the lm_eval configuration directory.
    This attempts to load YAML configurations by manipulating the path and environment.

    Approaches are tried in order; the first one that loads the task wins:
      1. an lm_eval ``TaskManager`` that also indexes ``config_dir``,
      2. ``config_dir`` temporarily prepended to ``sys.path``,
      3. candidate environment variables temporarily pointed at ``config_dir``,
      4. a plain ``get_task_dict`` call.
    ``sys.path`` and the environment are restored after approaches 2 and 3.

    NOTE(review): block nesting was reconstructed from a listing that had lost
    its indentation; confirm against the upstream file.

    Args:
        task_name: Name of the task to load
        config_dir: Directory containing YAML configuration files

    Returns:
        Task dictionary from get_task_dict

    Raises:
        Exception: If lm_eval cannot be imported or every approach fails; the
            original error is chained as the cause.
    """
    try:
        from lm_eval.tasks import get_task_dict
        from lm_eval.tasks import TaskManager as LMTaskManager
        import sys

        print(f" 🔧 Attempting to load {task_name} from config dir: {config_dir}")

        # Method 1: Try to use TaskManager if available
        try:
            # include_path makes the manager index YAML files under config_dir
            # in addition to the built-in tasks.
            task_manager = LMTaskManager(include_path=config_dir)
            if hasattr(task_manager, 'initialize_tasks') or hasattr(task_manager, 'load_config'):
                print(f" 🔧 Using TaskManager approach")
                return get_task_dict([task_name], task_manager=task_manager)
        except Exception as e:
            print(f" ⚠️ TaskManager approach failed: {e}")

        # Method 2: Try adding config directory to Python path
        original_path = sys.path[:]
        try:
            if config_dir not in sys.path:
                sys.path.insert(0, config_dir)
                print(f" 🔧 Added config dir to Python path")
            return get_task_dict([task_name])
        except Exception as e:
            print(f" ⚠️ Python path approach failed: {e}")
        finally:
            sys.path[:] = original_path

        # Method 3: Try setting environment variables
        original_env = {}
        env_vars = ['LM_EVAL_CONFIG_DIR', 'LMEVAL_CONFIG_PATH', 'TASK_CONFIG_PATH']
        try:
            for env_var in env_vars:
                original_env[env_var] = os.environ.get(env_var)
                os.environ[env_var] = config_dir
            print(f" 🔧 Set environment variables")
            return get_task_dict([task_name])
        except Exception as e:
            print(f" ⚠️ Environment variable approach failed: {e}")
        finally:
            # Only restore what was actually recorded, so a failure part way
            # through the loop above cannot raise KeyError here.
            for env_var, previous in original_env.items():
                if previous is None:
                    os.environ.pop(env_var, None)
                else:
                    os.environ[env_var] = previous

        # Method 4: Fall back to basic loading
        print(f" 🔧 Falling back to basic task loading")
        return get_task_dict([task_name])

    except Exception as e:
        raise Exception(f"Config directory loading failed: {e}") from e
|
|
1163
|
+
|
|
1164
|
+
|
|
1165
|
+
class TaskManager:
    """Manages lm-eval task discovery, validation, and loading."""

    def __init__(self):
        # Filled lazily by the ``available_tasks`` property on first access.
        self._available_tasks = None
        # Cache: requested name -> canonical name chosen by fuzzy matching.
        self._task_name_mappings = {}

    @property
    def available_tasks(self) -> List[str]:
        """Get list of available tasks, loading if necessary."""
        if self._available_tasks is None:
            self._available_tasks = load_available_tasks()
        return self._available_tasks

    def get_available_tasks(self) -> List[str]:
        """Get list of all available tasks."""
        return self.available_tasks

    def is_valid_task(self, task_name: str) -> bool:
        """Check if a task name is valid (directly or after name resolution)."""
        try:
            resolved_name = self.resolve_task_name(task_name)
            return resolved_name in self.available_tasks
        except (ValueError, TaskNotFoundError):
            # resolve_task_name raises TaskNotFoundError; ValueError is kept
            # for callers/subclasses that relied on the documented contract.
            return False

    def resolve_task_name(self, task_name: str) -> str:
        """
        Resolve a task name to its canonical form, handling variations and common mistakes.

        Resolution order: exact match, previously cached fuzzy match, then the
        most similar available task with a similarity of at least 0.6.

        Args:
            task_name: The task name to resolve

        Returns:
            The canonical task name

        Raises:
            TaskNotFoundError: If the task name cannot be resolved. Up to five
                available tasks sharing a word with ``task_name`` are attached
                as suggestions.
        """
        # Direct match
        if task_name in self.available_tasks:
            return task_name

        # Check cached mappings
        if task_name in self._task_name_mappings:
            return self._task_name_mappings[task_name]

        # Try fuzzy matching
        best_match = None
        best_similarity = 0.0
        similarity_threshold = 0.6

        for available_task in self.available_tasks:
            similarity = self._calculate_task_name_similarity(task_name, available_task)
            if similarity > best_similarity and similarity >= similarity_threshold:
                best_similarity = similarity
                best_match = available_task

        if best_match:
            # Cache the mapping
            self._task_name_mappings[task_name] = best_match
            return best_match

        # List some suggestions if no match found. Empty fragments (from
        # leading/trailing/double underscores) are dropped because the empty
        # string is a substring of every task name.
        words = [word.lower() for word in task_name.split('_') if word]
        suggestions = [task for task in self.available_tasks
                       if any(word in task.lower() for word in words)][:5]

        raise TaskNotFoundError(
            task_name=task_name,
            available_tasks=suggestions if suggestions else None
        )

    def _calculate_task_name_similarity(self, name1: str, name2: str) -> float:
        """Calculate similarity between two task names.

        The score is the mean of the character-level ``SequenceMatcher`` ratio
        and the fraction of shared words (split on ``_``, ``-`` and whitespace),
        both computed case-insensitively.
        """
        # Direct similarity
        base_similarity = SequenceMatcher(None, name1.lower(), name2.lower()).ratio()

        # Bonus for word-level matches
        words1 = set(re.split(r'[_\-\s]+', name1.lower()))
        words2 = set(re.split(r'[_\-\s]+', name2.lower()))

        if words1 and words2:
            word_overlap = len(words1.intersection(words2)) / max(len(words1), len(words2))
            return (base_similarity + word_overlap) / 2

        return base_similarity

    def load_task(self, task_name: str, limit: Optional[int] = None):
        """
        Load a task from lm-evaluation-harness with dynamic task name resolution.
        Supports both regular tasks and ConfigurableGroup tasks.

        Args:
            task_name: Name of the task
            limit: Optional limit on number of documents

        Returns:
            Task object from lm_eval, with ``_limit`` set to ``limit``

        Raises:
            TaskNotFoundError: If the name cannot be resolved to an available task
            TaskLoadError: If the task resolves but fails to load
        """

        # Find the actual task name dynamically
        actual_task_name = self.resolve_task_name(task_name)

        try:
            # First try to handle as potentially problematic ConfigurableGroup task
            task, _ = handle_configurable_group_task(actual_task_name)
            # Remembered on the task so split_task_data can honour it later
            task._limit = limit
            return task

        except Exception as e:
            # If that fails, check if it's a task resolution issue
            if not self.is_valid_task(actual_task_name):
                raise TaskNotFoundError(task_name=task_name) from e

            # Re-raise the original error if it wasn't a resolution issue
            raise TaskLoadError(task_name=task_name, cause=e) from e

    def split_task_data(self, task_data, split_ratio: float = 0.8, random_seed: int = 42) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
        """
        Split task data into training and testing sets.

        Args:
            task_data: Task object from lm_eval
            split_ratio: Ratio for training split (0.0 to 1.0)
            random_seed: Random seed for reproducible splits

        Returns:
            Tuple of (training_docs, testing_docs)
        """
        import random

        # Load documents with limit if specified
        limit = getattr(task_data, '_limit', None)
        docs = load_docs(task_data, limit)

        # Shuffle with seed for reproducibility.
        # NOTE: this reseeds the process-wide ``random`` generator.
        random.seed(random_seed)
        shuffled_docs = docs.copy()
        random.shuffle(shuffled_docs)

        # Split based on ratio
        split_point = int(len(shuffled_docs) * split_ratio)
        training_docs = shuffled_docs[:split_point]
        testing_docs = shuffled_docs[split_point:]

        return training_docs, testing_docs

    def prepare_prompts_from_docs(self, task, docs: List[Dict[str, Any]]) -> List[str]:
        """
        Prepare prompts from task documents.

        Documents whose prompt cannot be built are skipped with a warning, so
        the result may be shorter than ``docs``.

        Args:
            task: Task object from lm_eval
            docs: List of documents to convert to prompts

        Returns:
            List of formatted prompts
        """
        prompts = []

        for doc in docs:
            try:
                # Different tasks have different prompt creation methods
                if hasattr(task, 'doc_to_text'):
                    prompt = task.doc_to_text(doc)
                elif hasattr(task, 'doc_format'):
                    prompt = task.doc_format(doc)
                elif 'input' in doc:
                    prompt = doc['input']
                elif 'question' in doc:
                    prompt = doc['question']
                elif 'prompt' in doc:
                    prompt = doc['prompt']
                else:
                    # Fallback: use the first text-like field
                    text_fields = ['text', 'passage', 'context', 'story']
                    prompt = None
                    for field in text_fields:
                        if field in doc and isinstance(doc[field], str):
                            prompt = doc[field]
                            break

                    if prompt is None:
                        prompt = str(doc)

                prompts.append(prompt)

            except Exception as e:
                # Skip problematic documents
                print(f"Warning: Could not create prompt from document: {e}")
                continue

        return prompts

    def get_reference_answers(self, task, docs: List[Dict[str, Any]]) -> List[str]:
        """
        Extract reference answers from task documents.

        Unlike ``prepare_prompts_from_docs``, every document yields exactly one
        entry: ``"UNKNOWN"`` is used when no answer can be found or extraction
        raises.

        Args:
            task: Task object from lm_eval
            docs: List of documents to extract answers from

        Returns:
            List of reference answers (stringified)
        """
        answers = []

        for doc in docs:
            try:
                # Different tasks store answers differently
                if hasattr(task, 'doc_to_target'):
                    answer = task.doc_to_target(doc)
                elif hasattr(task, 'get_answer'):
                    answer = task.get_answer(doc)
                elif 'answer' in doc:
                    answer = doc['answer']
                elif 'target' in doc:
                    answer = doc['target']
                elif 'label' in doc:
                    answer = doc['label']
                elif 'output' in doc:
                    answer = doc['output']
                else:
                    # Look for likely answer fields
                    answer_fields = ['correct_answer', 'gold', 'truth', 'solution']
                    answer = None
                    for field in answer_fields:
                        if field in doc:
                            answer = doc[field]
                            break

                    if answer is None:
                        answer = "UNKNOWN"

                answers.append(str(answer))

            except Exception as e:
                print(f"Warning: Could not extract answer from document: {e}")
                answers.append("UNKNOWN")

        return answers

    def register_custom_task_yaml(self, task_name: str, yaml_content: str) -> bool:
        """
        Register a custom YAML task configuration that can be loaded later.

        Args:
            task_name: Name of the task to register
            yaml_content: YAML content defining the task

        Returns:
            True if successfully registered, False otherwise

        Example:
            yaml_content = '''
            my_custom_task:
                class: custom_task
                doc_to_text: "Question: {{question}}"
                doc_to_target: "{{answer}}"
            '''
            manager.register_custom_task_yaml("my_custom_task", yaml_content)
        """
        try:
            yaml_file_path = create_task_yaml_from_user_content(task_name, yaml_content)
            if yaml_file_path:
                print(f"✅ Registered custom task configuration for '{task_name}'")
                print(f" 📁 Saved to: {yaml_file_path}")
                return True
            return False
        except Exception as e:
            print(f"❌ Failed to register custom task '{task_name}': {e}")
            return False
|
|
1437
|
+
|
|
1438
|
+
|
|
1439
|
+
# Shared module-level TaskManager used by the convenience functions
_task_manager = TaskManager()
|
|
1441
|
+
|
|
1442
|
+
# Convenience functions that use the global instance
|
|
1443
|
+
def get_available_tasks() -> List[str]:
    """Return every task name known to the shared module-level TaskManager."""
    task_names = _task_manager.get_available_tasks()
    return task_names
|
|
1446
|
+
|
|
1447
|
+
def is_valid_task(task_name: str) -> bool:
    """Report whether the shared module-level TaskManager accepts ``task_name``."""
    verdict = _task_manager.is_valid_task(task_name)
    return verdict
|
|
1450
|
+
|
|
1451
|
+
def resolve_task_name(task_name: str) -> str:
    """Return the canonical name for ``task_name`` via the shared module-level TaskManager."""
    canonical_name = _task_manager.resolve_task_name(task_name)
    return canonical_name
|