opencompass 0.2.4__tar.gz → 0.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {opencompass-0.2.4 → opencompass-0.2.5}/PKG-INFO +12 -13
- {opencompass-0.2.4 → opencompass-0.2.5}/README.md +11 -12
- opencompass-0.2.5/opencompass/__init__.py +1 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/GaokaoBench.py +23 -6
- opencompass-0.2.5/opencompass/datasets/MMLUArabic.py +33 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/__init__.py +6 -0
- opencompass-0.2.5/opencompass/datasets/charm.py +55 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/cibench.py +178 -149
- opencompass-0.2.5/opencompass/datasets/drop_simple_eval.py +80 -0
- opencompass-0.2.5/opencompass/datasets/flames.py +57 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/gpqa.py +53 -1
- opencompass-0.2.5/opencompass/datasets/llm_compression.py +36 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/math.py +15 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/mbpp.py +12 -8
- opencompass-0.2.5/opencompass/datasets/mgsm.py +78 -0
- opencompass-0.2.5/opencompass/datasets/s3eval.py +169 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/subjective/__init__.py +2 -0
- opencompass-0.2.5/opencompass/datasets/subjective/arena_hard.py +35 -0
- opencompass-0.2.5/opencompass/datasets/subjective/compassbench.py +101 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/taco.py +4 -3
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/winogrande.py +9 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/__init__.py +21 -13
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/ai360_api.py +23 -21
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/baichuan_api.py +1 -1
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/baidu_api.py +26 -9
- opencompass-0.2.4/opencompass/models/minimax_api.py → opencompass-0.2.5/opencompass/models/deepseek_api.py +62 -66
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/gemini_api.py +0 -63
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/huggingface.py +9 -2
- opencompass-0.2.5/opencompass/models/huggingface_above_v4_33.py +440 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/lagent.py +4 -3
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/lightllm_api.py +169 -4
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/lmdeploy_pytorch.py +12 -3
- opencompass-0.2.5/opencompass/models/minimax_api.py +352 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/openai_api.py +14 -141
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/qwen_api.py +1 -2
- opencompass-0.2.5/opencompass/models/stepfun_api.py +182 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/turbomind.py +29 -12
- opencompass-0.2.5/opencompass/models/turbomind_with_tf_above_v4_33.py +195 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/vllm.py +32 -20
- opencompass-0.2.5/opencompass/models/vllm_with_tf_above_v4_33.py +127 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/xunfei_api.py +149 -0
- opencompass-0.2.5/opencompass/models/yi_api.py +178 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/__init__.py +1 -0
- opencompass-0.2.5/opencompass/openicl/icl_evaluator/icl_bpc_evaluator.py +32 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py +26 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/lm_evaluator.py +8 -5
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/__init__.py +1 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_base_inferencer.py +2 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_ll_inferencer.py +27 -49
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_ppl_inferencer.py +35 -67
- opencompass-0.2.5/opencompass/openicl/icl_inferencer/icl_sw_ce_loss_inferencer.py +352 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/partitioners/__init__.py +0 -1
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/partitioners/num_worker.py +5 -3
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/registry.py +0 -8
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/runners/dlc.py +30 -22
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/runners/local.py +10 -8
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/summarizers/__init__.py +2 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/summarizers/default.py +11 -9
- opencompass-0.2.5/opencompass/summarizers/llm_compression.py +200 -0
- opencompass-0.2.5/opencompass/summarizers/multi_faceted.py +46 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/summarizers/needlebench.py +1 -1
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/summarizers/subjective/__init__.py +4 -0
- opencompass-0.2.5/opencompass/summarizers/subjective/all_obj.py +123 -0
- opencompass-0.2.5/opencompass/summarizers/subjective/arenahard.py +309 -0
- opencompass-0.2.5/opencompass/summarizers/subjective/compassbench.py +241 -0
- opencompass-0.2.5/opencompass/summarizers/subjective/flames.py +93 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/tasks/__init__.py +0 -1
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/tasks/openicl_eval.py +6 -2
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/tasks/openicl_infer.py +4 -2
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/tasks/subjective_eval.py +8 -3
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/utils/build.py +0 -1
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/utils/file.py +3 -3
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/utils/prompt.py +9 -2
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/utils/run.py +99 -89
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/utils/text_postprocessors.py +21 -15
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass.egg-info/PKG-INFO +12 -13
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass.egg-info/SOURCES.txt +23 -2
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass.egg-info/requires.txt +2 -0
- opencompass-0.2.4/opencompass/__init__.py +0 -1
- opencompass-0.2.4/opencompass/partitioners/mm_naive.py +0 -119
- opencompass-0.2.4/opencompass/tasks/mm_infer.py +0 -160
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/FinanceIQ.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/cmp_GCP_D.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/cmp_KSP.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/cmp_TSP_D.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/hard_GCP.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/hard_MSP.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/hard_TSP.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/p_BSP.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/p_EDP.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/p_SPP.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/prompts.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/utils.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/OpenFinData.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/QuALITY.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/TheoremQA/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/TheoremQA/legacy.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/TheoremQA/main.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/TheoremQA/number_utils.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/TheoremQA/utils.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/advglue.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/afqmcd.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/agieval/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/agieval/agieval.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/agieval/constructions.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/agieval/dataset_loader.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/agieval/evaluation.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/agieval/math_equivalence.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/agieval/post_process.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/agieval/utils.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/anli.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/anthropics_evals.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/apps.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/arc.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/ax.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/base.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/bbh.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/boolq.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/bustum.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/c3.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/cb.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/ceval.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/chembench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/chid.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/circular.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/civilcomments.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/clozeTest_maxmin.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/cluewsc.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/cmb.py +1 -1
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/cmmlu.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/cmnli.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/cmrc.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/commonsenseqa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/commonsenseqa_cn.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/copa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/crowspairs.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/crowspairs_cn.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/csl.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/custom.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/cvalues.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/drcd.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/drop.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/ds1000.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/ds1000_interpreter.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/eprstmt.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/flores.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/game24.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/govrepcrs.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/gsm8k.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/gsm_hard.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/hellaswag.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/huggingface.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/humaneval.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/humaneval_multi.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/humanevalx.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/hungarian_math.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/infinitebench/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_codedebug.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_coderun.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_endia.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_enmc.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_enqa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_ensum.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_mathcalc.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_mathfind.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_retrievekv.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_retrievenumber.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_retrievepasskey.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_zhqa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/infinitebench/utils.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/iwslt2017.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/jigsawmultilingual.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/jsonl.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/kaoshi.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/lambada.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/lawbench/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/lawbench/lawbench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/lcsts.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/leval/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/leval/evaluators.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/leval/leval_coursera.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/leval/leval_financial_qa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/leval/leval_gov_report_summ.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/leval/leval_gsm100.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/leval/leval_legal_contract_qa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/leval/leval_meeting_summ.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/leval/leval_multidoc_qa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/leval/leval_narrattive_qa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/leval/leval_natural_question.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/leval/leval_news_summ.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/leval/leval_paper_assistant.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/leval/leval_patent_summ.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/leval/leval_quality.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/leval/leval_review_summ.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/leval/leval_scientific_qa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/leval/leval_topic_retrieval.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/leval/leval_tpo.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/leval/leval_tvshow_summ.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/lmeval.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/evaluators.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_2wikim_qa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_dureader.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_gov_report.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_hotpot_qa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_lcc.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_lsht.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_multi_news.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_multifieldqa_en.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_multifieldqa_zh.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_musique.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_narrative_qa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_passage_count.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_passage_retrieval_en.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_passage_retrieval_zh.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_qasper.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_qmsum.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_repobench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_samsum.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_trec.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_trivia_qa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_vcsum.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/lveval/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/lveval/evaluators.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_cmrc_mixup.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_dureader_mixup.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_factrecall_en.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_factrecall_zh.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_hotpotwikiqa_mixup.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_lic_mixup.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_loogle_CR_mixup.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_loogle_MIR_mixup.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_loogle_SD_mixup.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_multifieldqa_en_mixup.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_multifieldqa_zh_mixup.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/mastermath2024v1.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/math401.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/math_intern.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/mathbench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/medbench/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/medbench/constructions.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/medbench/dataset_loader.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/medbench/evaluation.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/medbench/math_equivalence.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/medbench/medbench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/medbench/post_process.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/medbench/utils.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/mmlu.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/multirc.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/narrativeqa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/natural_question.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/natural_question_cn.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/obqa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/piqa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/py150.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/qasper.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/qaspercut.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/race.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/realtoxicprompts.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/reasonbench/ReasonBenchDataset.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/reasonbench/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/record.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/rolebench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/safety.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/scibench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/siqa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/squad20.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/storycloze.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/strategyqa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/subjective/alignbench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/subjective/compass_arena.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/subjective/corev2.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/subjective/creationbench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/subjective/information_retrival.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/subjective/mtbench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/subjective/multiround.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/subjective/subjective_cmp.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/summedits.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/summscreen.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/svamp.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/tabmwp.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/teval/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/teval/evaluators/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/teval/evaluators/instruct_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/teval/evaluators/planning_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/teval/evaluators/reason_retrieve_understand_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/teval/evaluators/review_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/teval/schema.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/teval/utils/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/teval/utils/convert_results.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/teval/utils/format_load.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/teval/utils/meta_template.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/teval/utils/template.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/tnews.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/triviaqa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/triviaqarc.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/truthfulqa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/tydiqa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/wic.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/wikibench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/winograd.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/wnli.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/wsc.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/xcopa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/xiezhi.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/xlsum.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/datasets/xsum.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/metrics/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/metrics/dump_results.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/metrics/mme_score.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/metrics/seedbench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/accessory.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/alaya.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/base.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/base_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/bytedance_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/claude_api/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/claude_api/claude_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/claude_api/postprocessors.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/glm.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/hunyuan_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/intern_model.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/krgpt_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/langchain.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/llama2.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/lmdeploy_tis.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/mistral_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/mixtral.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/modelscope.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/moonshot_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/nanbeige_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/pangu_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/sensetime_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/turbomind_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/turbomind_tis.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/unigpt_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/yayi_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/zhipuai_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/models/zhipuai_v2_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_dataset_reader.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_agent_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_aucroc_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_base_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_circular_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_em_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_jieba_rouge_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_misc_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_plugin_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_toxic_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_agent_inferencer.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_attack_inferencer.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_chat_inferencer.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_clp_inferencer.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_gen_inferencer.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_mink_percent_inferencer.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_ppl_only_inferencer.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_sc_inferencer.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_tot_inferencer.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_prompt_template.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_base_retriever.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_bm25_retriever.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_dpp_retriever.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_fix_k_retriever.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_mdl_retriever.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_random_retriever.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_topk_retriever.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_votek_retriever.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_zero_retriever.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/utils/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/openicl/utils/logging.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/partitioners/base.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/partitioners/naive.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/partitioners/size.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/partitioners/sub_naive.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/partitioners/sub_size.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/runners/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/runners/base.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/runners/local_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/runners/slurm.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/runners/slurm_sequential.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/summarizers/circular.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/summarizers/multi_model.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/summarizers/subjective/alignmentbench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/summarizers/subjective/alpacaeval.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/summarizers/subjective/compass_arena.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/summarizers/subjective/corev2.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/summarizers/subjective/creationbench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/summarizers/subjective/information_retrival.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/summarizers/subjective/mtbench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/summarizers/subjective/multiround.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/summarizers/subjective/subjective_post_process.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/summarizers/subjective/utils.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/summarizers/summarizer_pretrain.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/tasks/base.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/tasks/llm_eval.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/tasks/openicl_attack.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/utils/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/utils/abbr.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/utils/auxiliary.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/utils/collect_env.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/utils/dependency.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/utils/fileio.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/utils/lark.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/utils/logging.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/utils/menu.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass/utils/types.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass.egg-info/dependency_links.txt +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass.egg-info/entry_points.txt +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/opencompass.egg-info/top_level.txt +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/setup.cfg +0 -0
- {opencompass-0.2.4 → opencompass-0.2.5}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: opencompass
|
|
3
|
-
Version: 0.2.4
|
|
3
|
+
Version: 0.2.5
|
|
4
4
|
Summary: A comprehensive toolkit for large model evaluation
|
|
5
5
|
Home-page: https://github.com/open-compass/opencompass
|
|
6
6
|
Author: OpenCompass Contributors
|
|
@@ -78,6 +78,11 @@ Description: <div align="center">
|
|
|
78
78
|
|
|
79
79
|
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
|
|
80
80
|
|
|
81
|
+
- **\[2024.05.08\]** We supported the evaluation of 4 MoE models: [Mixtral-8x22B-v0.1](configs/models/mixtral/hf_mixtral_8x22b_v0_1.py), [Mixtral-8x22B-Instruct-v0.1](configs/models/mixtral/hf_mixtral_8x22b_instruct_v0_1.py), [Qwen1.5-MoE-A2.7B](configs/models/qwen/hf_qwen1_5_moe_a2_7b.py), [Qwen1.5-MoE-A2.7B-Chat](configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py). Try them out now!
|
|
82
|
+
- **\[2024.04.30\]** We supported evaluating a model's compression efficiency by calculating its Bits per Character (BPC) metric on [external corpora](configs/datasets/llm_compression/README.md) ([official paper](https://github.com/hkust-nlp/llm-compression-intelligence)). Check out the [llm-compression](configs/eval_llm_compression.py) evaluation config now! 🔥🔥🔥
|
|
83
|
+
- **\[2024.04.29\]** We reported the performance of several well-known LLMs on common benchmarks; see the [documentation](https://opencompass.readthedocs.io/en/latest/user_guides/corebench.html) for more information! 🔥🔥🔥
|
|
84
|
+
- **\[2024.04.26\]** We deprecated the multi-modality evaluation function in OpenCompass; the related implementation has moved to [VLMEvalKit](https://github.com/open-compass/VLMEvalKit). Welcome to use it! 🔥🔥🔥
|
|
85
|
+
- **\[2024.04.26\]** We supported the evaluation of [ArenaHard](configs/eval_subjective_arena_hard.py), welcome to try! 🔥🔥🔥
|
|
81
86
|
- **\[2024.04.22\]** We supported the evaluation of [LLaMA3](configs/models/hf_llama/hf_llama3_8b.py) and [LLaMA3-Instruct](configs/models/hf_llama/hf_llama3_8b_instruct.py), welcome to try! 🔥🔥🔥
|
|
82
87
|
- **\[2024.02.29\]** We supported MT-Bench, AlpacaEval and AlignBench; more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/subjective_evaluation.html)
|
|
83
88
|
- **\[2024.01.30\]** We release OpenCompass 2.0. Click [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home) for more information !
|
|
@@ -130,7 +135,7 @@ Description: <div align="center">
|
|
|
130
135
|
git clone https://github.com/open-compass/opencompass opencompass
|
|
131
136
|
cd opencompass
|
|
132
137
|
pip install -e .
|
|
133
|
-
# also please install
|
|
138
|
+
# also please install requirements packages via `pip install -r requirements/api.txt` for API models if needed.
|
|
134
139
|
```
|
|
135
140
|
|
|
136
141
|
### 📂 Data Preparation
|
|
@@ -165,19 +170,13 @@ Description: <div align="center">
|
|
|
165
170
|
You can also evaluate other HuggingFace models via command line. Taking LLaMA-7b as an example:
|
|
166
171
|
|
|
167
172
|
```bash
|
|
168
|
-
python run.py --datasets ceval_ppl mmlu_ppl
|
|
169
|
-
--hf-path huggyllama/llama-7b \ # HuggingFace model path
|
|
170
|
-
--model-kwargs device_map='auto' \ # Arguments for model construction
|
|
171
|
-
--tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \ # Arguments for tokenizer construction
|
|
172
|
-
--max-out-len 100 \ # Maximum number of tokens generated
|
|
173
|
-
--max-seq-len 2048 \ # Maximum sequence length the model can accept
|
|
174
|
-
--batch-size 8 \ # Batch size
|
|
175
|
-
--no-batch-padding \ # Don't enable batch padding, infer through for loop to avoid performance loss
|
|
176
|
-
--num-gpus 1 # Number of minimum required GPUs
|
|
173
|
+
python run.py --datasets ceval_ppl mmlu_ppl --hf-type base --hf-path huggyllama/llama-7b
|
|
177
174
|
```
|
|
178
175
|
|
|
179
|
-
>
|
|
180
|
-
>
|
|
176
|
+
> \[!TIP\]
|
|
177
|
+
>
|
|
178
|
+
> Configurations with `_ppl` are typically designed for base models.
|
|
179
|
+
> Configurations with `_gen` can be used for both base models and chat models.
|
|
181
180
|
|
|
182
181
|
Through the command line or configuration files, OpenCompass also supports evaluating APIs or custom models, as well as more diversified evaluation strategies. Please read the [Quick Start](https://opencompass.readthedocs.io/en/latest/get_started/quick_start.html) to learn how to run an evaluation task.
|
|
183
182
|
|
|
@@ -70,6 +70,11 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
|
|
|
70
70
|
|
|
71
71
|
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
|
|
72
72
|
|
|
73
|
+
- **\[2024.05.08\]** We supported the evaluation of 4 MoE models: [Mixtral-8x22B-v0.1](configs/models/mixtral/hf_mixtral_8x22b_v0_1.py), [Mixtral-8x22B-Instruct-v0.1](configs/models/mixtral/hf_mixtral_8x22b_instruct_v0_1.py), [Qwen1.5-MoE-A2.7B](configs/models/qwen/hf_qwen1_5_moe_a2_7b.py), [Qwen1.5-MoE-A2.7B-Chat](configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py). Try them out now!
|
|
74
|
+
- **\[2024.04.30\]** We supported evaluating a model's compression efficiency by calculating its Bits per Character (BPC) metric on [external corpora](configs/datasets/llm_compression/README.md) ([official paper](https://github.com/hkust-nlp/llm-compression-intelligence)). Check out the [llm-compression](configs/eval_llm_compression.py) evaluation config now! 🔥🔥🔥
|
|
75
|
+
- **\[2024.04.29\]** We reported the performance of several well-known LLMs on common benchmarks; see the [documentation](https://opencompass.readthedocs.io/en/latest/user_guides/corebench.html) for more information! 🔥🔥🔥
|
|
76
|
+
- **\[2024.04.26\]** We deprecated the multi-modality evaluation function in OpenCompass; the related implementation has moved to [VLMEvalKit](https://github.com/open-compass/VLMEvalKit). Welcome to use it! 🔥🔥🔥
|
|
77
|
+
- **\[2024.04.26\]** We supported the evaluation of [ArenaHard](configs/eval_subjective_arena_hard.py), welcome to try! 🔥🔥🔥
|
|
73
78
|
- **\[2024.04.22\]** We supported the evaluation of [LLaMA3](configs/models/hf_llama/hf_llama3_8b.py) and [LLaMA3-Instruct](configs/models/hf_llama/hf_llama3_8b_instruct.py), welcome to try! 🔥🔥🔥
|
|
74
79
|
- **\[2024.02.29\]** We supported MT-Bench, AlpacaEval and AlignBench; more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/subjective_evaluation.html)
|
|
75
80
|
- **\[2024.01.30\]** We release OpenCompass 2.0. Click [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home) for more information !
|
|
@@ -122,7 +127,7 @@ conda activate opencompass
|
|
|
122
127
|
git clone https://github.com/open-compass/opencompass opencompass
|
|
123
128
|
cd opencompass
|
|
124
129
|
pip install -e .
|
|
125
|
-
# also please install
|
|
130
|
+
# also please install requirements packages via `pip install -r requirements/api.txt` for API models if needed.
|
|
126
131
|
```
|
|
127
132
|
|
|
128
133
|
### 📂 Data Preparation
|
|
@@ -157,19 +162,13 @@ python tools/list_configs.py llama mmlu
|
|
|
157
162
|
You can also evaluate other HuggingFace models via command line. Taking LLaMA-7b as an example:
|
|
158
163
|
|
|
159
164
|
```bash
|
|
160
|
-
python run.py --datasets ceval_ppl mmlu_ppl
|
|
161
|
-
--hf-path huggyllama/llama-7b \ # HuggingFace model path
|
|
162
|
-
--model-kwargs device_map='auto' \ # Arguments for model construction
|
|
163
|
-
--tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \ # Arguments for tokenizer construction
|
|
164
|
-
--max-out-len 100 \ # Maximum number of tokens generated
|
|
165
|
-
--max-seq-len 2048 \ # Maximum sequence length the model can accept
|
|
166
|
-
--batch-size 8 \ # Batch size
|
|
167
|
-
--no-batch-padding \ # Don't enable batch padding, infer through for loop to avoid performance loss
|
|
168
|
-
--num-gpus 1 # Number of minimum required GPUs
|
|
165
|
+
python run.py --datasets ceval_ppl mmlu_ppl --hf-type base --hf-path huggyllama/llama-7b
|
|
169
166
|
```
|
|
170
167
|
|
|
171
|
-
>
|
|
172
|
-
>
|
|
168
|
+
> \[!TIP\]
|
|
169
|
+
>
|
|
170
|
+
> Configurations with `_ppl` are typically designed for base models.
|
|
171
|
+
> Configurations with `_gen` can be used for both base models and chat models.
|
|
173
172
|
|
|
174
173
|
Through the command line or configuration files, OpenCompass also supports evaluating APIs or custom models, as well as more diversified evaluation strategies. Please read the [Quick Start](https://opencompass.readthedocs.io/en/latest/get_started/quick_start.html) to learn how to run an evaluation task.
|
|
175
174
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = '0.2.5'
|
|
@@ -91,34 +91,51 @@ class GaokaoBenchEvaluator(BaseEvaluator):
|
|
|
91
91
|
]:
|
|
92
92
|
return {'score': 0}
|
|
93
93
|
elif self.question_type == 'multi_choice':
|
|
94
|
+
details = {}
|
|
94
95
|
correct_score, total_score = 0, 0
|
|
95
|
-
for pred, refr in zip(predictions, references):
|
|
96
|
+
for index, (pred, refr) in enumerate(zip(predictions, references)):
|
|
96
97
|
pred = self.do_predictions_postprocess(pred)
|
|
97
98
|
pred = self.ensure_same_length(pred, refr)
|
|
99
|
+
is_corrects = []
|
|
98
100
|
for p, r in zip(pred, refr):
|
|
99
101
|
if p == r:
|
|
100
102
|
correct_score += 2
|
|
103
|
+
is_corrects.append(True)
|
|
101
104
|
else:
|
|
102
105
|
for i in p:
|
|
103
106
|
if i not in r:
|
|
104
107
|
break
|
|
105
108
|
else:
|
|
106
109
|
correct_score += 1
|
|
110
|
+
is_corrects.append(False)
|
|
107
111
|
total_score += 2
|
|
108
|
-
|
|
112
|
+
details[str(index)] = {
|
|
113
|
+
'pred': pred,
|
|
114
|
+
'refr': refr,
|
|
115
|
+
'is_correct': all(is_corrects),
|
|
116
|
+
}
|
|
117
|
+
|
|
109
118
|
else:
|
|
119
|
+
details = {}
|
|
110
120
|
correct_score, total_score = 0, 0
|
|
111
|
-
for pred, refr in zip(predictions, references):
|
|
121
|
+
for index, (pred, refr) in enumerate(zip(predictions, references)):
|
|
112
122
|
if self.question_type == 'multi_question_choice':
|
|
113
123
|
pred = self.do_predictions_postprocess(pred, len(refr))
|
|
114
124
|
else:
|
|
115
125
|
pred = self.do_predictions_postprocess(pred)
|
|
116
126
|
pred = self.ensure_same_length(pred, refr)
|
|
127
|
+
is_corrects = []
|
|
117
128
|
for p, r in zip(pred, refr):
|
|
118
|
-
|
|
119
|
-
|
|
129
|
+
is_correct = p == r
|
|
130
|
+
correct_score += is_correct
|
|
120
131
|
total_score += 1
|
|
121
|
-
|
|
132
|
+
is_corrects.append(is_correct)
|
|
133
|
+
details[str(index)] = {
|
|
134
|
+
'pred': pred,
|
|
135
|
+
'refr': refr,
|
|
136
|
+
'is_correct': all(is_corrects),
|
|
137
|
+
}
|
|
138
|
+
return {'score': correct_score / total_score * 100, 'details': details}
|
|
122
139
|
|
|
123
140
|
|
|
124
141
|
for question_type in valid_gaokao_bench_question_types:
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import os.path as osp
|
|
3
|
+
|
|
4
|
+
from datasets import Dataset, DatasetDict
|
|
5
|
+
|
|
6
|
+
from opencompass.registry import LOAD_DATASET
|
|
7
|
+
|
|
8
|
+
from .base import BaseDataset
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@LOAD_DATASET.register_module()
class MMLUArabicDataset(BaseDataset):
    """Loader that reads per-subject MMLU-Arabic CSV files into a DatasetDict."""

    @staticmethod
    def load(path: str, name: str):
        """Read the ``dev`` and ``test`` CSV files for one subject.

        Args:
            path: Root directory that contains the ``dev`` and ``test``
                sub-directories.
            name: Subject name; each split is read from
                ``<path>/<split>/<name>_<split>.csv``.

        Returns:
            A ``DatasetDict`` with one ``Dataset`` per split. Every record
            has the keys ``input``, ``A``, ``B``, ``C``, ``D`` and
            ``target``, filled from the six CSV columns in that order.

        Raises:
            AssertionError: If a CSV row does not have exactly six columns.
        """
        dataset = DatasetDict()
        for split in ['dev', 'test']:
            raw_data = []
            filename = osp.join(path, split, f'{name}_{split}.csv')
            # newline='' is what the csv module requires of the file object;
            # without it, newlines embedded in quoted fields are not
            # interpreted correctly.
            with open(filename, encoding='utf-8', newline='') as f:
                reader = csv.reader(f)
                for row in reader:
                    # Fail loudly, and say where, instead of silently
                    # mis-mapping columns.
                    assert len(row) == 6, (
                        f'{filename}: expected 6 columns, got {len(row)}')
                    raw_data.append({
                        'input': row[0],
                        'A': row[1],
                        'B': row[2],
                        'C': row[3],
                        'D': row[4],
                        'target': row[5],
                    })
            dataset[split] = Dataset.from_list(raw_data)
        return dataset
|
|
@@ -12,6 +12,7 @@ from .bustum import * # noqa: F401, F403
|
|
|
12
12
|
from .c3 import * # noqa: F401, F403
|
|
13
13
|
from .cb import * # noqa: F401, F403
|
|
14
14
|
from .ceval import * # noqa: F401, F403
|
|
15
|
+
from .charm import * # noqa: F401, F403
|
|
15
16
|
from .chembench import * # noqa: F401, F403
|
|
16
17
|
from .chid import * # noqa: F401, F403
|
|
17
18
|
from .cibench import * # noqa: F401, F403
|
|
@@ -33,10 +34,12 @@ from .custom import * # noqa: F401, F403
|
|
|
33
34
|
from .cvalues import * # noqa: F401, F403
|
|
34
35
|
from .drcd import * # noqa: F401, F403
|
|
35
36
|
from .drop import * # noqa: F401, F403
|
|
37
|
+
from .drop_simple_eval import * # noqa: F401, F403
|
|
36
38
|
from .ds1000 import * # noqa: F401, F403
|
|
37
39
|
from .ds1000_interpreter import * # noqa: F401, F403
|
|
38
40
|
from .eprstmt import * # noqa: F401, F403
|
|
39
41
|
from .FinanceIQ import * # noqa: F401, F403
|
|
42
|
+
from .flames import * # noqa: F401, F403
|
|
40
43
|
from .flores import * # noqa: F401, F403
|
|
41
44
|
from .game24 import * # noqa: F401, F403
|
|
42
45
|
from .GaokaoBench import * # noqa: F401, F403
|
|
@@ -59,6 +62,7 @@ from .lambada import * # noqa: F401, F403
|
|
|
59
62
|
from .lawbench import * # noqa: F401, F403
|
|
60
63
|
from .lcsts import * # noqa: F401, F403
|
|
61
64
|
from .leval import * # noqa: F401, F403
|
|
65
|
+
from .llm_compression import LLMCompressionDataset # noqa: F401, F403
|
|
62
66
|
from .longbench import * # noqa: F401, F403
|
|
63
67
|
from .lveval import * # noqa: F401, F403
|
|
64
68
|
from .mastermath2024v1 import * # noqa: F401, F403
|
|
@@ -68,7 +72,9 @@ from .math_intern import * # noqa: F401, F403
|
|
|
68
72
|
from .mathbench import * # noqa: F401, F403
|
|
69
73
|
from .mbpp import * # noqa: F401, F403
|
|
70
74
|
from .medbench import * # noqa: F401, F403
|
|
75
|
+
from .mgsm import * # noqa: F401, F403
|
|
71
76
|
from .mmlu import * # noqa: F401, F403
|
|
77
|
+
from .MMLUArabic import * # noqa: F401, F403
|
|
72
78
|
from .multirc import * # noqa: F401, F403
|
|
73
79
|
from .narrativeqa import * # noqa: F401, F403
|
|
74
80
|
from .natural_question import * # noqa: F401, F403
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os.path as osp
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
from datasets import Dataset
|
|
6
|
+
|
|
7
|
+
from opencompass.openicl.icl_evaluator import BaseEvaluator
|
|
8
|
+
from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
|
|
9
|
+
TEXT_POSTPROCESSORS)
|
|
10
|
+
|
|
11
|
+
from .base import BaseDataset
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@TEXT_POSTPROCESSORS.register_module('charm-reason')
|
|
15
|
+
def charm_reason_postprocess(text: str) -> str:
    """Extract a single capital-letter option from a model response.

    If the response contains ``'answer is '``, only the stripped segment
    that follows its first occurrence (up to the next occurrence, if any)
    is searched; otherwise the whole response is searched. A parenthesised
    capital letter such as ``(B)`` is preferred, then any capital letter.

    Args:
        text: Raw model output.

    Returns:
        The matched letter, or the searched segment unchanged when it
        contains no capital letter.
    """
    segments = text.split('answer is ')
    # split() always yields at least one piece; more than one means the
    # marker phrase was present.
    candidate = segments[1].strip() if len(segments) > 1 else text
    # Patterns are tried in priority order: "(X" form first, bare letter next.
    for pattern in (r'\(([A-Z])\)*', r'([A-Z])'):
        found = re.search(pattern, candidate)
        if found:
            return found.group(1)
    return candidate
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@ICL_EVALUATORS.register_module()
class CharmReasonEvaluator(BaseEvaluator):
    """Exact-match evaluator that reports accuracy as a percentage."""

    def score(self, predictions, references):
        """Compare predictions with references element by element.

        Args:
            predictions: Sequence of predicted answers.
            references: Sequence of reference answers, same length as
                ``predictions``.

        Returns:
            ``{'error': ...}`` when the two sequences differ in length.
            Otherwise ``{'score': <percentage of exact matches>,
            'details': [...]}``, where each detail holds the ``pred``, the
            ``answer`` and a boolean ``correct`` flag. Empty input yields
            a score of ``0.0`` with no details.
        """
        if len(predictions) != len(references):
            return {'error': 'preds and refrs have different length'}
        # Guard the division below: with no samples there is nothing to
        # score, so report zero instead of raising ZeroDivisionError.
        if len(predictions) == 0:
            return {'score': 0.0, 'details': []}
        details = []
        cnt = 0
        for pred, ref in zip(predictions, references):
            detail = {'pred': pred, 'answer': ref, 'correct': False}
            if pred == ref:
                cnt += 1
                detail['correct'] = True
            details.append(detail)
        score = cnt / len(predictions) * 100
        return {'score': score, 'details': details}
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@LOAD_DATASET.register_module()
class CharmDataset(BaseDataset):
    """Loader that builds a Dataset from the ``examples`` list of a JSON file."""

    @staticmethod
    def load(path: str, name: str):
        """Load ``<path>/<name>.json`` and return its examples as a Dataset.

        Args:
            path: Directory that contains the JSON file.
            name: File name without the ``.json`` extension.

        Returns:
            A ``Dataset`` built from the list stored under the top-level
            ``'examples'`` key of the JSON document.
        """
        # Decode explicitly as UTF-8: otherwise open() falls back to the
        # platform locale encoding, which can fail on or corrupt
        # non-ASCII text on systems whose default is not UTF-8.
        with open(osp.join(path, f'{name}.json'), 'r', encoding='utf-8') as f:
            data = json.load(f)['examples']
        dataset = Dataset.from_list(data)
        return dataset
|