opencompass 0.2.3__tar.gz → 0.2.5__tar.gz
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {opencompass-0.2.3 → opencompass-0.2.5}/PKG-INFO +68 -21
- {opencompass-0.2.3 → opencompass-0.2.5}/README.md +67 -20
- opencompass-0.2.5/opencompass/__init__.py +1 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/GaokaoBench.py +23 -6
- opencompass-0.2.5/opencompass/datasets/MMLUArabic.py +33 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/cmp_GCP_D.py +5 -1
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/cmp_TSP_D.py +5 -1
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/p_SPP.py +5 -1
- opencompass-0.2.5/opencompass/datasets/QuALITY.py +59 -0
- opencompass-0.2.5/opencompass/datasets/TheoremQA/__init__.py +4 -0
- opencompass-0.2.3/opencompass/datasets/TheoremQA.py → opencompass-0.2.5/opencompass/datasets/TheoremQA/legacy.py +1 -1
- opencompass-0.2.5/opencompass/datasets/TheoremQA/main.py +66 -0
- opencompass-0.2.5/opencompass/datasets/TheoremQA/number_utils.py +98 -0
- opencompass-0.2.5/opencompass/datasets/TheoremQA/utils.py +110 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/__init__.py +10 -0
- opencompass-0.2.5/opencompass/datasets/apps.py +877 -0
- opencompass-0.2.5/opencompass/datasets/charm.py +55 -0
- opencompass-0.2.5/opencompass/datasets/chembench.py +34 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/cibench.py +178 -149
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/custom.py +10 -0
- opencompass-0.2.5/opencompass/datasets/drop_simple_eval.py +80 -0
- opencompass-0.2.5/opencompass/datasets/flames.py +57 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/gpqa.py +53 -1
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/humanevalx.py +4 -1
- opencompass-0.2.5/opencompass/datasets/llm_compression.py +36 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/math.py +34 -6
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/mathbench.py +1 -1
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/mbpp.py +53 -44
- opencompass-0.2.5/opencompass/datasets/mgsm.py +78 -0
- opencompass-0.2.5/opencompass/datasets/s3eval.py +169 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/subjective/__init__.py +2 -0
- opencompass-0.2.5/opencompass/datasets/subjective/arena_hard.py +35 -0
- opencompass-0.2.5/opencompass/datasets/subjective/compassbench.py +101 -0
- opencompass-0.2.5/opencompass/datasets/taco.py +824 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/winogrande.py +9 -0
- opencompass-0.2.5/opencompass/models/__init__.py +47 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/ai360_api.py +27 -25
- opencompass-0.2.5/opencompass/models/baichuan_api.py +283 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/baidu_api.py +30 -13
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/base.py +2 -2
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/base_api.py +4 -4
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/bytedance_api.py +4 -4
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/claude_api/claude_api.py +4 -4
- opencompass-0.2.3/opencompass/models/minimax_api.py → opencompass-0.2.5/opencompass/models/deepseek_api.py +66 -70
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/gemini_api.py +4 -67
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/huggingface.py +10 -3
- opencompass-0.2.5/opencompass/models/huggingface_above_v4_33.py +440 -0
- opencompass-0.2.5/opencompass/models/hunyuan_api.py +121 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/lagent.py +4 -3
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/lightllm_api.py +169 -4
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/llama2.py +1 -1
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/lmdeploy_pytorch.py +14 -5
- opencompass-0.2.5/opencompass/models/lmdeploy_tis.py +200 -0
- opencompass-0.2.5/opencompass/models/minimax_api.py +352 -0
- opencompass-0.2.5/opencompass/models/mistral_api.py +123 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/moonshot_api.py +24 -26
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/nanbeige_api.py +4 -4
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/openai_api.py +44 -147
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/pangu_api.py +4 -4
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/qwen_api.py +27 -14
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/sensetime_api.py +14 -9
- opencompass-0.2.5/opencompass/models/stepfun_api.py +182 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/turbomind.py +59 -14
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/turbomind_api.py +4 -4
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/turbomind_tis.py +10 -4
- opencompass-0.2.5/opencompass/models/turbomind_with_tf_above_v4_33.py +195 -0
- opencompass-0.2.5/opencompass/models/unigpt_api.py +147 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/vllm.py +38 -20
- opencompass-0.2.5/opencompass/models/vllm_with_tf_above_v4_33.py +127 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/xunfei_api.py +153 -4
- opencompass-0.2.5/opencompass/models/yayi_api.py +261 -0
- opencompass-0.2.3/opencompass/models/baichuan_api.py → opencompass-0.2.5/opencompass/models/yi_api.py +67 -48
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/zhipuai_api.py +4 -4
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/zhipuai_v2_api.py +12 -6
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/__init__.py +1 -0
- opencompass-0.2.5/opencompass/openicl/icl_evaluator/icl_bpc_evaluator.py +32 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py +26 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/lm_evaluator.py +65 -28
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/__init__.py +1 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_base_inferencer.py +2 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_chat_inferencer.py +2 -14
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_ll_inferencer.py +27 -49
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_ppl_inferencer.py +35 -67
- opencompass-0.2.5/opencompass/openicl/icl_inferencer/icl_sw_ce_loss_inferencer.py +352 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_prompt_template.py +4 -4
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/partitioners/__init__.py +0 -1
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/partitioners/base.py +18 -7
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/partitioners/num_worker.py +5 -3
- opencompass-0.2.5/opencompass/partitioners/sub_naive.py +220 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/partitioners/sub_size.py +29 -6
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/registry.py +15 -9
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/runners/base.py +2 -1
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/runners/dlc.py +57 -22
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/runners/local.py +18 -2
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/__init__.py +2 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/default.py +10 -8
- opencompass-0.2.5/opencompass/summarizers/llm_compression.py +200 -0
- opencompass-0.2.5/opencompass/summarizers/multi_faceted.py +46 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/needlebench.py +234 -173
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/subjective/__init__.py +4 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/subjective/alignmentbench.py +43 -33
- opencompass-0.2.3/opencompass/summarizers/subjective/mtbench.py → opencompass-0.2.5/opencompass/summarizers/subjective/all_obj.py +31 -54
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/subjective/alpacaeval.py +2 -1
- opencompass-0.2.5/opencompass/summarizers/subjective/arenahard.py +309 -0
- opencompass-0.2.5/opencompass/summarizers/subjective/compass_arena.py +240 -0
- opencompass-0.2.5/opencompass/summarizers/subjective/compassbench.py +241 -0
- opencompass-0.2.5/opencompass/summarizers/subjective/flames.py +93 -0
- opencompass-0.2.5/opencompass/summarizers/subjective/mtbench.py +153 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/subjective/multiround.py +2 -1
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/tasks/__init__.py +0 -1
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/tasks/openicl_eval.py +9 -4
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/tasks/openicl_infer.py +16 -7
- opencompass-0.2.5/opencompass/tasks/subjective_eval.py +443 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/abbr.py +22 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/file.py +3 -3
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/prompt.py +14 -7
- opencompass-0.2.5/opencompass/utils/run.py +350 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/text_postprocessors.py +21 -15
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass.egg-info/PKG-INFO +68 -21
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass.egg-info/SOURCES.txt +38 -3
- opencompass-0.2.5/opencompass.egg-info/entry_points.txt +3 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass.egg-info/requires.txt +4 -1
- {opencompass-0.2.3 → opencompass-0.2.5}/setup.py +40 -33
- opencompass-0.2.3/opencompass/__init__.py +0 -1
- opencompass-0.2.3/opencompass/models/__init__.py +0 -34
- opencompass-0.2.3/opencompass/partitioners/mm_naive.py +0 -119
- opencompass-0.2.3/opencompass/partitioners/sub_naive.py +0 -110
- opencompass-0.2.3/opencompass/summarizers/subjective/compass_arena.py +0 -204
- opencompass-0.2.3/opencompass/tasks/mm_infer.py +0 -160
- opencompass-0.2.3/opencompass/tasks/subjective_eval.py +0 -282
- opencompass-0.2.3/opencompass/utils/run.py +0 -212
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/FinanceIQ.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/cmp_KSP.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/hard_GCP.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/hard_MSP.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/hard_TSP.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/p_BSP.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/p_EDP.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/prompts.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/NPHardEval/utils.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/OpenFinData.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/advglue.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/afqmcd.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/agieval/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/agieval/agieval.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/agieval/constructions.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/agieval/dataset_loader.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/agieval/evaluation.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/agieval/math_equivalence.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/agieval/post_process.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/agieval/utils.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/anli.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/anthropics_evals.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/arc.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/ax.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/base.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/bbh.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/boolq.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/bustum.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/c3.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/cb.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/ceval.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/chid.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/circular.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/civilcomments.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/clozeTest_maxmin.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/cluewsc.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/cmb.py +1 -1
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/cmmlu.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/cmnli.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/cmrc.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/commonsenseqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/commonsenseqa_cn.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/copa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/crowspairs.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/crowspairs_cn.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/csl.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/cvalues.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/drcd.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/drop.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/ds1000.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/ds1000_interpreter.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/eprstmt.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/flores.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/game24.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/govrepcrs.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/gsm8k.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/gsm_hard.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/hellaswag.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/huggingface.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/humaneval.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/humaneval_multi.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/hungarian_math.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_codedebug.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_coderun.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_endia.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_enmc.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_enqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_ensum.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_mathcalc.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_mathfind.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_retrievekv.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_retrievenumber.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_retrievepasskey.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/infinitebench_zhqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/infinitebench/utils.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/iwslt2017.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/jigsawmultilingual.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/jsonl.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/kaoshi.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lambada.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lawbench/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lawbench/lawbench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lcsts.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/evaluators.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_coursera.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_financial_qa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_gov_report_summ.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_gsm100.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_legal_contract_qa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_meeting_summ.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_multidoc_qa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_narrattive_qa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_natural_question.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_news_summ.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_paper_assistant.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_patent_summ.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_quality.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_review_summ.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_scientific_qa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_topic_retrieval.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_tpo.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/leval/leval_tvshow_summ.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lmeval.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/evaluators.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_2wikim_qa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_dureader.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_gov_report.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_hotpot_qa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_lcc.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_lsht.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_multi_news.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_multifieldqa_en.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_multifieldqa_zh.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_musique.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_narrative_qa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_passage_count.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_passage_retrieval_en.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_passage_retrieval_zh.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_qasper.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_qmsum.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_repobench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_samsum.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_trec.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_trivia_qa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/longbench/longbench_vcsum.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/evaluators.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_cmrc_mixup.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_dureader_mixup.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_factrecall_en.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_factrecall_zh.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_hotpotwikiqa_mixup.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_lic_mixup.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_loogle_CR_mixup.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_loogle_MIR_mixup.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_loogle_SD_mixup.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_multifieldqa_en_mixup.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/lveval/lveval_multifieldqa_zh_mixup.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/mastermath2024v1.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/math401.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/math_intern.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/medbench/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/medbench/constructions.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/medbench/dataset_loader.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/medbench/evaluation.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/medbench/math_equivalence.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/medbench/medbench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/medbench/post_process.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/medbench/utils.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/mmlu.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/multirc.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/narrativeqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/natural_question.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/natural_question_cn.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/obqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/piqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/py150.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/qasper.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/qaspercut.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/race.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/realtoxicprompts.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/reasonbench/ReasonBenchDataset.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/reasonbench/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/record.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/rolebench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/safety.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/scibench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/siqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/squad20.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/storycloze.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/strategyqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/subjective/alignbench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/subjective/compass_arena.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/subjective/corev2.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/subjective/creationbench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/subjective/information_retrival.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/subjective/mtbench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/subjective/multiround.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/subjective/subjective_cmp.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/summedits.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/summscreen.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/svamp.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/tabmwp.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/evaluators/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/evaluators/instruct_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/evaluators/planning_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/evaluators/reason_retrieve_understand_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/evaluators/review_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/schema.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/utils/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/utils/convert_results.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/utils/format_load.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/utils/meta_template.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/teval/utils/template.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/tnews.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/triviaqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/triviaqarc.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/truthfulqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/tydiqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/wic.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/wikibench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/winograd.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/wnli.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/wsc.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/xcopa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/xiezhi.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/xlsum.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/datasets/xsum.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/metrics/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/metrics/dump_results.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/metrics/mme_score.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/metrics/seedbench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/accessory.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/alaya.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/claude_api/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/claude_api/postprocessors.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/glm.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/intern_model.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/krgpt_api.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/langchain.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/mixtral.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/models/modelscope.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_dataset_reader.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_agent_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_aucroc_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_base_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_circular_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_em_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_jieba_rouge_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_misc_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_plugin_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_evaluator/icl_toxic_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_agent_inferencer.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_attack_inferencer.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_clp_inferencer.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_gen_inferencer.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_mink_percent_inferencer.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_ppl_only_inferencer.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_sc_inferencer.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_inferencer/icl_tot_inferencer.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_base_retriever.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_bm25_retriever.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_dpp_retriever.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_fix_k_retriever.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_mdl_retriever.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_random_retriever.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_topk_retriever.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_votek_retriever.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/icl_retriever/icl_zero_retriever.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/utils/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/openicl/utils/logging.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/partitioners/naive.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/partitioners/size.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/runners/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/runners/local_api.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/runners/slurm.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/runners/slurm_sequential.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/circular.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/multi_model.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/subjective/corev2.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/subjective/creationbench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/subjective/information_retrival.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/subjective/subjective_post_process.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/subjective/utils.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/summarizers/summarizer_pretrain.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/tasks/base.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/tasks/llm_eval.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/tasks/openicl_attack.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/auxiliary.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/build.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/collect_env.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/dependency.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/fileio.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/lark.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/logging.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/menu.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass/utils/types.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass.egg-info/dependency_links.txt +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/opencompass.egg-info/top_level.txt +0 -0
- {opencompass-0.2.3 → opencompass-0.2.5}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: opencompass
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.5
|
|
4
4
|
Summary: A comprehensive toolkit for large model evaluation
|
|
5
5
|
Home-page: https://github.com/open-compass/opencompass
|
|
6
6
|
Author: OpenCompass Contributors
|
|
@@ -11,8 +11,13 @@ Description: <div align="center">
|
|
|
11
11
|
<br />
|
|
12
12
|
<br />
|
|
13
13
|
|
|
14
|
-
[![
|
|
15
|
-
[![
|
|
14
|
+
[![][github-release-shield]][github-release-link]
|
|
15
|
+
[![][github-releasedate-shield]][github-releasedate-link]
|
|
16
|
+
[![][github-contributors-shield]][github-contributors-link]<br>
|
|
17
|
+
[![][github-forks-shield]][github-forks-link]
|
|
18
|
+
[![][github-stars-shield]][github-stars-link]
|
|
19
|
+
[![][github-issues-shield]][github-issues-link]
|
|
20
|
+
[![][github-license-shield]][github-license-link]
|
|
16
21
|
|
|
17
22
|
<!-- [](https://pypi.org/project/opencompass/) -->
|
|
18
23
|
|
|
@@ -25,12 +30,18 @@ Description: <div align="center">
|
|
|
25
30
|
|
|
26
31
|
English | [简体中文](README_zh-CN.md)
|
|
27
32
|
|
|
33
|
+
[![][github-trending-shield]][github-trending-url]
|
|
34
|
+
|
|
28
35
|
</div>
|
|
29
36
|
|
|
30
37
|
<p align="center">
|
|
31
38
|
👋 join us on <a href="https://discord.gg/KKwfEbFj7U" target="_blank">Discord</a> and <a href="https://r.vansin.top/?r=opencompass" target="_blank">WeChat</a>
|
|
32
39
|
</p>
|
|
33
40
|
|
|
41
|
+
> \[!IMPORTANT\]
|
|
42
|
+
>
|
|
43
|
+
> **Star Us**, You will receive all release notifications from GitHub without any delay ~ ⭐️
|
|
44
|
+
|
|
34
45
|
## 📣 OpenCompass 2.0
|
|
35
46
|
|
|
36
47
|
We are thrilled to introduce OpenCompass 2.0, an advanced suite featuring three key components: [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home).
|
|
@@ -42,6 +53,14 @@ Description: <div align="center">
|
|
|
42
53
|
|
|
43
54
|
**CompassKit** is a powerful collection of evaluation toolkits specifically tailored for Large Language Models and Large Vision-language Models. It provides an extensive set of tools to assess and measure the performance of these complex models effectively. You are welcome to try our toolkits in your research and products.
|
|
44
55
|
|
|
56
|
+
<details>
|
|
57
|
+
<summary><kbd>Star History</kbd></summary>
|
|
58
|
+
<picture>
|
|
59
|
+
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&theme=dark&type=Date">
|
|
60
|
+
<img width="100%" src="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&type=Date">
|
|
61
|
+
</picture>
|
|
62
|
+
</details>
|
|
63
|
+
|
|
45
64
|
## 🧭 Welcome
|
|
46
65
|
|
|
47
66
|
to **OpenCompass**!
|
|
@@ -59,12 +78,14 @@ Description: <div align="center">
|
|
|
59
78
|
|
|
60
79
|
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
|
|
61
80
|
|
|
62
|
-
- **\[2024.
|
|
63
|
-
- **\[2024.
|
|
64
|
-
- **\[2024.
|
|
65
|
-
- **\[2024.
|
|
66
|
-
- **\[
|
|
67
|
-
- **\[
|
|
81
|
+
- **\[2024.05.08\]** We supported the evaluation of 4 MoE models: [Mixtral-8x22B-v0.1](configs/models/mixtral/hf_mixtral_8x22b_v0_1.py), [Mixtral-8x22B-Instruct-v0.1](configs/models/mixtral/hf_mixtral_8x22b_instruct_v0_1.py), [Qwen1.5-MoE-A2.7B](configs/models/qwen/hf_qwen1_5_moe_a2_7b.py), [Qwen1.5-MoE-A2.7B-Chat](configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py). Try them out now!
|
|
82
|
+
- **\[2024.04.30\]** We supported evaluating a model's compression efficiency by calculating its Bits per Character (BPC) metric on an [external corpora](configs/datasets/llm_compression/README.md) ([official paper](https://github.com/hkust-nlp/llm-compression-intelligence)). Check out the [llm-compression](configs/eval_llm_compression.py) evaluation config now! 🔥🔥🔥
|
|
83
|
+
- **\[2024.04.29\]** We report the performance of several famous LLMs on the common benchmarks, welcome to [documentation](https://opencompass.readthedocs.io/en/latest/user_guides/corebench.html) for more information! 🔥🔥🔥.
|
|
84
|
+
- **\[2024.04.26\]** We deprecated the multi-modality evaluation function in OpenCompass; the related implementation has moved to [VLMEvalKit](https://github.com/open-compass/VLMEvalKit), welcome to use it! 🔥🔥🔥.
|
|
85
|
+
- **\[2024.04.26\]** We supported the evaluation of [ArenaHard](configs/eval_subjective_arena_hard.py) welcome to try!🔥🔥🔥.
|
|
86
|
+
- **\[2024.04.22\]** We supported the evaluation of [LLaMA3](configs/models/hf_llama/hf_llama3_8b.py) and [LLaMA3-Instruct](configs/models/hf_llama/hf_llama3_8b_instruct.py), welcome to try! 🔥🔥🔥
|
|
87
|
+
- **\[2024.02.29\]** We supported the MT-Bench, AlpacaEval and AlignBench, more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/subjective_evaluation.html)
|
|
88
|
+
- **\[2024.01.30\]** We release OpenCompass 2.0. Click [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home) for more information !
|
|
68
89
|
|
|
69
90
|
> [More](docs/en/notes/news.md)
|
|
70
91
|
|
|
@@ -114,7 +135,7 @@ Description: <div align="center">
|
|
|
114
135
|
git clone https://github.com/open-compass/opencompass opencompass
|
|
115
136
|
cd opencompass
|
|
116
137
|
pip install -e .
|
|
117
|
-
# also please install
|
|
138
|
+
# also please install requirements packages via `pip install -r requirements/api.txt` for API models if needed.
|
|
118
139
|
```
|
|
119
140
|
|
|
120
141
|
### 📂 Data Preparation
|
|
@@ -149,19 +170,13 @@ Description: <div align="center">
|
|
|
149
170
|
You can also evaluate other HuggingFace models via command line. Taking LLaMA-7b as an example:
|
|
150
171
|
|
|
151
172
|
```bash
|
|
152
|
-
python run.py --datasets ceval_ppl mmlu_ppl
|
|
153
|
-
--hf-path huggyllama/llama-7b \ # HuggingFace model path
|
|
154
|
-
--model-kwargs device_map='auto' \ # Arguments for model construction
|
|
155
|
-
--tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \ # Arguments for tokenizer construction
|
|
156
|
-
--max-out-len 100 \ # Maximum number of tokens generated
|
|
157
|
-
--max-seq-len 2048 \ # Maximum sequence length the model can accept
|
|
158
|
-
--batch-size 8 \ # Batch size
|
|
159
|
-
--no-batch-padding \ # Don't enable batch padding, infer through for loop to avoid performance loss
|
|
160
|
-
--num-gpus 1 # Number of minimum required GPUs
|
|
173
|
+
python run.py --datasets ceval_ppl mmlu_ppl --hf-type base --hf-path huggyllama/llama-7b
|
|
161
174
|
```
|
|
162
175
|
|
|
163
|
-
>
|
|
164
|
-
>
|
|
176
|
+
> \[!TIP\]
|
|
177
|
+
>
|
|
178
|
+
> configuration with `_ppl` is designed for base model typically.
|
|
179
|
+
> configuration with `_gen` can be used for both base model and chat model.
|
|
165
180
|
|
|
166
181
|
Through the command line or configuration files, OpenCompass also supports evaluating APIs or custom models, as well as more diversified evaluation strategies. Please read the [Quick Start](https://opencompass.readthedocs.io/en/latest/get_started/quick_start.html) to learn how to run an evaluation task.
|
|
167
182
|
|
|
@@ -447,6 +462,7 @@ Description: <div align="center">
|
|
|
447
462
|
|
|
448
463
|
- [InternLM](https://github.com/InternLM/InternLM)
|
|
449
464
|
- [LLaMA](https://github.com/facebookresearch/llama)
|
|
465
|
+
- [LLaMA3](https://github.com/meta-llama/llama3)
|
|
450
466
|
- [Vicuna](https://github.com/lm-sys/FastChat)
|
|
451
467
|
- [Alpaca](https://github.com/tatsu-lab/stanford_alpaca)
|
|
452
468
|
- [Baichuan](https://github.com/baichuan-inc)
|
|
@@ -505,6 +521,20 @@ Description: <div align="center">
|
|
|
505
521
|
|
|
506
522
|
We appreciate all contributions to improving OpenCompass. Please refer to the [contributing guideline](https://opencompass.readthedocs.io/en/latest/notes/contribution_guide.html) for the best practice.
|
|
507
523
|
|
|
524
|
+
<!-- Copy-paste in your Readme.md file -->
|
|
525
|
+
|
|
526
|
+
<!-- Made with [OSS Insight](https://ossinsight.io/) -->
|
|
527
|
+
|
|
528
|
+
<a href="https://github.com/open-compass/opencompass/graphs/contributors" target="_blank">
|
|
529
|
+
<table>
|
|
530
|
+
<tr>
|
|
531
|
+
<th colspan="2">
|
|
532
|
+
<br><img src="https://contrib.rocks/image?repo=open-compass/opencompass"><br><br>
|
|
533
|
+
</th>
|
|
534
|
+
</tr>
|
|
535
|
+
</table>
|
|
536
|
+
</a>
|
|
537
|
+
|
|
508
538
|
## 🤝 Acknowledgements
|
|
509
539
|
|
|
510
540
|
Some code in this project is cited and modified from [OpenICL](https://github.com/Shark-NLP/OpenICL).
|
|
@@ -524,6 +554,23 @@ Description: <div align="center">
|
|
|
524
554
|
|
|
525
555
|
<p align="right"><a href="#top">🔝Back to top</a></p>
|
|
526
556
|
|
|
557
|
+
[github-contributors-link]: https://github.com/open-compass/opencompass/graphs/contributors
|
|
558
|
+
[github-contributors-shield]: https://img.shields.io/github/contributors/open-compass/opencompass?color=c4f042&labelColor=black&style=flat-square
|
|
559
|
+
[github-forks-link]: https://github.com/open-compass/opencompass/network/members
|
|
560
|
+
[github-forks-shield]: https://img.shields.io/github/forks/open-compass/opencompass?color=8ae8ff&labelColor=black&style=flat-square
|
|
561
|
+
[github-issues-link]: https://github.com/open-compass/opencompass/issues
|
|
562
|
+
[github-issues-shield]: https://img.shields.io/github/issues/open-compass/opencompass?color=ff80eb&labelColor=black&style=flat-square
|
|
563
|
+
[github-license-link]: https://github.com/open-compass/opencompass/blob/main/LICENSE
|
|
564
|
+
[github-license-shield]: https://img.shields.io/github/license/open-compass/opencompass?color=white&labelColor=black&style=flat-square
|
|
565
|
+
[github-release-link]: https://github.com/open-compass/opencompass/releases
|
|
566
|
+
[github-release-shield]: https://img.shields.io/github/v/release/open-compass/opencompass?color=369eff&labelColor=black&logo=github&style=flat-square
|
|
567
|
+
[github-releasedate-link]: https://github.com/open-compass/opencompass/releases
|
|
568
|
+
[github-releasedate-shield]: https://img.shields.io/github/release-date/open-compass/opencompass?labelColor=black&style=flat-square
|
|
569
|
+
[github-stars-link]: https://github.com/open-compass/opencompass/stargazers
|
|
570
|
+
[github-stars-shield]: https://img.shields.io/github/stars/open-compass/opencompass?color=ffcb47&labelColor=black&style=flat-square
|
|
571
|
+
[github-trending-shield]: https://trendshift.io/api/badge/repositories/6630
|
|
572
|
+
[github-trending-url]: https://trendshift.io/repositories/6630
|
|
573
|
+
|
|
527
574
|
Keywords: AI,NLP,in-context learning,large language model,evaluation,benchmark,llm
|
|
528
575
|
Platform: UNKNOWN
|
|
529
576
|
Classifier: Programming Language :: Python :: 3.8
|
|
@@ -3,8 +3,13 @@
|
|
|
3
3
|
<br />
|
|
4
4
|
<br />
|
|
5
5
|
|
|
6
|
-
[![
|
|
7
|
-
[![
|
|
6
|
+
[![][github-release-shield]][github-release-link]
|
|
7
|
+
[![][github-releasedate-shield]][github-releasedate-link]
|
|
8
|
+
[![][github-contributors-shield]][github-contributors-link]<br>
|
|
9
|
+
[![][github-forks-shield]][github-forks-link]
|
|
10
|
+
[![][github-stars-shield]][github-stars-link]
|
|
11
|
+
[![][github-issues-shield]][github-issues-link]
|
|
12
|
+
[![][github-license-shield]][github-license-link]
|
|
8
13
|
|
|
9
14
|
<!-- [](https://pypi.org/project/opencompass/) -->
|
|
10
15
|
|
|
@@ -17,12 +22,18 @@
|
|
|
17
22
|
|
|
18
23
|
English | [简体中文](README_zh-CN.md)
|
|
19
24
|
|
|
25
|
+
[![][github-trending-shield]][github-trending-url]
|
|
26
|
+
|
|
20
27
|
</div>
|
|
21
28
|
|
|
22
29
|
<p align="center">
|
|
23
30
|
👋 join us on <a href="https://discord.gg/KKwfEbFj7U" target="_blank">Discord</a> and <a href="https://r.vansin.top/?r=opencompass" target="_blank">WeChat</a>
|
|
24
31
|
</p>
|
|
25
32
|
|
|
33
|
+
> \[!IMPORTANT\]
|
|
34
|
+
>
|
|
35
|
+
> **Star Us**, You will receive all release notifications from GitHub without any delay ~ ⭐️
|
|
36
|
+
|
|
26
37
|
## 📣 OpenCompass 2.0
|
|
27
38
|
|
|
28
39
|
We are thrilled to introduce OpenCompass 2.0, an advanced suite featuring three key components: [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home).
|
|
@@ -34,6 +45,14 @@ We are thrilled to introduce OpenCompass 2.0, an advanced suite featuring three
|
|
|
34
45
|
|
|
35
46
|
**CompassKit** is a powerful collection of evaluation toolkits specifically tailored for Large Language Models and Large Vision-language Models. It provides an extensive set of tools to assess and measure the performance of these complex models effectively. You are welcome to try our toolkits in your research and products.
|
|
36
47
|
|
|
48
|
+
<details>
|
|
49
|
+
<summary><kbd>Star History</kbd></summary>
|
|
50
|
+
<picture>
|
|
51
|
+
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&theme=dark&type=Date">
|
|
52
|
+
<img width="100%" src="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&type=Date">
|
|
53
|
+
</picture>
|
|
54
|
+
</details>
|
|
55
|
+
|
|
37
56
|
## 🧭 Welcome
|
|
38
57
|
|
|
39
58
|
to **OpenCompass**!
|
|
@@ -51,12 +70,14 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
|
|
|
51
70
|
|
|
52
71
|
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
|
|
53
72
|
|
|
54
|
-
- **\[2024.
|
|
55
|
-
- **\[2024.
|
|
56
|
-
- **\[2024.
|
|
57
|
-
- **\[2024.
|
|
58
|
-
- **\[
|
|
59
|
-
- **\[
|
|
73
|
+
- **\[2024.05.08\]** We supported the evaluation of 4 MoE models: [Mixtral-8x22B-v0.1](configs/models/mixtral/hf_mixtral_8x22b_v0_1.py), [Mixtral-8x22B-Instruct-v0.1](configs/models/mixtral/hf_mixtral_8x22b_instruct_v0_1.py), [Qwen1.5-MoE-A2.7B](configs/models/qwen/hf_qwen1_5_moe_a2_7b.py), [Qwen1.5-MoE-A2.7B-Chat](configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py). Try them out now!
|
|
74
|
+
- **\[2024.04.30\]** We supported evaluating a model's compression efficiency by calculating its Bits per Character (BPC) metric on an [external corpora](configs/datasets/llm_compression/README.md) ([official paper](https://github.com/hkust-nlp/llm-compression-intelligence)). Check out the [llm-compression](configs/eval_llm_compression.py) evaluation config now! 🔥🔥🔥
|
|
75
|
+
- **\[2024.04.29\]** We report the performance of several famous LLMs on the common benchmarks, welcome to [documentation](https://opencompass.readthedocs.io/en/latest/user_guides/corebench.html) for more information! 🔥🔥🔥.
|
|
76
|
+
- **\[2024.04.26\]** We deprecated the multi-modality evaluation function in OpenCompass; the related implementation has moved to [VLMEvalKit](https://github.com/open-compass/VLMEvalKit), welcome to use it! 🔥🔥🔥.
|
|
77
|
+
- **\[2024.04.26\]** We supported the evaluation of [ArenaHard](configs/eval_subjective_arena_hard.py) welcome to try!🔥🔥🔥.
|
|
78
|
+
- **\[2024.04.22\]** We supported the evaluation of [LLaMA3](configs/models/hf_llama/hf_llama3_8b.py) and [LLaMA3-Instruct](configs/models/hf_llama/hf_llama3_8b_instruct.py), welcome to try! 🔥🔥🔥
|
|
79
|
+
- **\[2024.02.29\]** We supported the MT-Bench, AlpacaEval and AlignBench, more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/subjective_evaluation.html)
|
|
80
|
+
- **\[2024.01.30\]** We release OpenCompass 2.0. Click [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home) for more information !
|
|
60
81
|
|
|
61
82
|
> [More](docs/en/notes/news.md)
|
|
62
83
|
|
|
@@ -106,7 +127,7 @@ conda activate opencompass
|
|
|
106
127
|
git clone https://github.com/open-compass/opencompass opencompass
|
|
107
128
|
cd opencompass
|
|
108
129
|
pip install -e .
|
|
109
|
-
# also please install
|
|
130
|
+
# also please install requirements packages via `pip install -r requirements/api.txt` for API models if needed.
|
|
110
131
|
```
|
|
111
132
|
|
|
112
133
|
### 📂 Data Preparation
|
|
@@ -141,19 +162,13 @@ python tools/list_configs.py llama mmlu
|
|
|
141
162
|
You can also evaluate other HuggingFace models via command line. Taking LLaMA-7b as an example:
|
|
142
163
|
|
|
143
164
|
```bash
|
|
144
|
-
python run.py --datasets ceval_ppl mmlu_ppl
|
|
145
|
-
--hf-path huggyllama/llama-7b \ # HuggingFace model path
|
|
146
|
-
--model-kwargs device_map='auto' \ # Arguments for model construction
|
|
147
|
-
--tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \ # Arguments for tokenizer construction
|
|
148
|
-
--max-out-len 100 \ # Maximum number of tokens generated
|
|
149
|
-
--max-seq-len 2048 \ # Maximum sequence length the model can accept
|
|
150
|
-
--batch-size 8 \ # Batch size
|
|
151
|
-
--no-batch-padding \ # Don't enable batch padding, infer through for loop to avoid performance loss
|
|
152
|
-
--num-gpus 1 # Number of minimum required GPUs
|
|
165
|
+
python run.py --datasets ceval_ppl mmlu_ppl --hf-type base --hf-path huggyllama/llama-7b
|
|
153
166
|
```
|
|
154
167
|
|
|
155
|
-
>
|
|
156
|
-
>
|
|
168
|
+
> \[!TIP\]
|
|
169
|
+
>
|
|
170
|
+
> configuration with `_ppl` is designed for base model typically.
|
|
171
|
+
> configuration with `_gen` can be used for both base model and chat model.
|
|
157
172
|
|
|
158
173
|
Through the command line or configuration files, OpenCompass also supports evaluating APIs or custom models, as well as more diversified evaluation strategies. Please read the [Quick Start](https://opencompass.readthedocs.io/en/latest/get_started/quick_start.html) to learn how to run an evaluation task.
|
|
159
174
|
|
|
@@ -439,6 +454,7 @@ Through the command line or configuration files, OpenCompass also supports evalu
|
|
|
439
454
|
|
|
440
455
|
- [InternLM](https://github.com/InternLM/InternLM)
|
|
441
456
|
- [LLaMA](https://github.com/facebookresearch/llama)
|
|
457
|
+
- [LLaMA3](https://github.com/meta-llama/llama3)
|
|
442
458
|
- [Vicuna](https://github.com/lm-sys/FastChat)
|
|
443
459
|
- [Alpaca](https://github.com/tatsu-lab/stanford_alpaca)
|
|
444
460
|
- [Baichuan](https://github.com/baichuan-inc)
|
|
@@ -497,6 +513,20 @@ Through the command line or configuration files, OpenCompass also supports evalu
|
|
|
497
513
|
|
|
498
514
|
We appreciate all contributions to improving OpenCompass. Please refer to the [contributing guideline](https://opencompass.readthedocs.io/en/latest/notes/contribution_guide.html) for the best practice.
|
|
499
515
|
|
|
516
|
+
<!-- Copy-paste in your Readme.md file -->
|
|
517
|
+
|
|
518
|
+
<!-- Made with [OSS Insight](https://ossinsight.io/) -->
|
|
519
|
+
|
|
520
|
+
<a href="https://github.com/open-compass/opencompass/graphs/contributors" target="_blank">
|
|
521
|
+
<table>
|
|
522
|
+
<tr>
|
|
523
|
+
<th colspan="2">
|
|
524
|
+
<br><img src="https://contrib.rocks/image?repo=open-compass/opencompass"><br><br>
|
|
525
|
+
</th>
|
|
526
|
+
</tr>
|
|
527
|
+
</table>
|
|
528
|
+
</a>
|
|
529
|
+
|
|
500
530
|
## 🤝 Acknowledgements
|
|
501
531
|
|
|
502
532
|
Some code in this project is cited and modified from [OpenICL](https://github.com/Shark-NLP/OpenICL).
|
|
@@ -515,3 +545,20 @@ Some datasets and prompt implementations are modified from [chain-of-thought-hub
|
|
|
515
545
|
```
|
|
516
546
|
|
|
517
547
|
<p align="right"><a href="#top">🔝Back to top</a></p>
|
|
548
|
+
|
|
549
|
+
[github-contributors-link]: https://github.com/open-compass/opencompass/graphs/contributors
|
|
550
|
+
[github-contributors-shield]: https://img.shields.io/github/contributors/open-compass/opencompass?color=c4f042&labelColor=black&style=flat-square
|
|
551
|
+
[github-forks-link]: https://github.com/open-compass/opencompass/network/members
|
|
552
|
+
[github-forks-shield]: https://img.shields.io/github/forks/open-compass/opencompass?color=8ae8ff&labelColor=black&style=flat-square
|
|
553
|
+
[github-issues-link]: https://github.com/open-compass/opencompass/issues
|
|
554
|
+
[github-issues-shield]: https://img.shields.io/github/issues/open-compass/opencompass?color=ff80eb&labelColor=black&style=flat-square
|
|
555
|
+
[github-license-link]: https://github.com/open-compass/opencompass/blob/main/LICENSE
|
|
556
|
+
[github-license-shield]: https://img.shields.io/github/license/open-compass/opencompass?color=white&labelColor=black&style=flat-square
|
|
557
|
+
[github-release-link]: https://github.com/open-compass/opencompass/releases
|
|
558
|
+
[github-release-shield]: https://img.shields.io/github/v/release/open-compass/opencompass?color=369eff&labelColor=black&logo=github&style=flat-square
|
|
559
|
+
[github-releasedate-link]: https://github.com/open-compass/opencompass/releases
|
|
560
|
+
[github-releasedate-shield]: https://img.shields.io/github/release-date/open-compass/opencompass?labelColor=black&style=flat-square
|
|
561
|
+
[github-stars-link]: https://github.com/open-compass/opencompass/stargazers
|
|
562
|
+
[github-stars-shield]: https://img.shields.io/github/stars/open-compass/opencompass?color=ffcb47&labelColor=black&style=flat-square
|
|
563
|
+
[github-trending-shield]: https://trendshift.io/api/badge/repositories/6630
|
|
564
|
+
[github-trending-url]: https://trendshift.io/repositories/6630
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = '0.2.5'
|
|
@@ -91,34 +91,51 @@ class GaokaoBenchEvaluator(BaseEvaluator):
|
|
|
91
91
|
]:
|
|
92
92
|
return {'score': 0}
|
|
93
93
|
elif self.question_type == 'multi_choice':
|
|
94
|
+
details = {}
|
|
94
95
|
correct_score, total_score = 0, 0
|
|
95
|
-
for pred, refr in zip(predictions, references):
|
|
96
|
+
for index, (pred, refr) in enumerate(zip(predictions, references)):
|
|
96
97
|
pred = self.do_predictions_postprocess(pred)
|
|
97
98
|
pred = self.ensure_same_length(pred, refr)
|
|
99
|
+
is_corrects = []
|
|
98
100
|
for p, r in zip(pred, refr):
|
|
99
101
|
if p == r:
|
|
100
102
|
correct_score += 2
|
|
103
|
+
is_corrects.append(True)
|
|
101
104
|
else:
|
|
102
105
|
for i in p:
|
|
103
106
|
if i not in r:
|
|
104
107
|
break
|
|
105
108
|
else:
|
|
106
109
|
correct_score += 1
|
|
110
|
+
is_corrects.append(False)
|
|
107
111
|
total_score += 2
|
|
108
|
-
|
|
112
|
+
details[str(index)] = {
|
|
113
|
+
'pred': pred,
|
|
114
|
+
'refr': refr,
|
|
115
|
+
'is_correct': all(is_corrects),
|
|
116
|
+
}
|
|
117
|
+
|
|
109
118
|
else:
|
|
119
|
+
details = {}
|
|
110
120
|
correct_score, total_score = 0, 0
|
|
111
|
-
for pred, refr in zip(predictions, references):
|
|
121
|
+
for index, (pred, refr) in enumerate(zip(predictions, references)):
|
|
112
122
|
if self.question_type == 'multi_question_choice':
|
|
113
123
|
pred = self.do_predictions_postprocess(pred, len(refr))
|
|
114
124
|
else:
|
|
115
125
|
pred = self.do_predictions_postprocess(pred)
|
|
116
126
|
pred = self.ensure_same_length(pred, refr)
|
|
127
|
+
is_corrects = []
|
|
117
128
|
for p, r in zip(pred, refr):
|
|
118
|
-
|
|
119
|
-
|
|
129
|
+
is_correct = p == r
|
|
130
|
+
correct_score += is_correct
|
|
120
131
|
total_score += 1
|
|
121
|
-
|
|
132
|
+
is_corrects.append(is_correct)
|
|
133
|
+
details[str(index)] = {
|
|
134
|
+
'pred': pred,
|
|
135
|
+
'refr': refr,
|
|
136
|
+
'is_correct': all(is_corrects),
|
|
137
|
+
}
|
|
138
|
+
return {'score': correct_score / total_score * 100, 'details': details}
|
|
122
139
|
|
|
123
140
|
|
|
124
141
|
for question_type in valid_gaokao_bench_question_types:
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import os.path as osp
|
|
3
|
+
|
|
4
|
+
from datasets import Dataset, DatasetDict
|
|
5
|
+
|
|
6
|
+
from opencompass.registry import LOAD_DATASET
|
|
7
|
+
|
|
8
|
+
from .base import BaseDataset
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@LOAD_DATASET.register_module()
class MMLUArabicDataset(BaseDataset):
    """Loader that builds a ``DatasetDict`` from MMLU-Arabic CSV files."""

    @staticmethod
    def load(path: str, name: str):
        """Read the ``dev`` and ``test`` CSV files for ``name``.

        Args:
            path: Root directory containing the ``dev`` and ``test``
                sub-directories.
            name: File-name prefix; each split is read from
                ``<path>/<split>/<name>_<split>.csv``.

        Returns:
            A ``DatasetDict`` with the splits ``dev`` and ``test``. Every
            row has the keys ``input``, ``A``, ``B``, ``C``, ``D`` and
            ``target``, taken from the six CSV columns in that order.

        Raises:
            AssertionError: If a CSV row does not have exactly six columns.
        """
        columns = ('input', 'A', 'B', 'C', 'D', 'target')
        dataset = DatasetDict()
        for split in ['dev', 'test']:
            filename = osp.join(path, split, f'{name}_{split}.csv')
            raw_data = []
            # newline='' hands line-ending handling to the csv module so
            # that quoted fields containing line breaks are parsed intact.
            with open(filename, encoding='utf-8', newline='') as f:
                for row in csv.reader(f):
                    assert len(row) == len(columns), (
                        f'{filename}: expected {len(columns)} columns, '
                        f'got {len(row)}')
                    raw_data.append(dict(zip(columns, row)))
            dataset[split] = Dataset.from_list(raw_data)
        return dataset
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
from datasets import Dataset
|
|
4
|
+
|
|
5
|
+
from opencompass.openicl.icl_evaluator import BaseEvaluator
|
|
6
|
+
from opencompass.registry import LOAD_DATASET
|
|
7
|
+
|
|
8
|
+
from .base import BaseDataset
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@LOAD_DATASET.register_module()
class QuALITYDataset(BaseDataset):
    """Loader that flattens a QuALITY JSON-lines file into one row per question."""

    @staticmethod
    def load(path: str):
        """Build a ``Dataset`` from the JSON-lines file at ``path``.

        Each line is parsed as a JSON object with an ``article`` and a list
        of ``questions``. One output row is produced per question, holding
        the article, the question text, the first four options as ``A``-``D``,
        the gold label mapped from a 1-based index to a letter, and the
        ``difficult`` flag.
        """
        letters = 'ABCD'
        records = []
        with open(path, 'r', encoding='utf-8') as fin:
            for raw in fin:
                entry = json.loads(raw)
                for item in entry['questions']:
                    record = {
                        'article': entry['article'],
                        'question': item['question'],
                    }
                    for position, letter in enumerate(letters):
                        record[letter] = item['options'][position]
                    # gold_label is 1-based in the file; shift to 0-based.
                    record['gold_label'] = letters[item['gold_label'] - 1]
                    record['difficult'] = item['difficult']
                    records.append(record)
        return Dataset.from_list(records)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class QuALITYEvaluator(BaseEvaluator):
    """Accuracy evaluator reporting easy, hard and overall scores."""

    def score(self, predictions, references, test_set):
        """Compute exact-match accuracy split by the ``difficult`` flag.

        Args:
            predictions: Predicted answers, one per sample.
            references: Gold answers, aligned with ``predictions``.
            test_set: Iterable of samples aligned with ``predictions``;
                each sample's ``difficult`` field selects the bucket
                (``0`` means easy, anything else means hard).

        Returns:
            A dict with ``easy_acc``, ``hard_acc`` and ``all_acc`` as
            percentages. A bucket with no samples scores ``0.0``.

        Raises:
            AssertionError: If ``predictions`` and ``references`` differ
                in length.
        """
        assert len(predictions) == len(references)

        def accuracy(flags):
            # Guard against empty buckets (e.g. a subset with no hard
            # questions), which would otherwise raise ZeroDivisionError.
            return sum(flags) / len(flags) * 100 if flags else 0.0

        easy, hard, overall = [], [], []
        for pred, refer, test in zip(predictions, references, test_set):
            answer = bool(pred == refer)
            overall.append(answer)
            if test['difficult'] == 0:
                easy.append(answer)
            else:
                hard.append(answer)

        # hard_acc is normalised by the number of hard samples; it was
        # previously divided by the easy count.
        return dict(easy_acc=accuracy(easy),
                    hard_acc=accuracy(hard),
                    all_acc=accuracy(overall))
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import json
|
|
3
|
+
|
|
4
|
+
from datasets import Dataset, DatasetDict
|
|
5
|
+
|
|
6
|
+
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS, ICL_EVALUATORS
|
|
7
|
+
|
|
8
|
+
from opencompass.openicl.icl_evaluator import BaseEvaluator
|
|
9
|
+
from ..base import BaseDataset
|
|
10
|
+
from . import utils
|
|
11
|
+
from tqdm import tqdm
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@LOAD_DATASET.register_module()
class TheoremQADatasetV3(BaseDataset):
    """Loader for a TheoremQA JSON file."""

    @staticmethod
    def load(path: str):
        """Load the JSON list at ``path`` into a ``Dataset``.

        Every item's ``Answer`` field is converted to ``str`` before the
        dataset is built, so the column has one uniform type.

        Args:
            path: Path to a JSON file containing a list of items.

        Returns:
            A ``Dataset`` built from the loaded items.
        """
        # Read as UTF-8 explicitly instead of relying on the platform's
        # default encoding, matching the other dataset loaders.
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for item in data:
            item['Answer'] = str(item['Answer'])
        return Dataset.from_list(data)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def TheoremQA_postprocess_v3(text: str) -> str:
    """Clean ``text`` via ``utils.answer_clean`` using the answer-trigger phrases."""
    triggers = ["The answer is:", "The answer is", "the answer is"]
    return utils.answer_clean(triggers, text)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@ICL_EVALUATORS.register_module()
class TheoremQAEvaluatorV3(BaseEvaluator):
    """Evaluator that scores TheoremQA predictions against references."""

    def score(self, predictions, references, test_set):
        """Compare each prediction with its reference and report accuracy.

        Args:
            predictions: Predicted answers, one per sample.
            references: Reference answer strings aligned with
                ``predictions``.
            test_set: Indexable collection of samples; the ``Answer_type``
                field of each sample decides whether the reference is also
                evaluated to a Python value.

        Returns:
            ``{'error': ...}`` when the inputs differ in length; otherwise
            a dict with ``score`` (percentage of correct answers, ``0.0``
            for empty input) and ``details`` (one record per sample).
        """
        if len(predictions) != len(references):
            return {"error": "preds and refrs have different length"}

        details = []
        correct = 0
        pairs = list(zip(predictions, references))
        for index, (answer, reference) in enumerate(tqdm(pairs)):
            answer_type = test_set[index]['Answer_type']
            if answer_type in ['float', 'integer', 'bool']:
                # NOTE(security): eval() executes the reference string as
                # Python code. Only use dataset files from a trusted source.
                groundtruth = [reference, eval(reference)]
            else:
                groundtruth = [reference, None]

            is_correct = bool(
                utils.compare_answer_with_groundtruth(answer, *groundtruth))
            correct += is_correct

            details.append({
                "correct": groundtruth,
                "pred": answer,
                "is_correct": is_correct,
            })

        # Avoid ZeroDivisionError when there is nothing to evaluate.
        total = len(details)
        score = correct / total * 100 if total else 0.0
        return {'score': score, 'details': details}
|