opencompass 0.2.3__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {opencompass-0.2.3 → opencompass-0.2.4}/PKG-INFO +57 -9
- {opencompass-0.2.3 → opencompass-0.2.4}/README.md +56 -8
- opencompass-0.2.4/opencompass/__init__.py +1 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/cmp_GCP_D.py +5 -1
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/cmp_TSP_D.py +5 -1
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/p_SPP.py +5 -1
- opencompass-0.2.4/opencompass/datasets/QuALITY.py +59 -0
- opencompass-0.2.4/opencompass/datasets/TheoremQA/__init__.py +4 -0
- opencompass-0.2.3/opencompass/datasets/TheoremQA.py → opencompass-0.2.4/opencompass/datasets/TheoremQA/legacy.py +1 -1
- opencompass-0.2.4/opencompass/datasets/TheoremQA/main.py +66 -0
- opencompass-0.2.4/opencompass/datasets/TheoremQA/number_utils.py +98 -0
- opencompass-0.2.4/opencompass/datasets/TheoremQA/utils.py +110 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/__init__.py +4 -0
- opencompass-0.2.4/opencompass/datasets/apps.py +877 -0
- opencompass-0.2.4/opencompass/datasets/chembench.py +34 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/custom.py +10 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/humanevalx.py +4 -1
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/math.py +19 -6
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/mathbench.py +1 -1
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/mbpp.py +41 -36
- opencompass-0.2.4/opencompass/datasets/taco.py +823 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/__init__.py +6 -1
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/ai360_api.py +4 -4
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/baichuan_api.py +128 -4
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/baidu_api.py +4 -4
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/base.py +2 -2
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/base_api.py +4 -4
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/bytedance_api.py +4 -4
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/claude_api/claude_api.py +4 -4
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/gemini_api.py +6 -6
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/huggingface.py +1 -1
- opencompass-0.2.4/opencompass/models/hunyuan_api.py +121 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/llama2.py +1 -1
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/lmdeploy_pytorch.py +2 -2
- opencompass-0.2.4/opencompass/models/lmdeploy_tis.py +200 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/minimax_api.py +4 -4
- opencompass-0.2.4/opencompass/models/mistral_api.py +123 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/moonshot_api.py +24 -26
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/nanbeige_api.py +4 -4
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/openai_api.py +41 -17
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/pangu_api.py +4 -4
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/qwen_api.py +28 -14
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/sensetime_api.py +14 -9
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/turbomind.py +36 -8
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/turbomind_api.py +4 -4
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/turbomind_tis.py +10 -4
- opencompass-0.2.4/opencompass/models/unigpt_api.py +147 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/vllm.py +6 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/xunfei_api.py +4 -4
- opencompass-0.2.4/opencompass/models/yayi_api.py +261 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/zhipuai_api.py +4 -4
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/zhipuai_v2_api.py +12 -6
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/lm_evaluator.py +57 -23
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_chat_inferencer.py +2 -14
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_prompt_template.py +4 -4
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/partitioners/base.py +18 -7
- opencompass-0.2.4/opencompass/partitioners/sub_naive.py +220 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/partitioners/sub_size.py +29 -6
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/registry.py +15 -1
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/runners/base.py +2 -1
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/runners/dlc.py +37 -10
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/runners/local.py +21 -7
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/summarizers/default.py +1 -1
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/summarizers/needlebench.py +234 -173
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/summarizers/subjective/alignmentbench.py +43 -33
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/summarizers/subjective/alpacaeval.py +2 -1
- opencompass-0.2.4/opencompass/summarizers/subjective/compass_arena.py +240 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/summarizers/subjective/mtbench.py +55 -48
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/summarizers/subjective/multiround.py +2 -1
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/tasks/openicl_eval.py +3 -2
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/tasks/openicl_infer.py +12 -5
- opencompass-0.2.4/opencompass/tasks/subjective_eval.py +438 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/utils/abbr.py +22 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/utils/build.py +1 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/utils/prompt.py +5 -5
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/utils/run.py +140 -12
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/utils/text_postprocessors.py +5 -5
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass.egg-info/PKG-INFO +57 -9
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass.egg-info/SOURCES.txt +15 -1
- opencompass-0.2.4/opencompass.egg-info/entry_points.txt +3 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass.egg-info/requires.txt +2 -1
- {opencompass-0.2.3 → opencompass-0.2.4}/setup.py +40 -33
- opencompass-0.2.3/opencompass/__init__.py +0 -1
- opencompass-0.2.3/opencompass/partitioners/sub_naive.py +0 -110
- opencompass-0.2.3/opencompass/summarizers/subjective/compass_arena.py +0 -204
- opencompass-0.2.3/opencompass/tasks/subjective_eval.py +0 -282
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/FinanceIQ.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/GaokaoBench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/cmp_KSP.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/hard_GCP.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/hard_MSP.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/hard_TSP.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/p_BSP.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/p_EDP.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/prompts.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/NPHardEval/utils.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/OpenFinData.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/advglue.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/afqmcd.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/agieval/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/agieval/agieval.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/agieval/constructions.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/agieval/dataset_loader.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/agieval/evaluation.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/agieval/math_equivalence.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/agieval/post_process.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/agieval/utils.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/anli.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/anthropics_evals.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/arc.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/ax.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/base.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/bbh.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/boolq.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/bustum.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/c3.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/cb.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/ceval.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/chid.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/cibench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/circular.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/civilcomments.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/clozeTest_maxmin.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/cluewsc.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/cmb.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/cmmlu.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/cmnli.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/cmrc.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/commonsenseqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/commonsenseqa_cn.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/copa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/crowspairs.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/crowspairs_cn.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/csl.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/cvalues.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/drcd.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/drop.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/ds1000.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/ds1000_interpreter.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/eprstmt.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/flores.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/game24.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/govrepcrs.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/gpqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/gsm8k.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/gsm_hard.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/hellaswag.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/huggingface.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/humaneval.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/humaneval_multi.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/hungarian_math.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/infinitebench/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_codedebug.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_coderun.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_endia.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_enmc.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_enqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_ensum.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_mathcalc.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_mathfind.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_retrievekv.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_retrievenumber.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_retrievepasskey.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/infinitebench/infinitebench_zhqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/infinitebench/utils.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/iwslt2017.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/jigsawmultilingual.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/jsonl.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/kaoshi.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/lambada.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/lawbench/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/lawbench/lawbench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/lcsts.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/leval/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/leval/evaluators.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/leval/leval_coursera.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/leval/leval_financial_qa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/leval/leval_gov_report_summ.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/leval/leval_gsm100.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/leval/leval_legal_contract_qa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/leval/leval_meeting_summ.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/leval/leval_multidoc_qa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/leval/leval_narrattive_qa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/leval/leval_natural_question.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/leval/leval_news_summ.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/leval/leval_paper_assistant.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/leval/leval_patent_summ.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/leval/leval_quality.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/leval/leval_review_summ.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/leval/leval_scientific_qa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/leval/leval_topic_retrieval.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/leval/leval_tpo.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/leval/leval_tvshow_summ.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/lmeval.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/evaluators.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_2wikim_qa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_dureader.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_gov_report.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_hotpot_qa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_lcc.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_lsht.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_multi_news.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_multifieldqa_en.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_multifieldqa_zh.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_musique.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_narrative_qa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_passage_count.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_passage_retrieval_en.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_passage_retrieval_zh.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_qasper.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_qmsum.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_repobench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_samsum.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_trec.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_trivia_qa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/longbench/longbench_vcsum.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/lveval/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/lveval/evaluators.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/lveval/lveval_cmrc_mixup.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/lveval/lveval_dureader_mixup.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/lveval/lveval_factrecall_en.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/lveval/lveval_factrecall_zh.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/lveval/lveval_hotpotwikiqa_mixup.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/lveval/lveval_lic_mixup.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/lveval/lveval_loogle_CR_mixup.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/lveval/lveval_loogle_MIR_mixup.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/lveval/lveval_loogle_SD_mixup.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/lveval/lveval_multifieldqa_en_mixup.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/lveval/lveval_multifieldqa_zh_mixup.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/mastermath2024v1.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/math401.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/math_intern.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/medbench/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/medbench/constructions.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/medbench/dataset_loader.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/medbench/evaluation.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/medbench/math_equivalence.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/medbench/medbench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/medbench/post_process.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/medbench/utils.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/mmlu.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/multirc.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/narrativeqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/natural_question.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/natural_question_cn.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/obqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/piqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/py150.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/qasper.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/qaspercut.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/race.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/realtoxicprompts.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/reasonbench/ReasonBenchDataset.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/reasonbench/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/record.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/rolebench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/safety.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/scibench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/siqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/squad20.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/storycloze.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/strategyqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/subjective/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/subjective/alignbench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/subjective/compass_arena.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/subjective/corev2.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/subjective/creationbench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/subjective/information_retrival.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/subjective/mtbench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/subjective/multiround.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/subjective/subjective_cmp.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/summedits.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/summscreen.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/svamp.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/tabmwp.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/teval/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/teval/evaluators/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/teval/evaluators/instruct_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/teval/evaluators/planning_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/teval/evaluators/reason_retrieve_understand_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/teval/evaluators/review_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/teval/schema.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/teval/utils/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/teval/utils/convert_results.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/teval/utils/format_load.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/teval/utils/meta_template.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/teval/utils/template.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/tnews.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/triviaqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/triviaqarc.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/truthfulqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/tydiqa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/wic.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/wikibench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/winograd.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/winogrande.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/wnli.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/wsc.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/xcopa.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/xiezhi.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/xlsum.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/datasets/xsum.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/metrics/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/metrics/dump_results.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/metrics/mme_score.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/metrics/seedbench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/accessory.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/alaya.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/claude_api/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/claude_api/postprocessors.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/glm.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/intern_model.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/krgpt_api.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/lagent.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/langchain.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/lightllm_api.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/mixtral.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/models/modelscope.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_dataset_reader.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/icl_agent_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/icl_aucroc_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/icl_base_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/icl_circular_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/icl_em_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/icl_jieba_rouge_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/icl_misc_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/icl_plugin_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_evaluator/icl_toxic_evaluator.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_agent_inferencer.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_attack_inferencer.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_base_inferencer.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_clp_inferencer.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_gen_inferencer.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_ll_inferencer.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_mink_percent_inferencer.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_ppl_inferencer.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_ppl_only_inferencer.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_sc_inferencer.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_inferencer/icl_tot_inferencer.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_retriever/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_retriever/icl_base_retriever.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_retriever/icl_bm25_retriever.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_retriever/icl_dpp_retriever.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_retriever/icl_fix_k_retriever.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_retriever/icl_mdl_retriever.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_retriever/icl_random_retriever.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_retriever/icl_topk_retriever.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_retriever/icl_votek_retriever.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/icl_retriever/icl_zero_retriever.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/utils/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/openicl/utils/logging.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/partitioners/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/partitioners/mm_naive.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/partitioners/naive.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/partitioners/num_worker.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/partitioners/size.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/runners/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/runners/local_api.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/runners/slurm.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/runners/slurm_sequential.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/summarizers/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/summarizers/circular.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/summarizers/multi_model.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/summarizers/subjective/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/summarizers/subjective/corev2.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/summarizers/subjective/creationbench.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/summarizers/subjective/information_retrival.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/summarizers/subjective/subjective_post_process.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/summarizers/subjective/utils.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/summarizers/summarizer_pretrain.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/tasks/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/tasks/base.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/tasks/llm_eval.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/tasks/mm_infer.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/tasks/openicl_attack.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/utils/__init__.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/utils/auxiliary.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/utils/collect_env.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/utils/dependency.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/utils/file.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/utils/fileio.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/utils/lark.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/utils/logging.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/utils/menu.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass/utils/types.py +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass.egg-info/dependency_links.txt +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/opencompass.egg-info/top_level.txt +0 -0
- {opencompass-0.2.3 → opencompass-0.2.4}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: opencompass
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: A comprehensive toolkit for large model evaluation
|
|
5
5
|
Home-page: https://github.com/open-compass/opencompass
|
|
6
6
|
Author: OpenCompass Contributors
|
|
@@ -11,8 +11,13 @@ Description: <div align="center">
|
|
|
11
11
|
<br />
|
|
12
12
|
<br />
|
|
13
13
|
|
|
14
|
-
[![
|
|
15
|
-
[![
|
|
14
|
+
[![][github-release-shield]][github-release-link]
|
|
15
|
+
[![][github-releasedate-shield]][github-releasedate-link]
|
|
16
|
+
[![][github-contributors-shield]][github-contributors-link]<br>
|
|
17
|
+
[![][github-forks-shield]][github-forks-link]
|
|
18
|
+
[![][github-stars-shield]][github-stars-link]
|
|
19
|
+
[![][github-issues-shield]][github-issues-link]
|
|
20
|
+
[![][github-license-shield]][github-license-link]
|
|
16
21
|
|
|
17
22
|
<!-- [](https://pypi.org/project/opencompass/) -->
|
|
18
23
|
|
|
@@ -25,12 +30,18 @@ Description: <div align="center">
|
|
|
25
30
|
|
|
26
31
|
English | [简体中文](README_zh-CN.md)
|
|
27
32
|
|
|
33
|
+
[![][github-trending-shield]][github-trending-url]
|
|
34
|
+
|
|
28
35
|
</div>
|
|
29
36
|
|
|
30
37
|
<p align="center">
|
|
31
38
|
👋 join us on <a href="https://discord.gg/KKwfEbFj7U" target="_blank">Discord</a> and <a href="https://r.vansin.top/?r=opencompass" target="_blank">WeChat</a>
|
|
32
39
|
</p>
|
|
33
40
|
|
|
41
|
+
> \[!IMPORTANT\]
|
|
42
|
+
>
|
|
43
|
+
> **Star Us**, You will receive all release notifications from GitHub without any delay ~ ⭐️
|
|
44
|
+
|
|
34
45
|
## 📣 OpenCompass 2.0
|
|
35
46
|
|
|
36
47
|
We are thrilled to introduce OpenCompass 2.0, an advanced suite featuring three key components: [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home).
|
|
@@ -42,6 +53,14 @@ Description: <div align="center">
|
|
|
42
53
|
|
|
43
54
|
**CompassKit** is a powerful collection of evaluation toolkits specifically tailored for Large Language Models and Large Vision-language Models. It provides an extensive set of tools to assess and measure the performance of these complex models effectively. Welcome to try our toolkits in your research and products.
|
|
44
55
|
|
|
56
|
+
<details>
|
|
57
|
+
<summary><kbd>Star History</kbd></summary>
|
|
58
|
+
<picture>
|
|
59
|
+
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&theme=dark&type=Date">
|
|
60
|
+
<img width="100%" src="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&type=Date">
|
|
61
|
+
</picture>
|
|
62
|
+
</details>
|
|
63
|
+
|
|
45
64
|
## 🧭 Welcome
|
|
46
65
|
|
|
47
66
|
to **OpenCompass**!
|
|
@@ -59,12 +78,9 @@ Description: <div align="center">
|
|
|
59
78
|
|
|
60
79
|
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
|
|
61
80
|
|
|
62
|
-
- **\[2024.
|
|
63
|
-
- **\[2024.
|
|
64
|
-
- **\[2024.01.
|
|
65
|
-
- **\[2024.01.17\]** We supported the needle in a haystack test with multiple needles, more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/needleinahaystack_eval.html#id8) 🔥🔥🔥.
|
|
66
|
-
- **\[2023.12.28\]** We have enabled seamless evaluation of all models developed using [LLaMA2-Accessory](https://github.com/Alpha-VLLM/LLaMA2-Accessory), a powerful toolkit for comprehensive LLM development.
|
|
67
|
-
- **\[2023.12.22\]** We have released [T-Eval](https://github.com/open-compass/T-Eval), a step-by-step evaluation benchmark to gauge your LLMs on tool utilization. Welcome to our [Leaderboard](https://open-compass.github.io/T-Eval/leaderboard.html) for more details!
|
|
81
|
+
- **\[2024.04.22\]** We supported the evaluation of [LLaMA3](configs/models/hf_llama/hf_llama3_8b.py) and [LLaMA3-Instruct](configs/models/hf_llama/hf_llama3_8b_instruct.py), welcome to try! 🔥🔥🔥
|
|
82
|
+
- **\[2024.02.29\]** We supported the MT-Bench, AlpacaEval and AlignBench, more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/subjective_evaluation.html)
|
|
83
|
+
- **\[2024.01.30\]** We release OpenCompass 2.0. Click [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home) for more information !
|
|
68
84
|
|
|
69
85
|
> [More](docs/en/notes/news.md)
|
|
70
86
|
|
|
@@ -447,6 +463,7 @@ Description: <div align="center">
|
|
|
447
463
|
|
|
448
464
|
- [InternLM](https://github.com/InternLM/InternLM)
|
|
449
465
|
- [LLaMA](https://github.com/facebookresearch/llama)
|
|
466
|
+
- [LLaMA3](https://github.com/meta-llama/llama3)
|
|
450
467
|
- [Vicuna](https://github.com/lm-sys/FastChat)
|
|
451
468
|
- [Alpaca](https://github.com/tatsu-lab/stanford_alpaca)
|
|
452
469
|
- [Baichuan](https://github.com/baichuan-inc)
|
|
@@ -505,6 +522,20 @@ Description: <div align="center">
|
|
|
505
522
|
|
|
506
523
|
We appreciate all contributions to improving OpenCompass. Please refer to the [contributing guideline](https://opencompass.readthedocs.io/en/latest/notes/contribution_guide.html) for the best practice.
|
|
507
524
|
|
|
525
|
+
<!-- Copy-paste in your Readme.md file -->
|
|
526
|
+
|
|
527
|
+
<!-- Made with [OSS Insight](https://ossinsight.io/) -->
|
|
528
|
+
|
|
529
|
+
<a href="https://github.com/open-compass/opencompass/graphs/contributors" target="_blank">
|
|
530
|
+
<table>
|
|
531
|
+
<tr>
|
|
532
|
+
<th colspan="2">
|
|
533
|
+
<br><img src="https://contrib.rocks/image?repo=open-compass/opencompass"><br><br>
|
|
534
|
+
</th>
|
|
535
|
+
</tr>
|
|
536
|
+
</table>
|
|
537
|
+
</a>
|
|
538
|
+
|
|
508
539
|
## 🤝 Acknowledgements
|
|
509
540
|
|
|
510
541
|
Some code in this project is cited and modified from [OpenICL](https://github.com/Shark-NLP/OpenICL).
|
|
@@ -524,6 +555,23 @@ Description: <div align="center">
|
|
|
524
555
|
|
|
525
556
|
<p align="right"><a href="#top">🔝Back to top</a></p>
|
|
526
557
|
|
|
558
|
+
[github-contributors-link]: https://github.com/open-compass/opencompass/graphs/contributors
|
|
559
|
+
[github-contributors-shield]: https://img.shields.io/github/contributors/open-compass/opencompass?color=c4f042&labelColor=black&style=flat-square
|
|
560
|
+
[github-forks-link]: https://github.com/open-compass/opencompass/network/members
|
|
561
|
+
[github-forks-shield]: https://img.shields.io/github/forks/open-compass/opencompass?color=8ae8ff&labelColor=black&style=flat-square
|
|
562
|
+
[github-issues-link]: https://github.com/open-compass/opencompass/issues
|
|
563
|
+
[github-issues-shield]: https://img.shields.io/github/issues/open-compass/opencompass?color=ff80eb&labelColor=black&style=flat-square
|
|
564
|
+
[github-license-link]: https://github.com/open-compass/opencompass/blob/main/LICENSE
|
|
565
|
+
[github-license-shield]: https://img.shields.io/github/license/open-compass/opencompass?color=white&labelColor=black&style=flat-square
|
|
566
|
+
[github-release-link]: https://github.com/open-compass/opencompass/releases
|
|
567
|
+
[github-release-shield]: https://img.shields.io/github/v/release/open-compass/opencompass?color=369eff&labelColor=black&logo=github&style=flat-square
|
|
568
|
+
[github-releasedate-link]: https://github.com/open-compass/opencompass/releases
|
|
569
|
+
[github-releasedate-shield]: https://img.shields.io/github/release-date/open-compass/opencompass?labelColor=black&style=flat-square
|
|
570
|
+
[github-stars-link]: https://github.com/open-compass/opencompass/stargazers
|
|
571
|
+
[github-stars-shield]: https://img.shields.io/github/stars/open-compass/opencompass?color=ffcb47&labelColor=black&style=flat-square
|
|
572
|
+
[github-trending-shield]: https://trendshift.io/api/badge/repositories/6630
|
|
573
|
+
[github-trending-url]: https://trendshift.io/repositories/6630
|
|
574
|
+
|
|
527
575
|
Keywords: AI,NLP,in-context learning,large language model,evaluation,benchmark,llm
|
|
528
576
|
Platform: UNKNOWN
|
|
529
577
|
Classifier: Programming Language :: Python :: 3.8
|
|
@@ -3,8 +3,13 @@
|
|
|
3
3
|
<br />
|
|
4
4
|
<br />
|
|
5
5
|
|
|
6
|
-
[![
|
|
7
|
-
[![
|
|
6
|
+
[![][github-release-shield]][github-release-link]
|
|
7
|
+
[![][github-releasedate-shield]][github-releasedate-link]
|
|
8
|
+
[![][github-contributors-shield]][github-contributors-link]<br>
|
|
9
|
+
[![][github-forks-shield]][github-forks-link]
|
|
10
|
+
[![][github-stars-shield]][github-stars-link]
|
|
11
|
+
[![][github-issues-shield]][github-issues-link]
|
|
12
|
+
[![][github-license-shield]][github-license-link]
|
|
8
13
|
|
|
9
14
|
<!-- [](https://pypi.org/project/opencompass/) -->
|
|
10
15
|
|
|
@@ -17,12 +22,18 @@
|
|
|
17
22
|
|
|
18
23
|
English | [简体中文](README_zh-CN.md)
|
|
19
24
|
|
|
25
|
+
[![][github-trending-shield]][github-trending-url]
|
|
26
|
+
|
|
20
27
|
</div>
|
|
21
28
|
|
|
22
29
|
<p align="center">
|
|
23
30
|
👋 join us on <a href="https://discord.gg/KKwfEbFj7U" target="_blank">Discord</a> and <a href="https://r.vansin.top/?r=opencompass" target="_blank">WeChat</a>
|
|
24
31
|
</p>
|
|
25
32
|
|
|
33
|
+
> \[!IMPORTANT\]
|
|
34
|
+
>
|
|
35
|
+
> **Star Us**, You will receive all release notifications from GitHub without any delay ~ ⭐️
|
|
36
|
+
|
|
26
37
|
## 📣 OpenCompass 2.0
|
|
27
38
|
|
|
28
39
|
We are thrilled to introduce OpenCompass 2.0, an advanced suite featuring three key components: [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home).
|
|
@@ -34,6 +45,14 @@ We are thrilled to introduce OpenCompass 2.0, an advanced suite featuring three
|
|
|
34
45
|
|
|
35
46
|
**CompassKit** is a powerful collection of evaluation toolkits specifically tailored for Large Language Models and Large Vision-language Models. It provides an extensive set of tools to assess and measure the performance of these complex models effectively. Welcome to try our toolkits in your research and products.
|
|
36
47
|
|
|
48
|
+
<details>
|
|
49
|
+
<summary><kbd>Star History</kbd></summary>
|
|
50
|
+
<picture>
|
|
51
|
+
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&theme=dark&type=Date">
|
|
52
|
+
<img width="100%" src="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&type=Date">
|
|
53
|
+
</picture>
|
|
54
|
+
</details>
|
|
55
|
+
|
|
37
56
|
## 🧭 Welcome
|
|
38
57
|
|
|
39
58
|
to **OpenCompass**!
|
|
@@ -51,12 +70,9 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
|
|
|
51
70
|
|
|
52
71
|
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
|
|
53
72
|
|
|
54
|
-
- **\[2024.
|
|
55
|
-
- **\[2024.
|
|
56
|
-
- **\[2024.01.
|
|
57
|
-
- **\[2024.01.17\]** We supported the needle in a haystack test with multiple needles, more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/needleinahaystack_eval.html#id8) 🔥🔥🔥.
|
|
58
|
-
- **\[2023.12.28\]** We have enabled seamless evaluation of all models developed using [LLaMA2-Accessory](https://github.com/Alpha-VLLM/LLaMA2-Accessory), a powerful toolkit for comprehensive LLM development.
|
|
59
|
-
- **\[2023.12.22\]** We have released [T-Eval](https://github.com/open-compass/T-Eval), a step-by-step evaluation benchmark to gauge your LLMs on tool utilization. Welcome to our [Leaderboard](https://open-compass.github.io/T-Eval/leaderboard.html) for more details!
|
|
73
|
+
- **\[2024.04.22\]** We supported the evaluation of [LLaMA3](configs/models/hf_llama/hf_llama3_8b.py) and [LLaMA3-Instruct](configs/models/hf_llama/hf_llama3_8b_instruct.py), welcome to try! 🔥🔥🔥
|
|
74
|
+
- **\[2024.02.29\]** We supported the MT-Bench, AlpacaEval and AlignBench, more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/subjective_evaluation.html)
|
|
75
|
+
- **\[2024.01.30\]** We release OpenCompass 2.0. Click [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home) for more information !
|
|
60
76
|
|
|
61
77
|
> [More](docs/en/notes/news.md)
|
|
62
78
|
|
|
@@ -439,6 +455,7 @@ Through the command line or configuration files, OpenCompass also supports evalu
|
|
|
439
455
|
|
|
440
456
|
- [InternLM](https://github.com/InternLM/InternLM)
|
|
441
457
|
- [LLaMA](https://github.com/facebookresearch/llama)
|
|
458
|
+
- [LLaMA3](https://github.com/meta-llama/llama3)
|
|
442
459
|
- [Vicuna](https://github.com/lm-sys/FastChat)
|
|
443
460
|
- [Alpaca](https://github.com/tatsu-lab/stanford_alpaca)
|
|
444
461
|
- [Baichuan](https://github.com/baichuan-inc)
|
|
@@ -497,6 +514,20 @@ Through the command line or configuration files, OpenCompass also supports evalu
|
|
|
497
514
|
|
|
498
515
|
We appreciate all contributions to improving OpenCompass. Please refer to the [contributing guideline](https://opencompass.readthedocs.io/en/latest/notes/contribution_guide.html) for the best practice.
|
|
499
516
|
|
|
517
|
+
<!-- Copy-paste in your Readme.md file -->
|
|
518
|
+
|
|
519
|
+
<!-- Made with [OSS Insight](https://ossinsight.io/) -->
|
|
520
|
+
|
|
521
|
+
<a href="https://github.com/open-compass/opencompass/graphs/contributors" target="_blank">
|
|
522
|
+
<table>
|
|
523
|
+
<tr>
|
|
524
|
+
<th colspan="2">
|
|
525
|
+
<br><img src="https://contrib.rocks/image?repo=open-compass/opencompass"><br><br>
|
|
526
|
+
</th>
|
|
527
|
+
</tr>
|
|
528
|
+
</table>
|
|
529
|
+
</a>
|
|
530
|
+
|
|
500
531
|
## 🤝 Acknowledgements
|
|
501
532
|
|
|
502
533
|
Some code in this project is cited and modified from [OpenICL](https://github.com/Shark-NLP/OpenICL).
|
|
@@ -515,3 +546,20 @@ Some datasets and prompt implementations are modified from [chain-of-thought-hub
|
|
|
515
546
|
```
|
|
516
547
|
|
|
517
548
|
<p align="right"><a href="#top">🔝Back to top</a></p>
|
|
549
|
+
|
|
550
|
+
[github-contributors-link]: https://github.com/open-compass/opencompass/graphs/contributors
|
|
551
|
+
[github-contributors-shield]: https://img.shields.io/github/contributors/open-compass/opencompass?color=c4f042&labelColor=black&style=flat-square
|
|
552
|
+
[github-forks-link]: https://github.com/open-compass/opencompass/network/members
|
|
553
|
+
[github-forks-shield]: https://img.shields.io/github/forks/open-compass/opencompass?color=8ae8ff&labelColor=black&style=flat-square
|
|
554
|
+
[github-issues-link]: https://github.com/open-compass/opencompass/issues
|
|
555
|
+
[github-issues-shield]: https://img.shields.io/github/issues/open-compass/opencompass?color=ff80eb&labelColor=black&style=flat-square
|
|
556
|
+
[github-license-link]: https://github.com/open-compass/opencompass/blob/main/LICENSE
|
|
557
|
+
[github-license-shield]: https://img.shields.io/github/license/open-compass/opencompass?color=white&labelColor=black&style=flat-square
|
|
558
|
+
[github-release-link]: https://github.com/open-compass/opencompass/releases
|
|
559
|
+
[github-release-shield]: https://img.shields.io/github/v/release/open-compass/opencompass?color=369eff&labelColor=black&logo=github&style=flat-square
|
|
560
|
+
[github-releasedate-link]: https://github.com/open-compass/opencompass/releases
|
|
561
|
+
[github-releasedate-shield]: https://img.shields.io/github/release-date/open-compass/opencompass?labelColor=black&style=flat-square
|
|
562
|
+
[github-stars-link]: https://github.com/open-compass/opencompass/stargazers
|
|
563
|
+
[github-stars-shield]: https://img.shields.io/github/stars/open-compass/opencompass?color=ffcb47&labelColor=black&style=flat-square
|
|
564
|
+
[github-trending-shield]: https://trendshift.io/api/badge/repositories/6630
|
|
565
|
+
[github-trending-url]: https://trendshift.io/repositories/6630
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = '0.2.4'
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
from datasets import Dataset
|
|
4
|
+
|
|
5
|
+
from opencompass.openicl.icl_evaluator import BaseEvaluator
|
|
6
|
+
from opencompass.registry import LOAD_DATASET
|
|
7
|
+
|
|
8
|
+
from .base import BaseDataset
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@LOAD_DATASET.register_module()
class QuALITYDataset(BaseDataset):
    """Loader that flattens a QuALITY JSON-lines file into question rows."""

    @staticmethod
    def load(path: str):
        """Read ``path`` and emit one dataset row per question.

        Every line of the file is parsed as a JSON object carrying an
        ``article`` and a list of ``questions``. Each question becomes one
        row holding the article, the question text, its first four options
        under the keys ``A``-``D``, the gold label turned from a 1-based
        index into a letter, and the ``difficult`` flag.

        Args:
            path: Location of the JSON-lines file; it is read as UTF-8.

        Returns:
            A ``Dataset`` built from the flattened rows.
        """
        letters = 'ABCD'
        rows = []
        with open(path, 'r', encoding='utf-8') as f:
            for raw in f:
                entry = json.loads(raw)
                for item in entry['questions']:
                    rows.append(
                        dict(article=entry['article'],
                             question=item['question'],
                             A=item['options'][0],
                             B=item['options'][1],
                             C=item['options'][2],
                             D=item['options'][3],
                             # gold_label is 1-based in the file.
                             gold_label=letters[item['gold_label'] - 1],
                             difficult=item['difficult']))
        return Dataset.from_list(rows)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class QuALITYEvaluator(BaseEvaluator):
    """Accuracy evaluator that reports easy, hard and overall scores."""

    def score(self, predictions, references, test_set):
        """Compute exact-match accuracy split by question difficulty.

        Args:
            predictions: Predicted answers, one per sample.
            references: Gold answers, aligned with ``predictions``.
            test_set: Samples aligned with ``predictions``; each must expose
                a ``difficult`` field, where ``0`` marks an easy question.

        Returns:
            A dict with ``easy_acc``, ``hard_acc`` and ``all_acc`` as
            percentages. A bucket with no samples scores ``0.0``.

        Raises:
            AssertionError: If predictions and references differ in length.
        """
        assert len(predictions) == len(references)

        def accuracy(flags):
            # An empty bucket (e.g. a split without hard questions) must
            # not abort the whole evaluation with a ZeroDivisionError.
            return sum(flags) / len(flags) * 100 if flags else 0.0

        easy, hard, overall = [], [], []
        for pred, refer, test in zip(predictions, references, test_set):
            is_right = pred == refer
            overall.append(is_right)
            bucket = easy if test['difficult'] == 0 else hard
            bucket.append(is_right)

        # hard_acc is normalised by the number of hard questions (the
        # previous code divided by the number of easy ones).
        return dict(easy_acc=accuracy(easy),
                    hard_acc=accuracy(hard),
                    all_acc=accuracy(overall))
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import json
|
|
3
|
+
|
|
4
|
+
from datasets import Dataset, DatasetDict
|
|
5
|
+
|
|
6
|
+
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS, ICL_EVALUATORS
|
|
7
|
+
|
|
8
|
+
from opencompass.openicl.icl_evaluator import BaseEvaluator
|
|
9
|
+
from ..base import BaseDataset
|
|
10
|
+
from . import utils
|
|
11
|
+
from tqdm import tqdm
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@LOAD_DATASET.register_module()
class TheoremQADatasetV3(BaseDataset):
    """Loader for a TheoremQA file stored as a single JSON array."""

    @staticmethod
    def load(path: str):
        """Load the JSON file at ``path`` into a ``Dataset``.

        Every item's ``Answer`` field is converted to ``str`` before the
        dataset is built, so the column holds one uniform type.

        Args:
            path: Location of the JSON file.

        Returns:
            A ``Dataset`` created from the list of items.
        """
        # Read explicitly as UTF-8 instead of relying on the platform's
        # locale encoding, matching the other loaders in this package.
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for item in data:
            item['Answer'] = str(item['Answer'])
        return Dataset.from_list(data)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def TheoremQA_postprocess_v3(text: str) -> str:
    """Extract the final answer from ``text`` via ``utils.answer_clean``.

    The trigger phrases are passed in this order: the colon form first,
    then the capitalised and lower-case forms without a colon.
    """
    triggers = ["The answer is:", "The answer is", "the answer is"]
    return utils.answer_clean(triggers, text)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@ICL_EVALUATORS.register_module()
class TheoremQAEvaluatorV3(BaseEvaluator):
    """Evaluator that scores TheoremQA predictions against references."""

    def score(self, predictions, references, test_set):
        """Return the percentage of correct predictions plus per-item details.

        Args:
            predictions: Extracted answers, one per sample.
            references: Ground-truth answer strings aligned with
                ``predictions``.
            test_set: Samples aligned with ``predictions``; each must expose
                an ``Answer_type`` field.

        Returns:
            ``{'score': float, 'details': list}`` on success, or a dict with
            an ``error`` message when the input lengths differ. An empty
            input yields a score of ``0.0`` and no details.
        """
        if len(predictions) != len(references):
            return {"error": "preds and refrs have different length"}
        if not predictions:
            # Nothing to score; avoid dividing by zero below.
            return {'score': 0.0, 'details': []}

        details = []
        correct = 0
        for index, answer in enumerate(tqdm(predictions)):
            groundtruth = references[index]
            answer_type = test_set[index]['Answer_type']
            if answer_type in ['float', 'integer', 'bool']:
                # SECURITY NOTE: eval() executes the reference string as
                # Python. This is only acceptable while the dataset file is
                # trusted; do not feed untrusted references through here.
                groundtruth = [groundtruth, eval(groundtruth)]
            else:
                groundtruth = [groundtruth, None]

            is_correct = bool(
                utils.compare_answer_with_groundtruth(answer, *groundtruth))
            if is_correct:
                correct += 1

            details.append(
                {
                    "correct": groundtruth,
                    "pred": answer,
                    "is_correct": is_correct,
                }
            )

        score = correct / len(predictions) * 100
        return {'score': score, 'details': details}
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import math
|
|
3
|
+
from math import sqrt, sin, cos, log, pi, factorial, exp, e
|
|
4
|
+
E = 2.718
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def floatify(num: str):
    """Convert ``num`` to a number, or return ``None`` if it cannot be parsed.

    Values that parse to a whole number are returned as ``int``; everything
    else is returned as ``float``.
    """
    try:
        value = float(num)
    except Exception:
        return None
    return round(value) if value.is_integer() else value
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def within_eps(pred: float, gt: float):
    """Tell whether ``pred`` lies within 4% of ``gt`` (bounds inclusive).

    The tolerance is relative to ``abs(gt)``, so a ground truth of zero
    only accepts an exact match.
    """
    tolerance = abs(gt) * 0.04
    return gt - tolerance <= pred <= gt + tolerance
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def clean_units(pred_str: str):
    """Replace pi with 3.14, turn ``%`` into ``/100`` and drop unit symbols.

    The replacements run in a fixed order: the pi rewrites first, then the
    percent sign, then ``$``, ``¥``, ``°C``, `` C`` and ``°``.
    """

    def convert_pi_to_number(code_string):
        # Normalise the LaTeX spelling to the Unicode symbol first.
        code_string = code_string.replace('\\pi', 'π')
        rewrites = (
            # pi not preceded by a digit or a closing brace -> 3.14
            (r'(?<![\d}])\\?π', '3.14'),
            # digit directly followed by pi, e.g. "3π" -> "3*3.14"
            (r'(\d)(\\?π)', r'\1*3.14'),
            # pi wrapped in braces -> 3.14
            (r'\{(\\?π)\}', '3.14'),
            # pi after an explicit multiplication sign
            (r'\*(\\?π)', '*3.14'),
        )
        for pattern, replacement in rewrites:
            code_string = re.sub(pattern, replacement, code_string)
        return code_string

    cleaned = convert_pi_to_number(pred_str).replace('%', '/100')
    # '°C' must be removed before the bare '°'.
    for unit in ('$', '¥', '°C', ' C', '°'):
        cleaned = cleaned.replace(unit, '')
    return cleaned
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def number_it(num):
    """Best-effort conversion of ``num`` to a number.

    Values that are already ``int`` or ``float`` are returned unchanged.
    Strings are cleaned with ``clean_units``, passed through
    ``latex2sympy`` when that succeeds, and then parsed with ``floatify``.
    If direct parsing fails the text is evaluated as a Python expression;
    for a list or tuple result only the first element is used.

    Args:
        num: A number or a string holding a numeric or LaTeX expression.

    Returns:
        An ``int`` or ``float``, or ``None`` when nothing numeric can be
        extracted.
    """
    if isinstance(num, (int, float)):
        return num

    # Imported only after the numeric fast path so that plain numbers do
    # not require the optional latex2sympy2 package.
    from latex2sympy2 import latex2sympy

    num = clean_units(num)
    try:
        num = str(latex2sympy(num))
    except Exception:
        # Not valid LaTeX: fall through and parse the cleaned text as-is.
        pass

    value = floatify(num)
    if value is not None:
        return value

    try:
        # SECURITY NOTE: eval() runs the text as Python code. The input is
        # typically model output, so this is only safe in a sandboxed or
        # otherwise trusted evaluation environment.
        num = eval(num)
        if isinstance(num, (list, tuple)):
            num = num[0]
        return floatify(num)
    except Exception:
        return None
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def compare_two_numbers(p, gt):
    """Compare prediction ``p`` with ground truth ``gt``.

    An integer ground truth is matched against ``round(p)``; any other
    ground truth is matched with ``within_eps``. A NaN prediction, or any
    error raised while comparing (for example ``p`` being ``None``),
    counts as a mismatch.
    """
    try:
        if math.isnan(p):
            return False
        if not isinstance(gt, int):
            return within_eps(pred=p, gt=gt)
        return round(p) == gt
    except Exception:
        return False
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def compare_two_list(pred, gt):
    """Compare two collections of numbers irrespective of order.

    ``pred`` must be a ``list`` of ints/floats with the same length as
    ``gt``; otherwise the result is ``False``. Both sides are sorted and
    then compared pairwise with ``compare_two_numbers``.
    """
    if not isinstance(pred, list) or len(pred) != len(gt):
        return False
    if not all(isinstance(x, (int, float)) for x in pred):
        return False
    pairs = zip(sorted(pred), sorted(gt))
    return all(compare_two_numbers(p, g) for p, g in pairs)
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from .number_utils import clean_units, compare_two_numbers, compare_two_list, number_it
|
|
3
|
+
import contextlib
|
|
4
|
+
import signal
|
|
5
|
+
|
|
6
|
+
@contextlib.contextmanager
def time_limit(seconds: float):
    """Abort the enclosed block with ``ValueError`` after ``seconds``.

    The limit is implemented with ``SIGALRM`` and the real-time interval
    timer, so it relies on POSIX signals and, per the ``signal`` module
    documentation, must be used from the main thread.

    Args:
        seconds: Time budget for the ``with`` body.

    Raises:
        ValueError: Raised inside the ``with`` body when the timer fires.
    """

    def signal_handler(signum, frame):
        raise ValueError

    # Install the handler before arming the timer so an alarm can never be
    # delivered while the previous (possibly default, process-terminating)
    # handler is still active.
    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    signal.setitimer(signal.ITIMER_REAL, seconds)
    try:
        yield
    finally:
        signal.setitimer(signal.ITIMER_REAL, 0)
        # Put back whatever handler the caller had; signal.signal() returns
        # None when the old handler was not installed from Python, and None
        # cannot be re-installed.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def extract_theoremqa_answer(pred: str, answer_flag: bool = True):
    """Normalise a raw prediction into a comparable answer string.

    The checks run in this order:

    1. Text containing ``yes``/``true`` becomes ``'True'``.
    2. Text containing ``no``/``false`` becomes ``'False'``.
    3. Text containing an option marker ``(a)``-``(f)`` is returned as is.
    4. Otherwise, when ``answer_flag`` is set, the part after the last
       ``=`` is cleaned, parsed as LaTeX and evaluated; if that fails and
       the text looks like "<number> <unit>", only the number is kept.
    5. When ``answer_flag`` is not set, the last number found in the text
       is returned, or ``''`` if there is none.

    The yes/no/option checks are plain substring matches on the
    lower-cased text.

    Args:
        pred: Raw answer text.
        answer_flag: Whether an explicit answer trigger was found upstream.

    Returns:
        The normalised answer string.
    """
    lowered = pred.lower()
    if any(option in lowered for option in ('yes', 'true')):
        return 'True'
    if any(option in lowered for option in ('no', 'false')):
        return 'False'
    if any(option in lowered
           for option in ('(a)', '(b)', '(c)', '(d)', '(e)', '(f)')):
        return pred

    if not answer_flag:
        # Desperate search over the last number.
        numbers = re.findall(r'-?\d*\.?\d+', pred)
        return numbers[-1] if numbers else ''

    # Imported lazily: only this branch needs the optional dependency.
    from latex2sympy2 import latex2sympy

    # Extract the numbers out of the string.
    pred = pred.split('=')[-1].strip()
    pred = clean_units(pred)
    try:
        # NOTE(review): the evaluation is kept inside the time limit so it
        # is bounded as well - confirm this matches the intended scope.
        with time_limit(1):
            tmp = str(latex2sympy(pred))
            # SECURITY NOTE: eval() executes text derived from model
            # output; run only in a sandboxed or trusted environment.
            pred = str(eval(tmp))
    except Exception:
        # "<number> <unit>" style answers: keep just the leading number.
        if (re.match(r'-?[\d\.]+\s\D+$', pred)
                or re.match(r'-?[\d\.]+\s[^\s]+$', pred)):
            pred = pred.split(' ')[0]
    return pred
|
|
50
|
+
|
|
51
|
+
def answer_clean(direct_answer_trigger_for_fewshot: tuple, pred: str):
    """Cut the final answer out of a model response and normalise it.

    Args:
        direct_answer_trigger_for_fewshot: Phrases such as
            ``"The answer is"`` that introduce the answer. They are matched
            literally, in the given order.
        pred: Raw model output.

    Returns:
        The cleaned answer produced by ``extract_theoremqa_answer``, with
        any trailing ``.`` and then ``/`` characters removed.
    """
    triggers = direct_answer_trigger_for_fewshot
    pred = pred.strip('\n')

    # A trigger occurring more than once means the model kept generating
    # further few-shot examples; keep only the first paragraph.
    if any(pred.count(trigger) > 1 for trigger in triggers):
        pred = pred.split('\n\n')[0]

    # Split on the triggers to find the answer. Triggers are escaped so
    # that regex metacharacters in them are matched literally, and an empty
    # trigger collection never splits the text.
    if triggers:
        pattern = '|'.join(re.escape(trigger) for trigger in triggers)
        preds = re.split(pattern, pred)
    else:
        preds = [pred]
    answer_flag = len(preds) > 1
    if answer_flag:
        pred = preds[-1]

    pred = pred.strip('\n').rstrip('.').rstrip('/').strip(' ')
    pred = extract_theoremqa_answer(pred, answer_flag)

    # Remove the period at the end, again!
    return pred.rstrip('.').rstrip('/')
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def compare_answer_with_groundtruth(answer: str, groundtruth_str: str, groundtruth_num=None):
    """Decide whether ``answer`` matches the ground truth.

    Args:
        answer: Extracted prediction.
        groundtruth_str: Ground truth in textual form.
        groundtruth_num: Optional parsed ground truth: a number, or a
            collection of numbers for tuple-style answers.

    Returns:
        ``True`` when one of the following holds, ``False`` otherwise:

        * the ground truth is an option marker ``(a)``-``(f)`` and occurs
          in the answer (case-insensitive);
        * answer and ground truth are equal ignoring case;
        * ``groundtruth_num`` is a number and the numeric value of the
          answer matches it under ``compare_two_numbers``;
        * ``groundtruth_num`` is not a number, the answer is wrapped in
          parentheses, and its elements match under ``compare_two_list``.
    """
    truth = groundtruth_str.lower()
    if truth in ['(a)', '(b)', '(c)', '(d)', '(e)', '(f)']:
        return truth in answer.lower()
    if answer.lower() == truth:
        return True
    if groundtruth_num is None:
        return False
    if isinstance(groundtruth_num, (int, float)):
        return compare_two_numbers(number_it(answer), groundtruth_num)
    if not (answer.startswith('(') and answer.endswith(')')):
        return False

    try:
        # SECURITY NOTE: eval() executes the answer text as Python. The
        # answer is typically model output, so this is only safe in a
        # sandboxed or otherwise trusted evaluation environment.
        answer = list(eval(answer))
        answer = [number_it(a) for a in answer]
    except Exception:
        return False
    return compare_two_list(answer, groundtruth_num)
|