opencompass 0.2.4__tar.gz → 0.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {opencompass-0.2.4 → opencompass-0.2.6}/PKG-INFO +20 -13
- {opencompass-0.2.4 → opencompass-0.2.6}/README.md +19 -12
- opencompass-0.2.6/opencompass/__init__.py +1 -0
- opencompass-0.2.6/opencompass/cli/main.py +383 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/GaokaoBench.py +23 -6
- opencompass-0.2.6/opencompass/datasets/IFEval/__init__.py +0 -0
- opencompass-0.2.6/opencompass/datasets/IFEval/evaluation_main.py +141 -0
- opencompass-0.2.6/opencompass/datasets/IFEval/ifeval.py +95 -0
- opencompass-0.2.6/opencompass/datasets/IFEval/instructions.py +1570 -0
- opencompass-0.2.6/opencompass/datasets/IFEval/instructions_registry.py +190 -0
- opencompass-0.2.6/opencompass/datasets/IFEval/instructions_util.py +145 -0
- opencompass-0.2.6/opencompass/datasets/MMLUArabic.py +33 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/__init__.py +7 -0
- opencompass-0.2.6/opencompass/datasets/benbench.py +88 -0
- opencompass-0.2.6/opencompass/datasets/charm.py +55 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/cibench.py +178 -149
- opencompass-0.2.6/opencompass/datasets/drop_simple_eval.py +80 -0
- opencompass-0.2.6/opencompass/datasets/flames.py +57 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/gpqa.py +53 -1
- opencompass-0.2.6/opencompass/datasets/humaneval.py +173 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/__init__.py +1 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/cjft.py +19 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/flzx.py +18 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/ftcs.py +19 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/jdzy.py +36 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/jec_ac.py +29 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/jec_kd.py +29 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/jetq.py +43 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/lblj.py +29 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/ljp_accusation.py +76 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/ljp_article.py +70 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/ljp_imprison.py +51 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/sjjc.py +64 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/wbfl.py +42 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/wsjd.py +52 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/xxcq.py +17 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/ydlj.py +17 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/yqzy.py +18 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/evaluation_functions/zxfl.py +27 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/utils/__init__.py +1 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/utils/char_smi.py +456 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/utils/compare_m2_for_evaluation.py +433 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/utils/comprehension_scores.py +82 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/utils/function_utils.py +49 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/utils/modules/__init__.py +1 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/utils/modules/alignment.py +332 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/utils/modules/annotator.py +76 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/utils/modules/classifier.py +150 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/utils/modules/merger.py +273 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/utils/modules/tokenization.py +346 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/utils/modules/tokenizer.py +91 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/utils/parallel_to_m2.py +221 -0
- opencompass-0.2.6/opencompass/datasets/lawbench/utils/rc_f1.py +158 -0
- opencompass-0.2.6/opencompass/datasets/llm_compression.py +36 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/math.py +15 -0
- opencompass-0.2.6/opencompass/datasets/mathbench.py +381 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/mbpp.py +12 -8
- opencompass-0.2.6/opencompass/datasets/mgsm.py +78 -0
- opencompass-0.2.6/opencompass/datasets/mmlu_pro.py +31 -0
- opencompass-0.2.6/opencompass/datasets/needlebench/__init__.py +0 -0
- opencompass-0.2.6/opencompass/datasets/needlebench/atc.py +247 -0
- opencompass-0.2.6/opencompass/datasets/needlebench/atc_choice.py +169 -0
- opencompass-0.2.6/opencompass/datasets/needlebench/multi.py +257 -0
- opencompass-0.2.6/opencompass/datasets/needlebench/origin.py +277 -0
- opencompass-0.2.6/opencompass/datasets/needlebench/parallel.py +311 -0
- opencompass-0.2.6/opencompass/datasets/s3eval.py +169 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/subjective/__init__.py +7 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/subjective/alignbench.py +3 -15
- opencompass-0.2.6/opencompass/datasets/subjective/arena_hard.py +35 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/subjective/compass_arena.py +1 -5
- opencompass-0.2.6/opencompass/datasets/subjective/compassbench.py +102 -0
- opencompass-0.2.6/opencompass/datasets/subjective/compassbench_control_length_bias.py +130 -0
- opencompass-0.2.6/opencompass/datasets/subjective/fofo.py +36 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/subjective/mtbench.py +7 -2
- opencompass-0.2.6/opencompass/datasets/subjective/mtbench101.py +325 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/subjective/subjective_cmp.py +1 -1
- opencompass-0.2.6/opencompass/datasets/subjective/wildbench.py +249 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/taco.py +6 -4
- opencompass-0.2.6/opencompass/datasets/teval/utils/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/winogrande.py +9 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/__init__.py +23 -14
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/ai360_api.py +33 -25
- opencompass-0.2.6/opencompass/models/baichuan_api.py +179 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/baidu_api.py +28 -9
- opencompass-0.2.6/opencompass/models/deepseek_api.py +178 -0
- opencompass-0.2.4/opencompass/models/hunyuan_api.py → opencompass-0.2.6/opencompass/models/doubao.py +31 -42
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/gemini_api.py +0 -63
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/huggingface.py +9 -2
- opencompass-0.2.6/opencompass/models/huggingface_above_v4_33.py +447 -0
- opencompass-0.2.6/opencompass/models/hunyuan_api.py +151 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/lagent.py +4 -3
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/lightllm_api.py +169 -4
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/lmdeploy_pytorch.py +12 -3
- opencompass-0.2.6/opencompass/models/minimax_api.py +355 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/openai_api.py +23 -142
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/qwen_api.py +1 -2
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/sensetime_api.py +20 -11
- opencompass-0.2.6/opencompass/models/stepfun_api.py +182 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/turbomind.py +29 -12
- opencompass-0.2.6/opencompass/models/turbomind_with_tf_above_v4_33.py +197 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/vllm.py +49 -23
- opencompass-0.2.6/opencompass/models/vllm_with_tf_above_v4_33.py +134 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/xunfei_api.py +156 -0
- opencompass-0.2.4/opencompass/models/minimax_api.py → opencompass-0.2.6/opencompass/models/yi_api.py +63 -63
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/__init__.py +1 -0
- opencompass-0.2.6/opencompass/openicl/icl_evaluator/icl_bpc_evaluator.py +32 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py +26 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/lm_evaluator.py +50 -61
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/__init__.py +1 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_base_inferencer.py +2 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_gen_inferencer.py +21 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_ll_inferencer.py +27 -49
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_ppl_inferencer.py +35 -67
- opencompass-0.2.6/opencompass/openicl/icl_inferencer/icl_sw_ce_loss_inferencer.py +352 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/partitioners/__init__.py +0 -1
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/partitioners/num_worker.py +12 -5
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/partitioners/sub_naive.py +74 -64
- opencompass-0.2.6/opencompass/partitioners/sub_num_worker.py +209 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/partitioners/sub_size.py +106 -87
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/registry.py +0 -8
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/runners/dlc.py +36 -24
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/runners/local.py +32 -18
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/summarizers/__init__.py +2 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/summarizers/default.py +18 -11
- opencompass-0.2.6/opencompass/summarizers/llm_compression.py +200 -0
- opencompass-0.2.6/opencompass/summarizers/multi_faceted.py +46 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/summarizers/needlebench.py +1 -1
- opencompass-0.2.6/opencompass/summarizers/subjective/__init__.py +16 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/summarizers/subjective/alignmentbench.py +36 -28
- opencompass-0.2.6/opencompass/summarizers/subjective/all_obj.py +123 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/summarizers/subjective/alpacaeval.py +43 -27
- opencompass-0.2.6/opencompass/summarizers/subjective/arenahard.py +342 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/summarizers/subjective/compass_arena.py +20 -11
- opencompass-0.2.6/opencompass/summarizers/subjective/compassbench.py +248 -0
- opencompass-0.2.6/opencompass/summarizers/subjective/flames.py +93 -0
- opencompass-0.2.6/opencompass/summarizers/subjective/fofo.py +164 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/summarizers/subjective/mtbench.py +47 -44
- opencompass-0.2.6/opencompass/summarizers/subjective/mtbench101.py +147 -0
- opencompass-0.2.6/opencompass/summarizers/subjective/subjective.py +105 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/summarizers/subjective/utils.py +5 -8
- opencompass-0.2.6/opencompass/summarizers/subjective/wildbench.py +295 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/tasks/__init__.py +0 -1
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/tasks/openicl_eval.py +6 -2
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/tasks/openicl_infer.py +7 -6
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/tasks/subjective_eval.py +18 -4
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/utils/build.py +0 -1
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/utils/file.py +3 -3
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/utils/prompt.py +11 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/utils/run.py +114 -92
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/utils/text_postprocessors.py +22 -17
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass.egg-info/PKG-INFO +20 -13
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass.egg-info/SOURCES.txt +82 -3
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass.egg-info/requires.txt +4 -2
- opencompass-0.2.4/opencompass/__init__.py +0 -1
- opencompass-0.2.4/opencompass/datasets/humaneval.py +0 -233
- opencompass-0.2.4/opencompass/datasets/mathbench.py +0 -106
- opencompass-0.2.4/opencompass/models/baichuan_api.py +0 -283
- opencompass-0.2.4/opencompass/partitioners/mm_naive.py +0 -119
- opencompass-0.2.4/opencompass/summarizers/subjective/__init__.py +0 -9
- opencompass-0.2.4/opencompass/summarizers/subjective/information_retrival.py +0 -138
- opencompass-0.2.4/opencompass/tasks/mm_infer.py +0 -160
- {opencompass-0.2.4/opencompass/datasets/teval/utils → opencompass-0.2.6/opencompass/cli}/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/FinanceIQ.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/cmp_GCP_D.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/cmp_KSP.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/cmp_TSP_D.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/hard_GCP.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/hard_MSP.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/hard_TSP.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/p_BSP.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/p_EDP.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/p_SPP.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/prompts.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/NPHardEval/utils.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/OpenFinData.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/QuALITY.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/TheoremQA/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/TheoremQA/legacy.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/TheoremQA/main.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/TheoremQA/number_utils.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/TheoremQA/utils.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/advglue.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/afqmcd.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/agieval/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/agieval/agieval.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/agieval/constructions.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/agieval/dataset_loader.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/agieval/evaluation.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/agieval/math_equivalence.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/agieval/post_process.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/agieval/utils.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/anli.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/anthropics_evals.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/apps.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/arc.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/ax.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/base.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/bbh.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/boolq.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/bustum.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/c3.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/cb.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/ceval.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/chembench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/chid.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/circular.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/civilcomments.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/clozeTest_maxmin.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/cluewsc.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/cmb.py +1 -1
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/cmmlu.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/cmnli.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/cmrc.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/commonsenseqa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/commonsenseqa_cn.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/copa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/crowspairs.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/crowspairs_cn.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/csl.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/custom.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/cvalues.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/drcd.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/drop.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/ds1000.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/ds1000_interpreter.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/eprstmt.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/flores.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/game24.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/govrepcrs.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/gsm8k.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/gsm_hard.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/hellaswag.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/huggingface.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/humaneval_multi.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/humanevalx.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/hungarian_math.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/infinitebench/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_codedebug.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_coderun.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_endia.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_enmc.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_enqa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_ensum.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_mathcalc.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_mathfind.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_retrievekv.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_retrievenumber.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_retrievepasskey.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/infinitebench/infinitebench_zhqa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/infinitebench/utils.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/iwslt2017.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/jigsawmultilingual.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/jsonl.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/kaoshi.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/lambada.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/lawbench/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/lawbench/lawbench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/lcsts.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/leval/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/leval/evaluators.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/leval/leval_coursera.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/leval/leval_financial_qa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/leval/leval_gov_report_summ.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/leval/leval_gsm100.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/leval/leval_legal_contract_qa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/leval/leval_meeting_summ.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/leval/leval_multidoc_qa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/leval/leval_narrattive_qa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/leval/leval_natural_question.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/leval/leval_news_summ.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/leval/leval_paper_assistant.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/leval/leval_patent_summ.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/leval/leval_quality.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/leval/leval_review_summ.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/leval/leval_scientific_qa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/leval/leval_topic_retrieval.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/leval/leval_tpo.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/leval/leval_tvshow_summ.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/lmeval.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/evaluators.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_2wikim_qa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_dureader.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_gov_report.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_hotpot_qa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_lcc.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_lsht.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_multi_news.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_multifieldqa_en.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_multifieldqa_zh.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_musique.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_narrative_qa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_passage_count.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_passage_retrieval_en.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_passage_retrieval_zh.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_qasper.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_qmsum.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_repobench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_samsum.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_trec.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_trivia_qa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/longbench/longbench_vcsum.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/lveval/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/lveval/evaluators.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/lveval/lveval_cmrc_mixup.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/lveval/lveval_dureader_mixup.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/lveval/lveval_factrecall_en.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/lveval/lveval_factrecall_zh.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/lveval/lveval_hotpotwikiqa_mixup.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/lveval/lveval_lic_mixup.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/lveval/lveval_loogle_CR_mixup.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/lveval/lveval_loogle_MIR_mixup.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/lveval/lveval_loogle_SD_mixup.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/lveval/lveval_multifieldqa_en_mixup.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/lveval/lveval_multifieldqa_zh_mixup.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/mastermath2024v1.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/math401.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/math_intern.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/medbench/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/medbench/constructions.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/medbench/dataset_loader.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/medbench/evaluation.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/medbench/math_equivalence.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/medbench/medbench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/medbench/post_process.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/medbench/utils.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/mmlu.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/multirc.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/narrativeqa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/natural_question.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/natural_question_cn.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/obqa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/piqa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/py150.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/qasper.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/qaspercut.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/race.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/realtoxicprompts.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/reasonbench/ReasonBenchDataset.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/reasonbench/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/record.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/rolebench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/safety.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/scibench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/siqa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/squad20.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/storycloze.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/strategyqa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/subjective/corev2.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/subjective/creationbench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/subjective/information_retrival.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/subjective/multiround.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/summedits.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/summscreen.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/svamp.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/tabmwp.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/teval/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/teval/evaluators/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/teval/evaluators/instruct_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/teval/evaluators/planning_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/teval/evaluators/reason_retrieve_understand_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/teval/evaluators/review_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/teval/schema.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/teval/utils/convert_results.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/teval/utils/format_load.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/teval/utils/meta_template.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/teval/utils/template.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/tnews.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/triviaqa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/triviaqarc.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/truthfulqa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/tydiqa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/wic.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/wikibench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/winograd.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/wnli.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/wsc.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/xcopa.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/xiezhi.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/xlsum.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/datasets/xsum.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/metrics/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/metrics/dump_results.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/metrics/mme_score.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/metrics/seedbench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/accessory.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/alaya.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/base.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/base_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/bytedance_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/claude_api/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/claude_api/claude_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/claude_api/postprocessors.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/glm.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/intern_model.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/krgpt_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/langchain.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/llama2.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/lmdeploy_tis.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/mistral_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/mixtral.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/modelscope.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/moonshot_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/nanbeige_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/pangu_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/turbomind_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/turbomind_tis.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/unigpt_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/yayi_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/zhipuai_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/models/zhipuai_v2_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_dataset_reader.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/icl_agent_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/icl_aucroc_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/icl_base_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/icl_circular_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/icl_em_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/icl_jieba_rouge_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/icl_misc_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/icl_plugin_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_evaluator/icl_toxic_evaluator.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_agent_inferencer.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_attack_inferencer.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_chat_inferencer.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_clp_inferencer.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_mink_percent_inferencer.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_ppl_only_inferencer.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_sc_inferencer.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_inferencer/icl_tot_inferencer.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_prompt_template.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_retriever/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_retriever/icl_base_retriever.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_retriever/icl_bm25_retriever.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_retriever/icl_dpp_retriever.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_retriever/icl_fix_k_retriever.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_retriever/icl_mdl_retriever.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_retriever/icl_random_retriever.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_retriever/icl_topk_retriever.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_retriever/icl_votek_retriever.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/icl_retriever/icl_zero_retriever.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/utils/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/openicl/utils/logging.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/partitioners/base.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/partitioners/naive.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/partitioners/size.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/runners/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/runners/base.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/runners/local_api.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/runners/slurm.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/runners/slurm_sequential.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/summarizers/circular.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/summarizers/multi_model.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/summarizers/subjective/corev2.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/summarizers/subjective/creationbench.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/summarizers/subjective/multiround.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/summarizers/subjective/subjective_post_process.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/summarizers/summarizer_pretrain.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/tasks/base.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/tasks/llm_eval.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/tasks/openicl_attack.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/utils/__init__.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/utils/abbr.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/utils/auxiliary.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/utils/collect_env.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/utils/dependency.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/utils/fileio.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/utils/lark.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/utils/logging.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/utils/menu.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass/utils/types.py +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass.egg-info/dependency_links.txt +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass.egg-info/entry_points.txt +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/opencompass.egg-info/top_level.txt +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/setup.cfg +0 -0
- {opencompass-0.2.4 → opencompass-0.2.6}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: opencompass
|
|
3
|
-
Version: 0.2.4
|
|
3
|
+
Version: 0.2.6
|
|
4
4
|
Summary: A comprehensive toolkit for large model evaluation
|
|
5
5
|
Home-page: https://github.com/open-compass/opencompass
|
|
6
6
|
Author: OpenCompass Contributors
|
|
@@ -78,6 +78,13 @@ Description: <div align="center">
|
|
|
78
78
|
|
|
79
79
|
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
|
|
80
80
|
|
|
81
|
+
- **\[2024.07.04\]** OpenCompass now supports InternLM2.5, which has **outstanding reasoning capability**, a **1M context window**, and **stronger tool use**. You can try the models in [OpenCompass Config](https://github.com/open-compass/opencompass/tree/main/configs/models/hf_internlm) and [InternLM](https://github.com/InternLM/InternLM). 🔥🔥🔥
|
|
82
|
+
- **\[2024.06.20\]** OpenCompass now supports one-click switching between inference acceleration backends, enhancing the efficiency of the evaluation process. In addition to the default HuggingFace inference backend, it now also supports popular backends [LMDeploy](https://github.com/InternLM/lmdeploy) and [vLLM](https://github.com/vllm-project/vllm). This feature is available via a simple command-line switch and through deployment APIs. For detailed usage, see the [documentation](docs/en/advanced_guides/accelerator_intro.md).🔥🔥🔥.
|
|
83
|
+
- **\[2024.05.08\]** We supported the evaluation of 4 MoE models: [Mixtral-8x22B-v0.1](configs/models/mixtral/hf_mixtral_8x22b_v0_1.py), [Mixtral-8x22B-Instruct-v0.1](configs/models/mixtral/hf_mixtral_8x22b_instruct_v0_1.py), [Qwen1.5-MoE-A2.7B](configs/models/qwen/hf_qwen1_5_moe_a2_7b.py), [Qwen1.5-MoE-A2.7B-Chat](configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py). Try them out now!
|
|
84
|
+
- **\[2024.04.30\]** We supported evaluating a model's compression efficiency by calculating its Bits per Character (BPC) metric on [external corpora](configs/datasets/llm_compression/README.md) ([official paper](https://github.com/hkust-nlp/llm-compression-intelligence)). Check out the [llm-compression](configs/eval_llm_compression.py) evaluation config now! 🔥🔥🔥
|
|
85
|
+
- **\[2024.04.29\]** We report the performance of several well-known LLMs on common benchmarks; see the [documentation](https://opencompass.readthedocs.io/en/latest/user_guides/corebench.html) for more information! 🔥🔥🔥
|
|
86
|
+
- **\[2024.04.26\]** We deprecated the multi-modality evaluation function in OpenCompass; the related implementation has moved to [VLMEvalKit](https://github.com/open-compass/VLMEvalKit), welcome to use it! 🔥🔥🔥
|
|
87
|
+
- **\[2024.04.26\]** We supported the evaluation of [ArenaHard](configs/eval_subjective_arena_hard.py), welcome to try! 🔥🔥🔥
|
|
81
88
|
- **\[2024.04.22\]** We supported the evaluation of [LLaMA3](configs/models/hf_llama/hf_llama3_8b.py) and [LLaMA3-Instruct](configs/models/hf_llama/hf_llama3_8b_instruct.py), welcome to try! 🔥🔥🔥
|
|
82
89
|
- **\[2024.02.29\]** We supported MT-Bench, AlpacaEval and AlignBench; more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/subjective_evaluation.html)
|
|
83
90
|
- **\[2024.01.30\]** We release OpenCompass 2.0. Click [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home) for more information !
|
|
@@ -130,7 +137,7 @@ Description: <div align="center">
|
|
|
130
137
|
git clone https://github.com/open-compass/opencompass opencompass
|
|
131
138
|
cd opencompass
|
|
132
139
|
pip install -e .
|
|
133
|
-
# also please install
|
|
140
|
+
# Also install the required packages via `pip install -r requirements/api.txt` if you need API models.
|
|
134
141
|
```
|
|
135
142
|
|
|
136
143
|
### 📂 Data Preparation
|
|
@@ -153,6 +160,12 @@ Description: <div align="center">
|
|
|
153
160
|
python run.py --models hf_llama_7b --datasets mmlu_ppl ceval_ppl
|
|
154
161
|
```
|
|
155
162
|
|
|
163
|
+
Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
python run.py --models hf_llama_7b --datasets mmlu_ppl ceval_ppl -a lmdeploy
|
|
167
|
+
```
|
|
168
|
+
|
|
156
169
|
OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
|
|
157
170
|
|
|
158
171
|
```bash
|
|
@@ -165,19 +178,13 @@ Description: <div align="center">
|
|
|
165
178
|
You can also evaluate other HuggingFace models via command line. Taking LLaMA-7b as an example:
|
|
166
179
|
|
|
167
180
|
```bash
|
|
168
|
-
python run.py --datasets ceval_ppl mmlu_ppl
|
|
169
|
-
--hf-path huggyllama/llama-7b \ # HuggingFace model path
|
|
170
|
-
--model-kwargs device_map='auto' \ # Arguments for model construction
|
|
171
|
-
--tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \ # Arguments for tokenizer construction
|
|
172
|
-
--max-out-len 100 \ # Maximum number of tokens generated
|
|
173
|
-
--max-seq-len 2048 \ # Maximum sequence length the model can accept
|
|
174
|
-
--batch-size 8 \ # Batch size
|
|
175
|
-
--no-batch-padding \ # Don't enable batch padding, infer through for loop to avoid performance loss
|
|
176
|
-
--num-gpus 1 # Number of minimum required GPUs
|
|
181
|
+
python run.py --datasets ceval_ppl mmlu_ppl --hf-type base --hf-path huggyllama/llama-7b
|
|
177
182
|
```
|
|
178
183
|
|
|
179
|
-
>
|
|
180
|
-
>
|
|
184
|
+
> \[!TIP\]
|
|
185
|
+
>
|
|
186
|
+
> Configurations with `_ppl` are typically designed for base models.
|
|
187
|
+
> Configurations with `_gen` can be used for both base models and chat models.
|
|
181
188
|
|
|
182
189
|
Through the command line or configuration files, OpenCompass also supports evaluating APIs or custom models, as well as more diversified evaluation strategies. Please read the [Quick Start](https://opencompass.readthedocs.io/en/latest/get_started/quick_start.html) to learn how to run an evaluation task.
|
|
183
190
|
|
|
@@ -70,6 +70,13 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
|
|
|
70
70
|
|
|
71
71
|
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
|
|
72
72
|
|
|
73
|
+
- **\[2024.07.04\]** OpenCompass now supports InternLM2.5, which has **outstanding reasoning capability**, a **1M context window**, and **stronger tool use**. You can try the models in [OpenCompass Config](https://github.com/open-compass/opencompass/tree/main/configs/models/hf_internlm) and [InternLM](https://github.com/InternLM/InternLM). 🔥🔥🔥
|
|
74
|
+
- **\[2024.06.20\]** OpenCompass now supports one-click switching between inference acceleration backends, enhancing the efficiency of the evaluation process. In addition to the default HuggingFace inference backend, it now also supports popular backends [LMDeploy](https://github.com/InternLM/lmdeploy) and [vLLM](https://github.com/vllm-project/vllm). This feature is available via a simple command-line switch and through deployment APIs. For detailed usage, see the [documentation](docs/en/advanced_guides/accelerator_intro.md).🔥🔥🔥.
|
|
75
|
+
- **\[2024.05.08\]** We supported the evaluation of 4 MoE models: [Mixtral-8x22B-v0.1](configs/models/mixtral/hf_mixtral_8x22b_v0_1.py), [Mixtral-8x22B-Instruct-v0.1](configs/models/mixtral/hf_mixtral_8x22b_instruct_v0_1.py), [Qwen1.5-MoE-A2.7B](configs/models/qwen/hf_qwen1_5_moe_a2_7b.py), [Qwen1.5-MoE-A2.7B-Chat](configs/models/qwen/hf_qwen1_5_moe_a2_7b_chat.py). Try them out now!
|
|
76
|
+
- **\[2024.04.30\]** We supported evaluating a model's compression efficiency by calculating its Bits per Character (BPC) metric on [external corpora](configs/datasets/llm_compression/README.md) ([official paper](https://github.com/hkust-nlp/llm-compression-intelligence)). Check out the [llm-compression](configs/eval_llm_compression.py) evaluation config now! 🔥🔥🔥
|
|
77
|
+
- **\[2024.04.29\]** We report the performance of several well-known LLMs on common benchmarks; see the [documentation](https://opencompass.readthedocs.io/en/latest/user_guides/corebench.html) for more information! 🔥🔥🔥
|
|
78
|
+
- **\[2024.04.26\]** We deprecated the multi-modality evaluation function in OpenCompass; the related implementation has moved to [VLMEvalKit](https://github.com/open-compass/VLMEvalKit), welcome to use it! 🔥🔥🔥
|
|
79
|
+
- **\[2024.04.26\]** We supported the evaluation of [ArenaHard](configs/eval_subjective_arena_hard.py), welcome to try! 🔥🔥🔥
|
|
73
80
|
- **\[2024.04.22\]** We supported the evaluation of [LLaMA3](configs/models/hf_llama/hf_llama3_8b.py) and [LLaMA3-Instruct](configs/models/hf_llama/hf_llama3_8b_instruct.py), welcome to try! 🔥🔥🔥
|
|
74
81
|
- **\[2024.02.29\]** We supported MT-Bench, AlpacaEval and AlignBench; more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/subjective_evaluation.html)
|
|
75
82
|
- **\[2024.01.30\]** We release OpenCompass 2.0. Click [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home) for more information !
|
|
@@ -122,7 +129,7 @@ conda activate opencompass
|
|
|
122
129
|
git clone https://github.com/open-compass/opencompass opencompass
|
|
123
130
|
cd opencompass
|
|
124
131
|
pip install -e .
|
|
125
|
-
# also please install
|
|
132
|
+
# Also install the required packages via `pip install -r requirements/api.txt` if you need API models.
|
|
126
133
|
```
|
|
127
134
|
|
|
128
135
|
### 📂 Data Preparation
|
|
@@ -145,6 +152,12 @@ After ensuring that OpenCompass is installed correctly according to the above st
|
|
|
145
152
|
python run.py --models hf_llama_7b --datasets mmlu_ppl ceval_ppl
|
|
146
153
|
```
|
|
147
154
|
|
|
155
|
+
Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
python run.py --models hf_llama_7b --datasets mmlu_ppl ceval_ppl -a lmdeploy
|
|
159
|
+
```
|
|
160
|
+
|
|
148
161
|
OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
|
|
149
162
|
|
|
150
163
|
```bash
|
|
@@ -157,19 +170,13 @@ python tools/list_configs.py llama mmlu
|
|
|
157
170
|
You can also evaluate other HuggingFace models via command line. Taking LLaMA-7b as an example:
|
|
158
171
|
|
|
159
172
|
```bash
|
|
160
|
-
python run.py --datasets ceval_ppl mmlu_ppl
|
|
161
|
-
--hf-path huggyllama/llama-7b \ # HuggingFace model path
|
|
162
|
-
--model-kwargs device_map='auto' \ # Arguments for model construction
|
|
163
|
-
--tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \ # Arguments for tokenizer construction
|
|
164
|
-
--max-out-len 100 \ # Maximum number of tokens generated
|
|
165
|
-
--max-seq-len 2048 \ # Maximum sequence length the model can accept
|
|
166
|
-
--batch-size 8 \ # Batch size
|
|
167
|
-
--no-batch-padding \ # Don't enable batch padding, infer through for loop to avoid performance loss
|
|
168
|
-
--num-gpus 1 # Number of minimum required GPUs
|
|
173
|
+
python run.py --datasets ceval_ppl mmlu_ppl --hf-type base --hf-path huggyllama/llama-7b
|
|
169
174
|
```
|
|
170
175
|
|
|
171
|
-
>
|
|
172
|
-
>
|
|
176
|
+
> \[!TIP\]
|
|
177
|
+
>
|
|
178
|
+
> Configurations with `_ppl` are typically designed for base models.
|
|
179
|
+
> Configurations with `_gen` can be used for both base models and chat models.
|
|
173
180
|
|
|
174
181
|
Through the command line or configuration files, OpenCompass also supports evaluating APIs or custom models, as well as more diversified evaluation strategies. Please read the [Quick Start](https://opencompass.readthedocs.io/en/latest/get_started/quick_start.html) to learn how to run an evaluation task.
|
|
175
182
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = '0.2.6'
|
|
@@ -0,0 +1,383 @@
|
|
|
1
|
+
# flake8: noqa
|
|
2
|
+
# yapf: disable
|
|
3
|
+
import argparse
|
|
4
|
+
import getpass
|
|
5
|
+
import os
|
|
6
|
+
import os.path as osp
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
|
|
9
|
+
from mmengine.config import Config, DictAction
|
|
10
|
+
|
|
11
|
+
from opencompass.registry import PARTITIONERS, RUNNERS, build_from_cfg
|
|
12
|
+
from opencompass.runners import SlurmRunner
|
|
13
|
+
from opencompass.summarizers import DefaultSummarizer
|
|
14
|
+
from opencompass.utils import LarkReporter, get_logger
|
|
15
|
+
from opencompass.utils.run import (fill_eval_cfg, fill_infer_cfg,
|
|
16
|
+
get_config_from_arg)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def parse_args():
|
|
20
|
+
parser = argparse.ArgumentParser(description='Run an evaluation task')
|
|
21
|
+
parser.add_argument('config', nargs='?', help='Train config file path')
|
|
22
|
+
|
|
23
|
+
# add mutually exclusive args `--slurm` `--dlc`, defaults to local runner
|
|
24
|
+
# if "infer" or "eval" not specified
|
|
25
|
+
launch_method = parser.add_mutually_exclusive_group()
|
|
26
|
+
launch_method.add_argument('--slurm',
|
|
27
|
+
action='store_true',
|
|
28
|
+
default=False,
|
|
29
|
+
help='Whether to force tasks to run with srun. '
|
|
30
|
+
'If True, `--partition(-p)` must be set. '
|
|
31
|
+
'Defaults to False')
|
|
32
|
+
launch_method.add_argument('--dlc',
|
|
33
|
+
action='store_true',
|
|
34
|
+
default=False,
|
|
35
|
+
help='Whether to force tasks to run on dlc. If '
|
|
36
|
+
'True, `--aliyun-cfg` must be set. Defaults'
|
|
37
|
+
' to False')
|
|
38
|
+
# Add shortcut parameters (models, datasets and summarizer)
|
|
39
|
+
parser.add_argument('--models', nargs='+', help='', default=None)
|
|
40
|
+
parser.add_argument('--datasets', nargs='+', help='', default=None)
|
|
41
|
+
parser.add_argument('--summarizer', help='', default=None)
|
|
42
|
+
# add general args
|
|
43
|
+
parser.add_argument('--debug',
|
|
44
|
+
help='Debug mode, in which scheduler will run tasks '
|
|
45
|
+
'in the single process, and output will not be '
|
|
46
|
+
'redirected to files',
|
|
47
|
+
action='store_true',
|
|
48
|
+
default=False)
|
|
49
|
+
parser.add_argument('--dry-run',
|
|
50
|
+
help='Dry run mode, in which the scheduler will not '
|
|
51
|
+
'actually run the tasks, but only print the commands '
|
|
52
|
+
'to run',
|
|
53
|
+
action='store_true',
|
|
54
|
+
default=False)
|
|
55
|
+
parser.add_argument(
|
|
56
|
+
'-a', '--accelerator',
|
|
57
|
+
help='Infer accelerator, support vllm and lmdeploy now.',
|
|
58
|
+
choices=['vllm', 'lmdeploy', None],
|
|
59
|
+
default=None,
|
|
60
|
+
type=str)
|
|
61
|
+
parser.add_argument('-m',
|
|
62
|
+
'--mode',
|
|
63
|
+
help='Running mode. You can choose "infer" if you '
|
|
64
|
+
'only want the inference results, or "eval" if you '
|
|
65
|
+
'already have the results and want to evaluate them, '
|
|
66
|
+
'or "viz" if you want to visualize the results.',
|
|
67
|
+
choices=['all', 'infer', 'eval', 'viz'],
|
|
68
|
+
default='all',
|
|
69
|
+
type=str)
|
|
70
|
+
parser.add_argument('-r',
|
|
71
|
+
'--reuse',
|
|
72
|
+
nargs='?',
|
|
73
|
+
type=str,
|
|
74
|
+
const='latest',
|
|
75
|
+
help='Reuse previous outputs & results, and run any '
|
|
76
|
+
'missing jobs presented in the config. If its '
|
|
77
|
+
'argument is not specified, the latest results in '
|
|
78
|
+
'the work_dir will be reused. The argument should '
|
|
79
|
+
'also be a specific timestamp, e.g. 20230516_144254')
|
|
80
|
+
parser.add_argument('-w',
|
|
81
|
+
'--work-dir',
|
|
82
|
+
help='Work path, all the outputs will be '
|
|
83
|
+
'saved in this path, including the slurm logs, '
|
|
84
|
+
'the evaluation results, the summary results, etc.'
|
|
85
|
+
'If not specified, the work_dir will be set to '
|
|
86
|
+
'outputs/default.',
|
|
87
|
+
default=None,
|
|
88
|
+
type=str)
|
|
89
|
+
parser.add_argument(
|
|
90
|
+
'--config-dir',
|
|
91
|
+
default='configs',
|
|
92
|
+
help='Use the custom config directory instead of config/ to '
|
|
93
|
+
'search the configs for datasets, models and summarizers',
|
|
94
|
+
type=str)
|
|
95
|
+
parser.add_argument('-l',
|
|
96
|
+
'--lark',
|
|
97
|
+
help='Report the running status to lark bot',
|
|
98
|
+
action='store_true',
|
|
99
|
+
default=False)
|
|
100
|
+
parser.add_argument('--max-num-workers',
|
|
101
|
+
help='Max number of workers to run in parallel. '
|
|
102
|
+
'Will be overrideen by the "max_num_workers" argument '
|
|
103
|
+
'in the config.',
|
|
104
|
+
type=int,
|
|
105
|
+
default=1)
|
|
106
|
+
parser.add_argument('--max-workers-per-gpu',
|
|
107
|
+
help='Max task to run in parallel on one GPU. '
|
|
108
|
+
'It will only be used in the local runner.',
|
|
109
|
+
type=int,
|
|
110
|
+
default=1)
|
|
111
|
+
parser.add_argument(
|
|
112
|
+
'--retry',
|
|
113
|
+
help='Number of retries if the job failed when using slurm or dlc. '
|
|
114
|
+
'Will be overrideen by the "retry" argument in the config.',
|
|
115
|
+
type=int,
|
|
116
|
+
default=2)
|
|
117
|
+
parser.add_argument(
|
|
118
|
+
'--dump-eval-details',
|
|
119
|
+
help='Whether to dump the evaluation details, including the '
|
|
120
|
+
'correctness of each sample, bpb, etc.',
|
|
121
|
+
action='store_true',
|
|
122
|
+
)
|
|
123
|
+
# set srun args
|
|
124
|
+
slurm_parser = parser.add_argument_group('slurm_args')
|
|
125
|
+
parse_slurm_args(slurm_parser)
|
|
126
|
+
# set dlc args
|
|
127
|
+
dlc_parser = parser.add_argument_group('dlc_args')
|
|
128
|
+
parse_dlc_args(dlc_parser)
|
|
129
|
+
# set hf args
|
|
130
|
+
hf_parser = parser.add_argument_group('hf_args')
|
|
131
|
+
parse_hf_args(hf_parser)
|
|
132
|
+
# set custom dataset args
|
|
133
|
+
custom_dataset_parser = parser.add_argument_group('custom_dataset_args')
|
|
134
|
+
parse_custom_dataset_args(custom_dataset_parser)
|
|
135
|
+
args = parser.parse_args()
|
|
136
|
+
if args.slurm:
|
|
137
|
+
assert args.partition is not None, (
|
|
138
|
+
'--partition(-p) must be set if you want to use slurm')
|
|
139
|
+
if args.dlc:
|
|
140
|
+
assert os.path.exists(args.aliyun_cfg), (
|
|
141
|
+
'When launching tasks using dlc, it needs to be configured '
|
|
142
|
+
'in "~/.aliyun.cfg", or use "--aliyun-cfg $ALiYun-CFG_Path"'
|
|
143
|
+
' to specify a new path.')
|
|
144
|
+
return args
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def parse_slurm_args(slurm_parser):
|
|
148
|
+
"""These args are all for slurm launch."""
|
|
149
|
+
slurm_parser.add_argument('-p',
|
|
150
|
+
'--partition',
|
|
151
|
+
help='Slurm partition name',
|
|
152
|
+
default=None,
|
|
153
|
+
type=str)
|
|
154
|
+
slurm_parser.add_argument('-q',
|
|
155
|
+
'--quotatype',
|
|
156
|
+
help='Slurm quota type',
|
|
157
|
+
default=None,
|
|
158
|
+
type=str)
|
|
159
|
+
slurm_parser.add_argument('--qos',
|
|
160
|
+
help='Slurm quality of service',
|
|
161
|
+
default=None,
|
|
162
|
+
type=str)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def parse_dlc_args(dlc_parser):
|
|
166
|
+
"""These args are all for dlc launch."""
|
|
167
|
+
dlc_parser.add_argument('--aliyun-cfg',
|
|
168
|
+
help='The config path for aliyun config',
|
|
169
|
+
default='~/.aliyun.cfg',
|
|
170
|
+
type=str)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def parse_hf_args(hf_parser):
|
|
176
|
+
"""These args are all for the quick construction of HuggingFace models."""
|
|
177
|
+
hf_parser.add_argument('--hf-type', type=str, choices=['base', 'chat'], default='chat', help='The type of the HuggingFace model, base or chat')
|
|
178
|
+
hf_parser.add_argument('--hf-path', type=str, help='The path to the HuggingFace model, e.g. "facebook/opt-125m", required')
|
|
179
|
+
hf_parser.add_argument('--model-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the HuggingFace model')
|
|
180
|
+
hf_parser.add_argument('--tokenizer-path', type=str, help='The path to the HuggingFace tokenizer, same as --hf-path if not specified')
|
|
181
|
+
hf_parser.add_argument('--tokenizer-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the tokenizer')
|
|
182
|
+
hf_parser.add_argument('--peft-path', type=str, help='The path to the PEFT model')
|
|
183
|
+
hf_parser.add_argument('--peft-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the PEFT model')
|
|
184
|
+
hf_parser.add_argument('--generation-kwargs', nargs='+', action=DictAction, default={}, help='The kwargs for the generation')
|
|
185
|
+
hf_parser.add_argument('--max-seq-len', type=int, help='The max sequence length for the HuggingFace model')
|
|
186
|
+
hf_parser.add_argument('--max-out-len', type=int, default=256, help='The max output length for the HuggingFace model')
|
|
187
|
+
hf_parser.add_argument('--min-out-len', type=int, default=1, help='The min output length for the HuggingFace model')
|
|
188
|
+
hf_parser.add_argument('--batch-size', type=int, default=8, help='The batch size for the HuggingFace model')
|
|
189
|
+
hf_parser.add_argument('--num-gpus', type=int, default=None, help='Deprecated, please use --hf-num-gpus instead')
|
|
190
|
+
hf_parser.add_argument('--hf-num-gpus', type=int, default=1, help='The number of GPUs for the HuggingFace model passed via cli')
|
|
191
|
+
hf_parser.add_argument('--pad-token-id', type=int, help='The pad token id for the HuggingFace model')
|
|
192
|
+
hf_parser.add_argument('--stop-words', nargs='+', default=[], help='The stop words for the HuggingFace model')
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def parse_custom_dataset_args(custom_dataset_parser):
    """Register the options used for the quick construction of custom datasets.

    Args:
        custom_dataset_parser: Object exposing ``add_argument`` (a parser or
            an argument group). Four string-valued ``--custom-dataset-*``
            options are added to it in place; nothing is returned.
    """
    # (flag, allowed values) pairs; ``None`` means the value is unrestricted.
    option_table = (
        ('--custom-dataset-path', None),
        ('--custom-dataset-meta-path', None),
        ('--custom-dataset-data-type', ['mcq', 'qa']),
        ('--custom-dataset-infer-method', ['gen', 'ppl']),
    )
    for flag, allowed_values in option_table:
        # Only pass ``choices`` for the options that actually restrict values.
        extra_kwargs = {}
        if allowed_values is not None:
            extra_kwargs['choices'] = allowed_values
        custom_dataset_parser.add_argument(flag, type=str, **extra_kwargs)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _apply_runner_overrides(runner_cfg, args, lark_bot_url, logger):
    """Copy runtime command-line overrides onto one stage's runner config.

    Args:
        runner_cfg: The ``runner`` sub-config of the ``infer`` or ``eval``
            stage; it is modified in place.
        args: Parsed command-line arguments; ``partition``, ``quotatype``,
            ``debug`` and ``lark`` are read.
        lark_bot_url: Value of ``cfg['lark_bot_url']`` forwarded to the
            runner when ``--lark`` is set.
        logger: Logger used to report an ignored ``--partition``.
    """
    if args.partition is not None:
        if RUNNERS.get(runner_cfg.type) == SlurmRunner:
            runner_cfg.partition = args.partition
            runner_cfg.quotatype = args.quotatype
        else:
            # A partition was requested but the runner is not SlurmRunner.
            logger.warning('SlurmRunner is not used, so the partition '
                           'argument is ignored.')
    if args.debug:
        runner_cfg.debug = True
    if args.lark:
        runner_cfg.lark_bot_url = lark_bot_url


def main():
    """Command-line entry point: run inference, evaluation and summarization.

    The stages executed depend on ``args.mode``: ``infer`` and ``eval`` run
    their respective stage, ``viz`` only summarizes, and ``all`` runs
    everything. With ``--dry-run`` the function returns right after the first
    executed stage has partitioned its tasks, without launching them.

    Raises:
        ValueError: If the deprecated ``--num-gpus`` option is given, or if
            ``eval``/``viz`` mode is requested without ``-r``/``--reuse``.
    """
    args = parse_args()

    if args.num_gpus is not None:
        raise ValueError('The `--num-gpus` argument is deprecated, please use '
                         '`--hf-num-gpus` to describe number of gpus used for '
                         'the HuggingFace model instead.')

    # A dry run always implies debug mode.
    if args.dry_run:
        args.debug = True
    # initialize logger
    logger = get_logger(log_level='DEBUG' if args.debug else 'INFO')

    cfg = get_config_from_arg(args)
    if args.work_dir is not None:
        cfg['work_dir'] = args.work_dir
    else:
        cfg.setdefault('work_dir', osp.join('outputs', 'default'))

    # cfg_time_str is always the current time; dir_time_str is replaced by the
    # name of the reused experiment folder when --reuse takes effect.
    cfg_time_str = dir_time_str = datetime.now().strftime('%Y%m%d_%H%M%S')
    if args.reuse:
        reused_dir = None
        if args.reuse == 'latest':
            # List the work dir once; a missing dir counts as "nothing there".
            previous_runs = (os.listdir(cfg.work_dir)
                             if osp.exists(cfg.work_dir) else [])
            if previous_runs:
                reused_dir = sorted(previous_runs)[-1]
            else:
                logger.warning('No previous results to reuse!')
        else:
            reused_dir = args.reuse
        # Only announce a reuse when a previous folder is really picked up.
        if reused_dir is not None:
            dir_time_str = reused_dir
            logger.info(f'Reusing experiments from {dir_time_str}')
    elif args.mode in ['eval', 'viz']:
        raise ValueError('You must specify -r or --reuse when running in eval '
                         'or viz mode!')

    # update "actual" work_dir
    cfg['work_dir'] = osp.join(cfg.work_dir, dir_time_str)
    current_workdir = cfg['work_dir']
    logger.info(f'Current exp folder: {current_workdir}')

    os.makedirs(osp.join(cfg.work_dir, 'configs'), exist_ok=True)

    # dump config
    output_config_path = osp.join(cfg.work_dir, 'configs',
                                  f'{cfg_time_str}_{os.getpid()}.py')
    cfg.dump(output_config_path)
    # The config is intentionally reloaded from the dumped file here, to
    # avoid keeping initialized types that cannot be serialized.
    cfg = Config.fromfile(output_config_path, format_python_code=False)

    # report to lark bot if specify --lark
    if not args.lark:
        cfg['lark_bot_url'] = None
    elif cfg.get('lark_bot_url', None):
        content = f'{getpass.getuser()}\'s task has been launched!'
        LarkReporter(cfg['lark_bot_url']).post(content)

    # infer
    if args.mode in ['all', 'infer']:
        # When user have specified --slurm or --dlc, or have not set
        # "infer" in config, we will provide a default configuration
        # for infer
        if (args.dlc or args.slurm) and cfg.get('infer', None):
            logger.warning('You have set "infer" in the config, but '
                           'also specified --slurm or --dlc. '
                           'The "infer" configuration will be overridden by '
                           'your runtime arguments.')

        if args.dlc or args.slurm or cfg.get('infer', None) is None:
            fill_infer_cfg(cfg, args)

        _apply_runner_overrides(cfg.infer.runner, args, cfg['lark_bot_url'],
                                logger)
        cfg.infer.partitioner['out_dir'] = osp.join(cfg['work_dir'],
                                                    'predictions/')
        partitioner = PARTITIONERS.build(cfg.infer.partitioner)
        tasks = partitioner(cfg)
        if args.dry_run:
            return
        runner = RUNNERS.build(cfg.infer.runner)
        # Add extra attack config if exists
        if hasattr(cfg, 'attack'):
            for task in tasks:
                cfg.attack.dataset = task.datasets[0][0].abbr
                task.attack = cfg.attack
        runner(tasks)

    # evaluate
    if args.mode in ['all', 'eval']:
        # When user have specified --slurm or --dlc, or have not set
        # "eval" in config, we will provide a default configuration
        # for eval
        if (args.dlc or args.slurm) and cfg.get('eval', None):
            logger.warning('You have set "eval" in the config, but '
                           'also specified --slurm or --dlc. '
                           'The "eval" configuration will be overridden by '
                           'your runtime arguments.')

        if args.dlc or args.slurm or cfg.get('eval', None) is None:
            fill_eval_cfg(cfg, args)
        if args.dump_eval_details:
            cfg.eval.runner.task.dump_details = True

        _apply_runner_overrides(cfg.eval.runner, args, cfg['lark_bot_url'],
                                logger)
        cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'], 'results/')
        partitioner = PARTITIONERS.build(cfg.eval.partitioner)
        tasks = partitioner(cfg)
        if args.dry_run:
            return
        runner = RUNNERS.build(cfg.eval.runner)

        # For meta-review-judge in subjective evaluation: the partitioner may
        # return a list of task lists, which are run one part after another.
        if isinstance(tasks, list) and len(tasks) != 0 and isinstance(
                tasks[0], list):
            for task_part in tasks:
                runner(task_part)
        else:
            runner(tasks)

    # visualize
    if args.mode in ['all', 'eval', 'viz']:
        summarizer_cfg = cfg.get('summarizer', {})

        # For subjective summarizer
        if summarizer_cfg.get('function', None):
            main_summarizer_cfg = copy.deepcopy(summarizer_cfg)
            # Group datasets by the part of their abbr before the first '_'.
            grouped_datasets = {}
            for dataset in cfg.datasets:
                prefix = dataset['abbr'].split('_')[0]
                grouped_datasets.setdefault(prefix, []).append(dataset)
            # Summarize each group with the summarizer named by its first
            # dataset, collecting every non-empty score.
            dataset_score_container = []
            for dataset_group in grouped_datasets.values():
                temp_cfg = copy.deepcopy(cfg)
                temp_cfg.datasets = dataset_group
                group_summarizer_cfg = dict(
                    type=dataset_group[0]['summarizer']['type'],
                    config=temp_cfg)
                summarizer = build_from_cfg(group_summarizer_cfg)
                dataset_score = summarizer.summarize(time_str=cfg_time_str)
                if dataset_score:
                    dataset_score_container.append(dataset_score)
            main_summarizer_cfg['config'] = cfg
            main_summarizer = build_from_cfg(main_summarizer_cfg)
            main_summarizer.summarize(
                time_str=cfg_time_str,
                subjective_scores=dataset_score_container)
        else:
            # Fall back to DefaultSummarizer when no summarizer type is set.
            if not summarizer_cfg or summarizer_cfg.get('type', None) is None:
                summarizer_cfg['type'] = DefaultSummarizer
            summarizer_cfg['config'] = cfg
            summarizer = build_from_cfg(summarizer_cfg)
            summarizer.summarize(time_str=cfg_time_str)
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
|
|
@@ -91,34 +91,51 @@ class GaokaoBenchEvaluator(BaseEvaluator):
|
|
|
91
91
|
]:
|
|
92
92
|
return {'score': 0}
|
|
93
93
|
elif self.question_type == 'multi_choice':
|
|
94
|
+
details = {}
|
|
94
95
|
correct_score, total_score = 0, 0
|
|
95
|
-
for pred, refr in zip(predictions, references):
|
|
96
|
+
for index, (pred, refr) in enumerate(zip(predictions, references)):
|
|
96
97
|
pred = self.do_predictions_postprocess(pred)
|
|
97
98
|
pred = self.ensure_same_length(pred, refr)
|
|
99
|
+
is_corrects = []
|
|
98
100
|
for p, r in zip(pred, refr):
|
|
99
101
|
if p == r:
|
|
100
102
|
correct_score += 2
|
|
103
|
+
is_corrects.append(True)
|
|
101
104
|
else:
|
|
102
105
|
for i in p:
|
|
103
106
|
if i not in r:
|
|
104
107
|
break
|
|
105
108
|
else:
|
|
106
109
|
correct_score += 1
|
|
110
|
+
is_corrects.append(False)
|
|
107
111
|
total_score += 2
|
|
108
|
-
|
|
112
|
+
details[str(index)] = {
|
|
113
|
+
'pred': pred,
|
|
114
|
+
'refr': refr,
|
|
115
|
+
'is_correct': all(is_corrects),
|
|
116
|
+
}
|
|
117
|
+
|
|
109
118
|
else:
|
|
119
|
+
details = {}
|
|
110
120
|
correct_score, total_score = 0, 0
|
|
111
|
-
for pred, refr in zip(predictions, references):
|
|
121
|
+
for index, (pred, refr) in enumerate(zip(predictions, references)):
|
|
112
122
|
if self.question_type == 'multi_question_choice':
|
|
113
123
|
pred = self.do_predictions_postprocess(pred, len(refr))
|
|
114
124
|
else:
|
|
115
125
|
pred = self.do_predictions_postprocess(pred)
|
|
116
126
|
pred = self.ensure_same_length(pred, refr)
|
|
127
|
+
is_corrects = []
|
|
117
128
|
for p, r in zip(pred, refr):
|
|
118
|
-
|
|
119
|
-
|
|
129
|
+
is_correct = p == r
|
|
130
|
+
correct_score += is_correct
|
|
120
131
|
total_score += 1
|
|
121
|
-
|
|
132
|
+
is_corrects.append(is_correct)
|
|
133
|
+
details[str(index)] = {
|
|
134
|
+
'pred': pred,
|
|
135
|
+
'refr': refr,
|
|
136
|
+
'is_correct': all(is_corrects),
|
|
137
|
+
}
|
|
138
|
+
return {'score': correct_score / total_score * 100, 'details': details}
|
|
122
139
|
|
|
123
140
|
|
|
124
141
|
for question_type in valid_gaokao_bench_question_types:
|
|
File without changes
|