evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/config.py
CHANGED
@@ -1,85 +1,253 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
+# flake8: noqa: E501
 import copy
-import json
 import os
 from argparse import Namespace
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union
 
-from evalscope.
-from evalscope.
-
-
+from evalscope.api.model import GenerateConfig, Model, ModelAPI
+from evalscope.constants import (
+    DEFAULT_DATASET_CACHE_DIR,
+    DEFAULT_WORK_DIR,
+    EvalBackend,
+    EvalType,
+    HubType,
+    JudgeStrategy,
+    ModelTask,
+)
+from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
+from evalscope.utils.deprecation_utils import deprecated_warning
+from evalscope.utils.import_utils import check_import
+from evalscope.utils.io_utils import dict_to_yaml, gen_hash, safe_filename
 from evalscope.utils.logger import get_logger
+from evalscope.version import __version__ as evalscope_version
 
 logger = get_logger()
 
-cur_path = os.path.dirname(os.path.abspath(__file__))
-
-DEFAULT_MODEL_ARGS = {'revision': 'master', 'precision': 'torch.float16', 'device': 'auto'}
-DEFAULT_GENERATION_CONFIG = {
-    'max_length': 2048,
-    'max_new_tokens': 512,
-    'do_sample': False,
-    'top_k': 50,
-    'top_p': 1.0,
-    'temperature': 1.0,
-}
-
 
 @dataclass
-class TaskConfig:
+class TaskConfig(BaseArgument):
     # Model-related arguments
-    model: Union[str,
+    model: Optional[Union[str, Model, ModelAPI]] = None
+    """The model to be evaluated. Can be a string path, Model object, or ModelAPI object."""
+
     model_id: Optional[str] = None
-
+    """Unique identifier for the model. Auto-generated from model name if not provided."""
+
+    model_args: Dict = field(default_factory=dict)
+    """Additional arguments to pass to the model during initialization."""
+
+    model_task: str = ModelTask.TEXT_GENERATION
+    """The type of task the model performs (e.g., text generation, image generation)."""
 
     # Template-related arguments
-    template_type: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
     chat_template: Optional[str] = None
+    """Chat template to use for formatting conversations with the model."""
 
     # Dataset-related arguments
     datasets: List[str] = field(default_factory=list)
+    """List of dataset names to evaluate the model on."""
+
     dataset_args: Dict = field(default_factory=dict)
+    """Additional arguments to pass to datasets during loading."""
+
     dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
+    """Directory where datasets are cached locally."""
+
     dataset_hub: str = HubType.MODELSCOPE
+    """Hub platform to download datasets from (e.g., ModelScope, HuggingFace)."""
+
+    repeats: int = 1
+    """Number of times to repeat the dataset items for k-metrics evaluation."""
 
     # Generation configuration arguments
-    generation_config:
+    generation_config: Union[Dict, GenerateConfig] = field(default_factory=dict)
+    """Configuration parameters for text/image generation."""
 
     # Evaluation-related arguments
     eval_type: str = EvalType.CHECKPOINT
+    """Type of evaluation: checkpoint, service, or mock."""
+
     eval_backend: str = EvalBackend.NATIVE
+    """Backend framework to use for evaluation."""
+
     eval_config: Union[str, Dict, None] = None
-
-
+    """Additional evaluation configuration parameters."""
+
+    limit: Optional[Union[int, float]] = None
+    """Maximum number of samples to evaluate. Can be int (count) or float (fraction)."""
+
+    eval_batch_size: int = 1
+    """Batch size for evaluation processing."""
 
     # Cache and working directory arguments
-    mem_cache: bool = False  # Deprecated, will be removed in v1.0.0.
     use_cache: Optional[str] = None
+    """Whether to use cached results and which cache strategy to apply."""
+
+    rerun_review: bool = False
+    """Whether to rerun the review process even if results exist."""
+
     work_dir: str = DEFAULT_WORK_DIR
-
+    """Working directory for storing evaluation results and temporary files."""
 
     # Debug and runtime mode arguments
+    ignore_errors: bool = False
+    """Whether to continue evaluation when encountering errors."""
+
     debug: bool = False
-
+    """Enable debug mode for detailed logging and error reporting."""
+
     seed: Optional[int] = 42
-
-
+    """Random seed for reproducible results."""
+
+    api_url: Optional[str] = None
+    """API endpoint URL for server-based model evaluation."""
+
+    api_key: Optional[str] = 'EMPTY'
+    """API key for authenticating with server-based models."""
+
+    timeout: Optional[float] = None
+    """Request timeout in seconds for server-based models."""
+
+    stream: Optional[bool] = None
+    """Whether to use streaming responses for server-based models."""
+
+    # LLMJudge arguments
+    judge_strategy: str = JudgeStrategy.AUTO
+    """Strategy for LLM-based judgment (auto, single, pairwise)."""
+
+    judge_worker_num: int = 1
+    """Number of worker processes for parallel LLM judging."""
+
+    judge_model_args: Optional[Dict] = field(default_factory=dict)
+    """Additional arguments for the judge model configuration."""
+
+    analysis_report: bool = False
+    """Whether to generate detailed analysis reports after evaluation."""
+
+    # Sandbox configuration arguments
+    use_sandbox: bool = False
+    """Whether to execute code in a sandboxed environment."""
+
+    sandbox_type: Optional[str] = 'docker'
+    """Type of sandbox environment for code execution (e.g., docker). Default is 'docker'."""
+
+    sandbox_manager_config: Optional[Dict] = field(default_factory=dict)
+    """Configuration for the sandbox manager. Default is local manager. If url is provided, it will use remote manager."""
+
+    sandbox_config: Optional[Dict] = field(default_factory=dict)
+    """Configuration for sandboxed code execution environments."""
+
+    evalscope_version: Optional[str] = evalscope_version
+    """EvalScope version used for the evaluation."""
 
     def __post_init__(self):
-
-
-
+        self.__init_model_and_id()
+
+        self.__init_eval_data_config()
+
+        # Set default generation_config and model_args
+        self.__init_default_generation_config()
+        self.__init_default_model_args()
+        self.__init_default_sandbox_config()
+
+    def __init_model_and_id(self):
+        # Set model to DummyCustomModel if not provided
+        if self.model is None:
+            self.model = self.model_task
+            self.eval_type = EvalType.MOCK_LLM
+
+        # Set model_id if not provided
+        if not self.model_id:
+            if isinstance(self.model, str):
+                self.model_id = safe_filename(os.path.basename(self.model))
+            elif isinstance(self.model, Model):
+                self.model_id = safe_filename(self.model.name)
+            elif isinstance(self.model, ModelAPI):
+                self.model_id = safe_filename(self.model.model_name)
             else:
-                self.model_id =
-
-    def
-
+                self.model_id = 'dummy_model'
+
+    def __init_eval_data_config(self):
+        # Post process limit
+        if self.limit is not None:
+            self.limit = parse_int_or_float(self.limit)
+
+    def __init_default_generation_config(self):
+        if not self.generation_config:
+            if self.model_task == ModelTask.IMAGE_GENERATION:
+                self.generation_config = {
+                    'height': 1024,
+                    'width': 1024,
+                    'num_inference_steps': 50,
+                    'guidance_scale': 9.0,
+                }
+                if self.eval_batch_size != 1:
+                    logger.warning(
+                        'For image generation task, we only support eval_batch_size=1 for now, changed to 1.'
+                    )
+                    self.eval_batch_size = 1
+            elif self.model_task == ModelTask.TEXT_GENERATION:
+                if self.eval_type == EvalType.CHECKPOINT:
+                    self.generation_config = {
+                        'max_tokens': 2048,
+                        'do_sample': False,
+                        'top_k': 50,
+                        'top_p': 1.0,
+                        'temperature': 1.0,
+                        'n': 1,
+                    }
+                elif self.eval_type == EvalType.SERVICE:
+                    self.generation_config = {
+                        'temperature': 0.0,
+                    }
+        if isinstance(self.generation_config, dict):
+            self.generation_config = GenerateConfig.model_validate(self.generation_config)
+
+        # Set eval_batch_size to generation_config.batch_size
+        self.generation_config.batch_size = self.eval_batch_size
+
+        # Set default values for generation_config
+        if self.timeout is not None:
+            deprecated_warning(
+                logger,
+                'The `timeout` parameter is deprecated and will be removed in v2.0.0. Use `generation_config.timeout` instead.'
+            )
+            self.generation_config.timeout = self.timeout
+
+        if self.stream is not None:
+            deprecated_warning(
+                logger,
+                'The `stream` parameter is deprecated and will be removed in v2.0.0. Use `generation_config.stream` instead.'
+            )
+            self.generation_config.stream = self.stream
+
+        if self.generation_config.n is not None and self.generation_config.n > 1:
+            self.repeats = self.generation_config.n
+            self.generation_config.n = 1
+            deprecated_warning(
+                logger,
+                'The `n` parameter in generation_config is deprecated and will be removed in v2.0.0. Use `TaskConfig.repeats` instead.'
+            )
+
+    def __init_default_model_args(self):
+        if self.model_args:
+            return
+        if self.model_task == ModelTask.TEXT_GENERATION:
+            if self.eval_type == EvalType.CHECKPOINT:
+                self.model_args = {
+                    'revision': 'master',
+                    'precision': 'torch.float16',
+                }
+
+    def __init_default_sandbox_config(self):
+        if not self.use_sandbox:
+            return
+        check_import('ms_enclave', 'ms_enclave[docker]', raise_error=True)
 
-
-
+        if not self.sandbox_type:
+            self.sandbox_type = 'docker'
 
     def update(self, other: Union['TaskConfig', dict]):
         if isinstance(other, TaskConfig):
@@ -95,91 +263,16 @@ class TaskConfig:
         except Exception as e:
             logger.warning(f'Failed to dump overall task config: {e}')
 
-
-
-
-
-    @staticmethod
-    def from_yaml(yaml_file: str):
-        return TaskConfig.from_dict(yaml_to_dict(yaml_file))
-
-    @staticmethod
-    def from_dict(d: dict):
-        return TaskConfig(**d)
-
-    @staticmethod
-    def from_json(json_file: str):
-        return TaskConfig.from_dict(json_to_dict(json_file))
-
-    @staticmethod
-    def from_args(args: Namespace):
-        # Convert Namespace to a dictionary and filter out None values
-        args_dict = {k: v for k, v in vars(args).items() if v is not None}
-
-        if 'func' in args_dict:
-            del args_dict['func']  # Note: compat CLI arguments
-
-        return TaskConfig.from_dict(args_dict)
-
-    @staticmethod
-    def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']:
-        res_list = []
-        for task_name in tasks:
-            task = registry_tasks.get(task_name, None)
-            if task is None:
-                logger.error(f'No task found in tasks: {list(registry_tasks.keys())}, got task_name: {task_name}')
-                continue
-
-            task.model = custom_model
-            task.model_args = custom_model.config
-            task.model_id = type(custom_model).__name__
-            res_list.append(task)
-
-        return res_list
-
-    @staticmethod
-    def registry(name: str, data_pattern: str, dataset_dir: str = None, subset_list: list = None) -> None:
-        """
-        Register a new task (dataset) for evaluation.
-
-        Args:
-            name: str, the dataset name.
-            data_pattern: str, the data pattern for the task.
-                e.g. `mmlu`, `ceval`, `gsm8k`, ...
-                refer to task_config.list() for all available datasets.
-            dataset_dir: str, the directory to store multiple datasets files. e.g. /path/to/data,
-                then your specific custom dataset directory will be /path/to/data/{name}
-            subset_list: list, the subset list for the dataset.
-                e.g. ['middle_school_politics', 'operating_system']
-                refer to the mmlu for example. https://github.com/hendrycks/test/blob/master/categories.py
-        """
-        available_datasets = list(registry_tasks.keys())
-        if data_pattern not in available_datasets:
-            logger.error(
-                f'No dataset found in available datasets: {available_datasets}, got data_pattern: {data_pattern}')
-            return
-
-        # Reuse the existing task config and update the datasets
-        pattern_config = registry_tasks[data_pattern]
-
-        custom_config = copy.deepcopy(pattern_config)
-        custom_config.datasets = [data_pattern]
-        custom_config.dataset_args = {data_pattern: {}}
-        custom_config.eval_type = EvalType.CHECKPOINT
-
-        if dataset_dir is not None:
-            custom_config.dataset_args[data_pattern].update({'local_path': dataset_dir})
-
-        if subset_list is not None:
-            custom_config.dataset_args[data_pattern].update({'subset_list': subset_list})
-
-        registry_tasks.update({name: custom_config})
-        logger.info(f'** Registered task: {name} with data pattern: {data_pattern}')
-
+    def to_dict(self):
+        result = copy.copy(self.__dict__)
+        del result['api_key']  # Do not expose api_key in the config
 
-
+        if isinstance(self.model, (Model, ModelAPI)):
+            result['model'] = self.model.__class__.__name__
 
-
+        if isinstance(self.generation_config, GenerateConfig):
+            result['generation_config'] = self.generation_config.model_dump(exclude_unset=True)
+        return result
 
 
 def parse_task_config(task_cfg) -> TaskConfig:
@@ -193,36 +286,14 @@ def parse_task_config(task_cfg) -> TaskConfig:
         logger.info('Args: Task config is provided with CommandLine type.')
         task_cfg = TaskConfig.from_args(task_cfg)
     elif isinstance(task_cfg, str):
-        extension =
+        extension = os.path.splitext(task_cfg)[-1]
        logger.info(f'Args: Task config is provided with {extension} file type.')
-        if extension in ['yaml', 'yml']:
+        if extension in ['.yaml', '.yml']:
            task_cfg = TaskConfig.from_yaml(task_cfg)
-        elif extension == 'json':
+        elif extension == '.json':
            task_cfg = TaskConfig.from_json(task_cfg)
        else:
            raise ValueError('Args: Unsupported file extension.')
    else:
        raise ValueError('Args: Please provide a valid task config.')
    return task_cfg
-
-
-class TempModel(CustomModel):
-
-    def __init__(self, config: dict):
-        super().__init__(config=config)
-
-    def predict(self, prompts: str, **kwargs):
-        return [item + ': response' for item in prompts]
-
-
-if __name__ == '__main__':
-    model = TempModel(config={'model_id': 'test-swift-dummy-model'})
-    task_config = TaskConfig()
-
-    # Register a new task
-    TaskConfig.registry(name='arc_swift', data_pattern='arc', dataset_dir='/path/to/swift_custom_work')
-
-    swift_eval_task: List[TaskConfig] = TaskConfig.load(custom_model=model, tasks=['gsm8k', 'arc', 'arc_swift'])
-    for item in swift_eval_task:
-        print(item)
-        print()
evalscope/constants.py
CHANGED
@@ -1,12 +1,22 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+# flake8: noqa
+import os
+
+os.environ['MODELSCOPE_LOG_LEVEL'] = '40'  # Set default log level to ERROR
+
 from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
 from modelscope.utils.file_utils import get_dataset_cache_root, get_model_cache_root
 
 DEFAULT_WORK_DIR = './outputs'
 DEFAULT_MODEL_REVISION = DEFAULT_REPOSITORY_REVISION  # master
-DEFAULT_MODEL_CACHE_DIR = get_model_cache_root()  # ~/.cache/modelscope/hub
-DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root()  # ~/.cache/modelscope/datasets
+DEFAULT_MODEL_CACHE_DIR = get_model_cache_root()  # ~/.cache/modelscope/hub/models
+DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root()  # ~/.cache/modelscope/hub/datasets
 DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR  # compatible with old version
+DEFAULT_EVALSCOPE_CACHE_DIR = os.path.expanduser(
+    os.getenv('EVALSCOPE_CACHE', '~/.cache/evalscope')
+)  # ~/.cache/evalscope
+IS_BUILD_DOC = os.getenv('BUILD_DOC', '0') == '1'  # To avoid some heavy dependencies when building doc
+HEARTBEAT_INTERVAL_SEC = 60  # 60 seconds
 
 
 class HubType:
@@ -36,47 +46,17 @@ class MetricsConstant:
     ]
 
 
-class MetricMembers:
-
-    # Math accuracy metric
-    MATH_ACCURACY = 'math_accuracy'
-
-    # Code pass@k metric
-    CODE_PASS_K = 'code_pass_k'
-
-    # Code rouge metric
-    ROUGE = 'rouge'
-
-    # ELO rating system for pairwise comparison
-    ELO = 'elo'
-
-    # Pairwise comparison win/lose and tie(optional)
-    PAIRWISE = 'pairwise'
-
-    # Rating score for single model
-    SCORE = 'score'
-
-
 class ArenaWinner:
 
     MODEL_A = 'model_a'
-
     MODEL_B = 'model_b'
-
     TIE = 'tie'
-
     TIE_BOTH_BAD = 'tie_both_bad'
-
     UNKNOWN = 'unknown'
 
 
-class ArenaMode:
-    SINGLE = 'single'
-    PAIRWISE = 'pairwise'
-    PAIRWISE_BASELINE = 'pairwise_baseline'
-
-
 class AnswerKeys:
+    INDEX = 'index'
     ANSWER_ID = 'answer_id'
     RAW_INPUT = 'raw_input'
     ORIGIN_PROMPT = 'origin_prompt'
@@ -85,58 +65,22 @@ class AnswerKeys:
     CHOICES = 'choices'
 
 
-class ReviewKeys:
-    REVIEW_ID = 'review_id'
-    REVIEWED = 'reviewed'
-    REVIEWER_SPEC = 'reviewer_spec'
-    REVIEW_TIME = 'review_time'
-    MESSAGE = 'message'
-    CONTENT = 'content'
-    GOLD = 'gold'
-    PRED = 'pred'
-    RESULT = 'result'
-    REVIEW = 'review'
-
-
-class EvalConfigKeys:
-    CLASS_REF = 'ref'
-    CLASS_ARGS = 'args'
-    ENABLE = 'enable'
-    POSITION_BIAS_MITIGATION = 'position_bias_mitigation'
-    RANDOM_SEED = 'random_seed'
-    FN_COMPLETION_PARSER = 'fn_completion_parser'
-    COMPLETION_PARSER_KWARGS = 'completion_parser_kwargs'
-    OUTPUT_FILE = 'output_file'
-    MODEL_ID_OR_PATH = 'model_id_or_path'
-    MODEL_REVISION = 'revision'
-    GENERATION_CONFIG = 'generation_config'
-    PRECISION = 'precision'
-    TEMPLATE_TYPE = 'template_type'
-
-
-class FnCompletionParser:
-    LMSYS_PARSER: str = 'lmsys_parser'
-    RANKING_PARSER: str = 'ranking_parser'
-
-
-class PositionBiasMitigation:
-    NONE = 'none'
-    RANDOMIZE_ORDER = 'randomize_order'
-    SWAP_POSITION = 'swap_position'
-
-
-class EvalStage:
-    # Enums: `all`, `infer`, `review`
-    ALL = 'all'
-    INFER = 'infer'
-    REVIEW = 'review'
-
-
 class EvalType:
 
     CUSTOM = 'custom'
-
-
+    MOCK_LLM = 'mock_llm'
+    CHECKPOINT = 'llm_ckpt'  # native model checkpoint
+    SERVICE = 'openai_api'  # model service
+    TEXT2IMAGE = 'text2image'  # image generation service
+    IMAGE_EDITING = 'image_editing'  # image editing service
+
+
+class OutputType:
+    LOGITS = 'logits'  # for logits output tasks
+    GENERATION = 'generation'  # for text generation tasks and general tasks
+    MULTIPLE_CHOICE = 'multiple_choice_logits'  # for multiple choice tasks
+    CONTINUOUS = 'continuous_logits'  # for continuous tasks
+    IMAGE_GENERATION = 'image_generation'  # for image generation tasks
 
 
 class EvalBackend:
@@ -149,3 +93,55 @@ class EvalBackend:
 
 class DataCollection:
     NAME = 'data_collection'
+    INFO = 'collection_info'
+
+
+class JudgeStrategy:
+    AUTO = 'auto'
+    RULE = 'rule'
+    LLM = 'llm'
+    LLM_RECALL = 'llm_recall'
+
+
+class JudgeScoreType:
+    NUMERIC = 'numeric'  # numeric score
+    PATTERN = 'pattern'  # pattern matching score
+
+
+class ModelTask:
+    TEXT_GENERATION = 'text_generation'
+    IMAGE_GENERATION = 'image_generation'
+
+
+class Tags:
+    KNOWLEDGE = 'Knowledge'
+    MULTIPLE_CHOICE = 'MCQ'
+    MATH = 'Math'
+    REASONING = 'Reasoning'
+    CODING = 'Coding'
+    CHINESE = 'Chinese'
+    COMMONSENSE = 'Commonsense'
+    QA = 'QA'
+    NER = 'NER'
+    READING_COMPREHENSION = 'ReadingComprehension'
+    CUSTOM = 'Custom'
+    INSTRUCTION_FOLLOWING = 'InstructionFollowing'
+    ARENA = 'Arena'
+    LONG_CONTEXT = 'LongContext'
+    RETRIEVAL = 'Retrieval'
+    FUNCTION_CALLING = 'FunctionCalling'
+    TEXT_TO_IMAGE = 'TextToImage'
+    IMAGE_EDITING = 'ImageEditing'
+    MULTI_MODAL = 'MultiModal'
+    MULTI_LINGUAL = 'MultiLingual'
+    MULTI_TURN = 'MultiTurn'
+    YES_NO = 'Yes/No'
+    HALLUCINATION = 'Hallucination'
+    MEDICAL = 'Medical'
+    AGENT = 'Agent'
+    MT = 'MachineTranslation'
+
+
+class FileConstants:
+    IMAGE_PATH = 'image_path'
+    ID = 'id'
evalscope/evaluator/__init__.py
CHANGED