evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/wmt/wmt24_adapter.py
ADDED
@@ -0,0 +1,294 @@
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, ContentText
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.import_utils import check_import
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+PROMPT_TEMPLATE = """
+Translate the following {source_language} sentence into {target_language}:
+
+{source_language}: {source_text}
+{target_language}:
+""".strip()
+
+LANGUAGE_PAIRS = [
+    'en-ar_eg',
+    'en-ar_sa',
+    'en-bg_bg',
+    'en-bn_in',
+    'en-ca_es',
+    'en-cs_cz',
+    'en-da_dk',
+    'en-de_de',
+    'en-el_gr',
+    'en-es_mx',
+    'en-et_ee',
+    'en-fa_ir',
+    'en-fi_fi',
+    'en-fil_ph',
+    'en-fr_ca',
+    'en-fr_fr',
+    'en-gu_in',
+    'en-he_il',
+    'en-hi_in',
+    'en-hr_hr',
+    'en-hu_hu',
+    'en-id_id',
+    'en-is_is',
+    'en-it_it',
+    'en-ja_jp',
+    'en-kn_in',
+    'en-ko_kr',
+    'en-lt_lt',
+    'en-lv_lv',
+    'en-ml_in',
+    'en-mr_in',
+    'en-nl_nl',
+    'en-no_no',
+    'en-pa_in',
+    'en-pl_pl',
+    'en-pt_br',
+    'en-pt_pt',
+    'en-ro_ro',
+    'en-ru_ru',
+    'en-sk_sk',
+    'en-sl_si',
+    'en-sr_rs',
+    'en-sv_se',
+    'en-sw_ke',
+    'en-sw_tz',
+    'en-ta_in',
+    'en-te_in',
+    'en-th_th',
+    'en-tr_tr',
+    'en-uk_ua',
+    'en-ur_pk',
+    'en-vi_vn',
+    'en-zh_cn',
+    'en-zh_tw',
+    'en-zu_za',
+]
+
+LANGUAGE_BY_CODE = {
+    'ar_eg': 'arabic',
+    'ar_sa': 'arabic',
+    'bg_bg': 'bulgarian',
+    'bn_bd': 'bengali',
+    'bn_in': 'bengali',
+    'ca_es': 'catalan',
+    'cs_cz': 'czech',
+    'da_dk': 'danish',
+    'de_de': 'german',
+    'el_gr': 'greek',
+    'es_mx': 'spanish',
+    'et_ee': 'estonian',
+    'fa_ir': 'farsi',
+    'fi_fi': 'finnish',
+    'fil_ph': 'filipino',
+    'fr_ca': 'french',
+    'fr_fr': 'french',
+    'gu_in': 'gujarati',
+    'he_il': 'hebrew',
+    'hi_in': 'hindi',
+    'hr_hr': 'croatian',
+    'hu_hu': 'hungarian',
+    'id_id': 'indonesian',
+    'is_is': 'icelandic',
+    'it_it': 'italian',
+    'ja_jp': 'japanese',
+    'kn_in': 'kannada',
+    'ko_kr': 'korean',
+    'lt_lt': 'lithuanian',
+    'lv_lv': 'latvian',
+    'ml_in': 'malayalam',
+    'mr_in': 'marathi',
+    'nl_nl': 'dutch',
+    'no_no': 'norwegian',
+    'pa_in': 'punjabi',
+    'pl_pl': 'polish',
+    'pt_br': 'portuguese',
+    'pt_pt': 'portuguese',
+    'ro_ro': 'romanian',
+    'ru_ru': 'russian',
+    'sk_sk': 'slovak',
+    'sl_si': 'slovenian',
+    'sr_rs': 'serbian',
+    'sv_se': 'swedish',
+    'sw_ke': 'swahili',
+    'sw_tz': 'swahili',
+    'ta_in': 'tamil',
+    'te_in': 'telugu',
+    'th_th': 'thai',
+    'tr_tr': 'turkish',
+    'uk_ua': 'ukrainian',
+    'ur_pk': 'urdu',
+    'vi_vn': 'vietnamese',
+    'zh_cn': 'mandarin',
+    'zh_tw': 'mandarin',
+    'zu_za': 'zulu',
+    'en': 'english',
+}
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='wmt24pp',
+        pretty_name='WMT2024++',
+        dataset_id='extraordinarylab/wmt24pp',
+        tags=[Tags.MULTI_LINGUAL, Tags.MT],
+        description=(
+            'WMT2024 news translation benchmark supporting multiple language pairs. '
+            'Each subset represents a specific translation direction'
+        ),
+        subset_list=LANGUAGE_PAIRS,
+        eval_split='test',
+        metric_list={
+            'bleu': {},
+            'bert_score': {
+                'model_id_or_path': 'AI-ModelScope/xlm-roberta-large',
+                'model_type': 'xlm-roberta-large'
+            },
+            'comet': {
+                'model_id_or_path': 'evalscope/wmt22-comet-da',
+            }
+        },
+        few_shot_num=0,
+        prompt_template=PROMPT_TEMPLATE,
+    )
+)
+class WMT24PPAdapter(DefaultDataAdapter):
+
+    def __init__(self, **kwargs: Any) -> None:
+        """Initialize adapter and configure dataset subsets."""
+        super().__init__(**kwargs)
+        self.reformat_subset = True
+        self.use_batch_scoring = True  # Enable batch scoring
+
+        if 'comet' in self.metric_list:
+            check_import('comet', 'unbabel-comet', raise_error=True, feature_name='COMETScore Metric')
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a data record to a Sample object.
+        """
+        source_text = str(record['source'])
+        target_text = str(record['target'])
+        language_pair = str(record['language_pair'])
+        source_language, target_language = language_pair.split('-')
+
+        # Format the generation prompt with the text
+        input_prompt = self.prompt_template.format(
+            source_text=source_text,
+            source_language=LANGUAGE_BY_CODE[source_language],
+            target_language=LANGUAGE_BY_CODE[target_language],
+        )
+
+        # Create content list for the input
+        content_list = [ContentText(text=input_prompt)]
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=target_text,
+            subset_key=language_pair,
+            metadata={
+                'source_text': source_text,
+                'target_text': target_text,
+                'source_language': source_language,
+                'target_language': target_language,
+            },
+        )
+
+    def match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        """Compute per-sample translation metrics."""
+        # Create a Score object for the current sample
+        score = Score(
+            prediction=original_prediction,
+            extracted_prediction=filtered_prediction,
+            value={},
+        )
+
+        # ---- BLEU ----
+        if 'bleu' in self.metric_list:
+            try:
+                from evalscope.metrics import bleu_ngram_one_sample
+
+                bleu_results = bleu_ngram_one_sample(filtered_prediction, reference)
+                score.value.update(bleu_results)
+            except Exception as e:
+                logger.warning(f'[WMT24PPAdapter] BLEU single-sample calculation failed: {e}')
+        return score
+
+    def batch_match_score(
+        self,
+        original_predictions: List[str],
+        filtered_predictions: List[str],
+        references: List[str],
+        task_states: List[TaskState],
+    ) -> List[Score]:
+        """Compute batched translation metrics (BLEU, BERTScore, COMET)."""
+        scores: List[Score] = []
+        for i in range(len(original_predictions)):
+            score = Score(
+                extracted_prediction=filtered_predictions[i],
+                prediction=original_predictions[i],
+                value={},
+            )
+            scores.append(score)
+
+        # ---- BLEU (per-sample within batch) ----
+        if 'bleu' in self.metric_list:
+            try:
+                from evalscope.metrics import bleu_ngram_one_sample
+
+                for i in range(len(scores)):
+                    bleu_results = bleu_ngram_one_sample(filtered_predictions[i], references[i])
+                    scores[i].value.update(bleu_results)
+            except Exception as e:
+                logger.warning(f'[WMT24PPAdapter] BLEU batch calculation failed: {e}')
+
+        # ---- BERTScore ----
+        if 'bert_score' in self.metric_list:
+            try:
+                from evalscope.metrics.metric import BertScore
+
+                score_args = self.metric_list.get('bert_score', {})
+                bert_scorer = BertScore(**score_args)
+                bert_score_f1 = bert_scorer.apply(filtered_predictions, references)
+                for i in range(len(scores)):
+                    scores[i].value.update({'bert_score': bert_score_f1[i]})
+            except Exception as e:
+                logger.warning(f'[WMT24PPAdapter] BERTScore batch calculation failed: {e}')
+
+        # ---- COMET ----
+        if 'comet' in self.metric_list:
+            try:
+                from evalscope.metrics.metric import COMETScore
+
+                score_args = self.metric_list.get('comet', {})
+                comet_scorer = COMETScore(**score_args)
+                data = [{
+                    'src': st.metadata.get('source_text'),
+                    'mt': pred,
+                    'ref': ref
+                } for pred, ref, st in zip(filtered_predictions, references, task_states)]
+                comet_scores = comet_scorer.apply(data)
+                for i in range(len(scores)):
+                    scores[i].value.update({'comet': comet_scores[i]})
+            except Exception as e:
+                logger.warning(f'[WMT24PPAdapter] COMET batch calculation failed: {e}')
+
+        return scores
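For orientation, here is a minimal sketch (not part of this diff) of how the newly registered wmt24pp benchmark could be invoked through the task API shipped in this release; the model id and subset choice are placeholders, and the top-level import path is assumed from the evalscope documentation rather than taken from this diff:

    from evalscope import TaskConfig, run_task  # import path assumed from evalscope docs

    # Run the new WMT24++ translation benchmark on two language-pair subsets.
    # dataset_args mirrors the {name: {'subset_list': [...]}} pattern used in
    # evalscope/collections/schema.py later in this diff.
    task = TaskConfig(
        model='Qwen/Qwen2.5-7B-Instruct',  # placeholder model id
        datasets=['wmt24pp'],
        dataset_args={'wmt24pp': {'subset_list': ['en-de_de', 'en-ja_jp']}},
    )
    run_task(task)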
evalscope/benchmarks/zerobench/__init__.py
File without changes
evalscope/benchmarks/zerobench/zerobench_adapter.py
ADDED
@@ -0,0 +1,64 @@
+# flake8: noqa: E501
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64, compress_image_to_limit
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+# 定义提示模板
+PROMPT_TEMPLATE = """{question}
+\n\n\nLet's think step by step and give the final answer in curly braces,
+like this: {{final answer}}"
+"""
+
+SUBSET_LIST = ['default']
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='zerobench',
+        pretty_name='ZeroBench',
+        dataset_id='evalscope/zerobench',
+        tags=[Tags.KNOWLEDGE, Tags.QA, Tags.MULTI_MODAL],
+        description=
+        'ZeroBench is a challenging visual reasoning benchmark for Large Multimodal Models (LMMs). It consists of a main set of 100 high-quality, manually curated questions covering numerous domains, reasoning types and image type. Questions in ZeroBench have been designed and calibrated to be beyond the capabilities of current frontier models. As such, none of the evaluated models achieves a non-zero pass@1 (with greedy decoding) or 5/5 reliability score.',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='zerobench',
+        train_split='zerobench_subquestions',
+        prompt_template=PROMPT_TEMPLATE,
+    )
+)
+class ZeroBenchAdapter(VisionLanguageAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self._use_llm_judge = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        question = record['question_text']
+        content_list: List[Content] = [ContentText(text=self.prompt_template.format(question=question))]
+        image = record['question_images_decoded']
+        if len(image) > 0:
+            for img in image:
+                # Ensure image is under OpenAI's 10MB data-URI limit by compressing if needed
+                processed_bytes, fmt = compress_image_to_limit(img['bytes'], 10_000_000)
+                image_base64 = bytes_to_base64(processed_bytes, format=fmt, add_header=True)
+                content_list.append(ContentImage(image=image_base64))
+
+        metadata = {
+            'question_id': record['question_id'],
+            'question_images': record['question_images'],
+            'image_attribution': record['image_attribution']
+        }
+
+        return Sample(
+            input=[ChatMessageUser(content=content_list)], target=record['question_answer'], metadata=metadata
+        )
evalscope/cli/cli.py
CHANGED
@@ -2,6 +2,7 @@
 
 import argparse
 
+from evalscope import __version__
 from evalscope.cli.start_app import StartAppCMD
 from evalscope.cli.start_eval import EvalCMD
 from evalscope.cli.start_perf import PerfBenchCMD
@@ -9,6 +10,7 @@ from evalscope.cli.start_perf import PerfBenchCMD
 
 def run_cmd():
     parser = argparse.ArgumentParser('EvalScope Command Line tool', usage='evalscope <command> [<args>]')
+    parser.add_argument('-v', '--version', action='version', version=f'evalscope {__version__}')
     subparsers = parser.add_subparsers(help='EvalScope command line helper.')
 
     PerfBenchCMD.define_args(subparsers)
evalscope/cli/start_app.py
CHANGED
@@ -3,7 +3,6 @@ import os
 from argparse import ArgumentParser
 
 from evalscope.cli.base import CLICommand
-from evalscope.report.app import create_app
 
 
 def subparser_func(args):
@@ -22,8 +21,19 @@ class StartAppCMD(CLICommand):
     def define_args(parsers: ArgumentParser):
         """ define args for create pipeline template command.
        """
+        from evalscope.app import add_argument
+
         parser = parsers.add_parser(StartAppCMD.name)
+        add_argument(parser)
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
-
+        try:
+            from evalscope.app import create_app
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import create_app from evalscope.app, due to {e}. '
+                "Please run `pip install 'evalscope[app]'`."
+            )
+
+        create_app(self.args)
evalscope/cli/start_eval.py
CHANGED
@@ -1,10 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
 from argparse import ArgumentParser
 
-from evalscope.arguments import add_argument
 from evalscope.cli.base import CLICommand
-from evalscope.run import run_task
 
 
 def subparser_func(args):
@@ -23,9 +20,13 @@ class EvalCMD(CLICommand):
     def define_args(parsers: ArgumentParser):
         """ define args for create pipeline template command.
        """
+        from evalscope.arguments import add_argument
+
         parser = parsers.add_parser(EvalCMD.name)
         add_argument(parser)
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
+        from evalscope.run import run_task
+
         run_task(self.args)
evalscope/cli/start_perf.py
CHANGED
@@ -3,8 +3,6 @@ import os
 from argparse import ArgumentParser
 
 from evalscope.cli.base import CLICommand
-from evalscope.perf.arguments import add_argument
-from evalscope.perf.main import run_perf_benchmark
 
 
 def subparser_func(args):
@@ -23,9 +21,19 @@ class PerfBenchCMD(CLICommand):
     def define_args(parsers: ArgumentParser):
         """ define args for create pipeline template command.
        """
+        from evalscope.perf.arguments import add_argument
+
         parser = parsers.add_parser(PerfBenchCMD.name)
         add_argument(parser)
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
+        try:
+            from evalscope.perf.main import run_perf_benchmark
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import run_perf_benchmark from evalscope.perf.main, due to {e}. '
+                "Please run `pip install 'evalscope[perf]'`."
+            )
+
         run_perf_benchmark(self.args)
evalscope/cli/start_server.py
CHANGED
@@ -25,14 +25,16 @@ def add_perf_args(parser):
         '--logdir',
         required=True,
         type=str,
-        help='The monitor log save dir, tensorboard start at this path for display!')
+        help='The monitor log save dir, tensorboard start at this path for display!'
+    )
     parser.add_argument('--host', type=str, default='0.0.0.0', help='The tensorboard host')
     parser.add_argument('--tensorboard-port', type=str, default='6006', help='The tensorboard port')
 
 
 def async_run_command_with_popen(cmd):
     sub_process = subprocess.Popen(
-        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, bufsize=1, universal_newlines=True, encoding='utf8')
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, bufsize=1, universal_newlines=True, encoding='utf8'
+    )
     return sub_process
 
 
@@ -61,7 +63,8 @@ def start_server(args):
         bufsize=1,
         shell=True,
         universal_newlines=True,
-        encoding='utf8')
+        encoding='utf8'
+    )
 
     os.set_blocking(sub_process.stdout.fileno(), False)
     return sub_process
evalscope/collections/__init__.py
CHANGED
@@ -1,3 +1,27 @@
-
-from
-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from evalscope.utils.import_utils import _LazyModule
+
+if TYPE_CHECKING:
+    from .sampler import DatasetEntry, StratifiedSampler, UniformSampler, WeightedSampler
+    from .schema import CollectionSchema, DatasetInfo
+
+else:
+    _import_structure = {
+        'sampler': ['StratifiedSampler', 'UniformSampler', 'WeightedSampler', 'DatasetEntry'],
+        'schema': [
+            'CollectionSchema',
+            'DatasetInfo',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
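The rewritten collections/__init__.py above switches to a lazy module: the public names stay importable, but the sampler and schema submodules are loaded only on first attribute access. A minimal sketch of the intended effect (behavior assumed from the usual _LazyModule pattern, not verified against this release):

    # Works as before; importing the package no longer eagerly loads submodules.
    from evalscope.collections import CollectionSchema, WeightedSampler

    # The .schema submodule is actually imported here, on first use.
    schema = CollectionSchema(name='demo', datasets=[])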
evalscope/collections/sampler.py
CHANGED
@@ -1,18 +1,17 @@
 import random
 from abc import ABC, abstractmethod
-from
+from pydantic import BaseModel, Field
 from tqdm import tqdm
 from typing import List, Optional
 
 from evalscope.collections.schema import CollectionSchema, DatasetInfo
 
 
-
-class DatasetEntry:
+class DatasetEntry(BaseModel):
     index: int = 0
-    prompt: dict =
-    tags: List[str] =
-    categories: List[str] =
+    prompt: dict = Field(default_factory=dict)
+    tags: List[str] = Field(default_factory=list)
+    categories: List[str] = Field(default_factory=list)
     task_type: str = ''
     weight: float = 0.0
     dataset_name: str = ''
@@ -33,25 +32,27 @@ class Sampler(ABC):
         all_data = []
         data_dict = dataset.get_data()
         for subset_name, subset_data in data_dict.items():
-            for
+            for sample in subset_data:
                 all_data.append(
                     DatasetEntry(
-                        prompt=
+                        prompt=sample.model_dump(exclude_none=True),
                         tags=dataset.tags,
                         categories=dataset.hierarchy,
                         task_type=dataset.task_type,
                         weight=dataset.weight,
                         dataset_name=dataset.name,
                         subset_name=subset_name,
-                    )
-
+                    )
+                )
+        count = min(count, len(all_data))  # avoid sampling more than the dataset size
+        sampled_data = random.sample(all_data, k=count)
         return sampled_data
 
     def _update_index(self, all_data: List[DatasetEntry]) -> List[dict]:
         result = []
         for i, entry in enumerate(all_data):
             entry.index = i
-            result.append(
+            result.append(entry.model_dump())
         return result
 
 
evalscope/collections/schema.py
CHANGED
@@ -3,6 +3,10 @@ import json
 from dataclasses import asdict, dataclass, field
 from typing import List, Union
 
+from evalscope.api.dataset import DatasetDict
+from evalscope.api.registry import get_benchmark
+from evalscope.config import TaskConfig
+
 
 @dataclass
 class DatasetInfo:
@@ -13,16 +17,11 @@ class DatasetInfo:
     args: dict = field(default_factory=dict)
     hierarchy: List[str] = field(default_factory=list)
 
-    def get_data(self) ->
-
-
-
-
-        data_adapter = benchmark_meta.get_data_adapter(config=self.args)
-        data_dict = data_adapter.load(
-            dataset_name_or_path=benchmark_meta.dataset_id, subset_list=benchmark_meta.subset_list)
-        prompts = data_adapter.gen_prompts(data_dict)
-        return prompts
+    def get_data(self) -> DatasetDict:
+        dataset_args = {self.name: self.args}
+        benchmark_meta = get_benchmark(self.name, config=TaskConfig(dataset_args=dataset_args))
+        data_dict = benchmark_meta.load_dataset()
+        return data_dict
 
 
 def flatten_weight(collection: 'CollectionSchema', base_weight=1):
@@ -112,8 +111,10 @@ if __name__ == '__main__':
             ]),
            CollectionSchema(
                name='chinese',
-                datasets=[DatasetInfo(name='ceval', weight=1, tags=['zh'], args={'subset_list': ['logic']})]
-
+                datasets=[DatasetInfo(name='ceval', weight=1, tags=['zh'], args={'subset_list': ['logic']})]
+            )
+        ]
+    )
     print(schema)
     print(schema.flatten())
     schema.dump_json('outputs/schema.json')