evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/run.py
CHANGED
@@ -2,19 +2,16 @@
 """
 Run evaluation for LLMs.
 """
-import os
+import os
 from argparse import Namespace
 from datetime import datetime
 from typing import TYPE_CHECKING, List, Optional, Union

 from evalscope.config import TaskConfig, parse_task_config
 from evalscope.constants import DataCollection, EvalBackend
-from evalscope.utils import seed_everything
 from evalscope.utils.io_utils import OutputsStructure
 from evalscope.utils.logger import configure_logging, get_logger
-
-if TYPE_CHECKING:
-    from evalscope.models import LocalModel
+from evalscope.utils.model_utils import seed_everything

 logger = get_logger()

@@ -39,25 +36,40 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
     configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))

     if task_cfg.eval_backend != EvalBackend.NATIVE:
-
+        result = run_non_native_backend(task_cfg, outputs)
     else:
-
+        logger.info('Running with native backend')
+        result = evaluate_model(task_cfg, outputs)
+
+    logger.info(f'Finished evaluation for {task_cfg.model_id} on {task_cfg.datasets}')
+    logger.info(f'Output directory: {outputs.outputs_dir}')
+
+    return result


 def setup_work_directory(task_cfg: TaskConfig, run_time: str):
     """Set the working directory for the task."""
+    # use cache
     if task_cfg.use_cache:
         task_cfg.work_dir = task_cfg.use_cache
         logger.info(f'Set resume from {task_cfg.work_dir}')
     # elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
-
+    else:
+        task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)

     outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)

+    # Unify the output directory structure
     if task_cfg.eval_backend == EvalBackend.OPEN_COMPASS:
         task_cfg.eval_config['time_str'] = run_time
     elif task_cfg.eval_backend == EvalBackend.VLM_EVAL_KIT:
         task_cfg.eval_config['work_dir'] = task_cfg.work_dir
+    elif task_cfg.eval_backend == EvalBackend.RAG_EVAL:
+        from evalscope.backend.rag_eval import Tools
+        if task_cfg.eval_config['tool'].lower() == Tools.MTEB:
+            task_cfg.eval_config['eval']['output_folder'] = task_cfg.work_dir
+        elif task_cfg.eval_config['tool'].lower() == Tools.CLIP_BENCHMARK:
+            task_cfg.eval_config['eval']['output_dir'] = task_cfg.work_dir
     return outputs


@@ -83,69 +95,82 @@ def run_non_native_backend(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
 def get_backend_manager_class(eval_backend: EvalBackend):
     """Get the backend manager class based on the evaluation backend."""
     if eval_backend == EvalBackend.OPEN_COMPASS:
+        logger.info('Using OpenCompassBackendManager')
         from evalscope.backend.opencompass import OpenCompassBackendManager
         return OpenCompassBackendManager
     elif eval_backend == EvalBackend.VLM_EVAL_KIT:
+        logger.info('Using VLMEvalKitBackendManager')
         from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
         return VLMEvalKitBackendManager
     elif eval_backend == EvalBackend.RAG_EVAL:
+        logger.info('Using RAGEvalBackendManager')
         from evalscope.backend.rag_eval import RAGEvalBackendManager
         return RAGEvalBackendManager
     elif eval_backend == EvalBackend.THIRD_PARTY:
         raise NotImplementedError(f'Not implemented for evaluation backend {eval_backend}')


-def evaluate_model(
+def evaluate_model(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
     """Evaluate the model based on the provided task configuration."""
-    from evalscope.
+    from evalscope.api.evaluator import Evaluator
+    from evalscope.api.model import get_model_with_task_config
+    from evalscope.api.registry import get_benchmark
+    from evalscope.evaluator import DefaultEvaluator
+    from evalscope.report import gen_table

     # Initialize evaluator
     eval_results = {}
-
-
-    for
-
+    # Initialize model
+    model = get_model_with_task_config(task_config=task_config)
+    # Initialize evaluators for each dataset
+    evaluators: List[Evaluator] = []
+    for dataset_name in task_config.datasets:
+        # Create evaluator for each dataset
+        benchmark = get_benchmark(dataset_name, task_config)
+        evaluator = DefaultEvaluator(
+            task_config=task_config,
+            model=model,
+            benchmark=benchmark,
+            outputs=outputs,
+        )
         evaluators.append(evaluator)

+        # Update task_config.dataset_args with benchmark metadata, except for DataCollection
+        if dataset_name != DataCollection.NAME:
+            task_config.dataset_args[dataset_name] = benchmark.to_dict()
+
     # dump task_cfg to outputs.configs_dir after creating evaluators
-
-    logger.info(
+    task_config.dump_yaml(outputs.configs_dir)
+    logger.info(task_config)

+    # Run evaluation for each evaluator
     for evaluator in evaluators:
-        res_dict = evaluator.eval(
-        eval_results[
+        res_dict = evaluator.eval()
+        eval_results[evaluator.benchmark.name] = res_dict
+
+    # Make overall report
+    try:
+        report_table: str = gen_table(reports_path_list=[outputs.reports_dir], add_overall_metric=True)
+        logger.info(f'Overall report table: \n{report_table} \n')
+    except Exception:
+        logger.error('Failed to generate report table.')
+    # Clean up
+    if model is not None:
+        import gc
+
+        del model
+        del evaluators
+        gc.collect()
+
+        from evalscope.utils.import_utils import check_import
+        if check_import('torch', raise_warning=False):
+            import torch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()

     return eval_results


-def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure, base_model: 'LocalModel'):
-    """Create an evaluator object for the specified dataset."""
-    from evalscope.benchmarks import Benchmark, BenchmarkMeta
-    from evalscope.evaluator import Evaluator
-    from evalscope.models import initialize_model_adapter
-
-    if dataset_name == DataCollection.NAME:
-        # EvaluatorCollection is a collection of evaluators
-        from evalscope.collections import EvaluatorCollection
-        return EvaluatorCollection(task_cfg, outputs)
-
-    benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
-
-    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
-    model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)
-
-    # update task_cfg.dataset_args
-    task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()
-
-    return Evaluator(
-        dataset_name_or_path=benchmark.dataset_id,
-        data_adapter=data_adapter,
-        model_adapter=model_adapter,
-        outputs=outputs,
-        task_cfg=task_cfg,
-    )
-
-
 def main():
     from evalscope.arguments import parse_args
     args = parse_args()
evalscope/summarizer.py
CHANGED
@@ -7,8 +7,7 @@ from typing import List, Union
 from evalscope.config import TaskConfig, parse_task_config
 from evalscope.constants import EvalBackend
 from evalscope.report import gen_table
-from evalscope.utils import csv_to_list, get_latest_folder_path
-from evalscope.utils.io_utils import OutputsStructure, json_to_dict, yaml_to_dict
+from evalscope.utils.io_utils import OutputsStructure, csv_to_list, get_latest_folder_path, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -30,7 +29,7 @@ class Summarizer:
             with open(report_file, 'r') as f:
                 res_list.append(json.load(f))

-        report_table: str = gen_table([reports_dir])
+        report_table: str = gen_table(reports_path_list=[reports_dir])
         logger.info(f'*** Report table ***\n{report_table}')

         return res_list
@@ -81,7 +80,7 @@ class Summarizer:

             summary_file_path = summary_files[0]
             # Example: [{'dataset': 'gsm8k', 'version': '1d7fe4', 'metric': 'accuracy', 'mode': 'gen', 'qwen-7b-chat': '53.98'} # noqa: E501
-            summary_res: List[dict] = csv_to_list(
+            summary_res: List[dict] = csv_to_list(summary_file_path)
             final_res_list.extend(summary_res)
         elif eval_backend == EvalBackend.VLM_EVAL_KIT:
             eval_config = Summarizer.parse_eval_config(candidate_task)
@@ -105,7 +104,8 @@ class Summarizer:
                     summary_res: dict = csv_to_list(summary_file_path)[0]
                 elif summary_file_path.endswith('json'):
                     summary_res: dict = json_to_dict(summary_file_path)
-
+                base_name = os.path.basename(summary_file_path)
+                file_name = os.path.splitext(base_name)[0]
                 final_res_list.append({file_name: summary_res})

         elif eval_backend == EvalBackend.THIRD_PARTY:
evalscope/third_party/longbench_write/infer.py
CHANGED
@@ -8,7 +8,7 @@ import random
 import torch
 from typing import List

-from evalscope.
+from evalscope.third_party.longbench_write.tools.openai_api import OpenaiApi
 from evalscope.third_party.longbench_write.utils import count_words
 from evalscope.utils import get_logger

@@ -0,0 +1,441 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import plotly.graph_objects as go
|
|
5
|
+
import re
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
from functools import lru_cache
|
|
8
|
+
from modelscope import AutoTokenizer
|
|
9
|
+
from plotly.subplots import make_subplots
|
|
10
|
+
from tqdm.contrib.concurrent import thread_map
|
|
11
|
+
from typing import List
|
|
12
|
+
|
|
13
|
+
from evalscope.third_party.thinkbench.tools.llm import request_url
|
|
14
|
+
from evalscope.third_party.thinkbench.tools.utils import extract_answer
|
|
15
|
+
from evalscope.utils.io_utils import dict_to_json, dump_jsonl_data, json_to_dict, jsonl_to_list
|
|
16
|
+
|
|
17
|
+
cur_path = os.path.dirname(os.path.abspath(__file__))
|
|
18
|
+
|
|
19
|
+
class EvalThink:
|
|
20
|
+
def __init__(self, report_path, tokenizer_path, model_name, dataset_name, subsets, split_strategies='llm', judge_config=None):
|
|
21
|
+
self.report_path = report_path
|
|
22
|
+
self.reformat_template = open(os.path.join(cur_path, 'resources/reformat_template.txt'), 'r').read()
|
|
23
|
+
self.critique_template = open(os.path.join(cur_path, 'resources/critique_template.txt'), 'r').read()
|
|
24
|
+
self.switch_tokens = ['alternatively', 'but wait', 'let me reconsider', 'another way', 'another approach', 'another method', 'another angle']
|
|
25
|
+
self.subset_dict = defaultdict(lambda: defaultdict(list))
|
|
26
|
+
self.think_end_token = '</think>'
|
|
27
|
+
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
|
|
28
|
+
self.model_name = model_name
|
|
29
|
+
self.dataset_name = dataset_name
|
|
30
|
+
self.subsets = subsets
|
|
31
|
+
self.metrics = ['reasoning_tokens', 'first_correct_tokens', 'reflection_tokens','token_efficiency', 'thought_num', 'accuracy']
|
|
32
|
+
self.split_strategies = split_strategies # split by llm, keywords, separator
|
|
33
|
+
self.judge_config = judge_config
|
|
34
|
+
self.model_parse_file_path = os.path.join(self.report_path, 'answer_index.jsonl')
|
|
35
|
+
self.model_parse_dict = self.__init_parse_file()
|
|
36
|
+
|
|
37
|
+
def __init_parse_file(self):
|
|
38
|
+
if not os.path.exists(self.model_parse_file_path):
|
|
39
|
+
return {}
|
|
40
|
+
else:
|
|
41
|
+
list_file = jsonl_to_list(self.model_parse_file_path)
|
|
42
|
+
# convert to dict prompt as key, answer_index as value
|
|
43
|
+
return {item['prompt']: item['answer_index'] for item in list_file}
|
|
44
|
+
|
|
45
|
+
def get_think_part(self, message: dict) -> str:
|
|
46
|
+
if 'reasoning_content' in message and message['reasoning_content']:
|
|
47
|
+
return message['reasoning_content']
|
|
48
|
+
else:
|
|
49
|
+
text = message['content']
|
|
50
|
+
last_think_end = text.rfind(self.think_end_token)
|
|
51
|
+
return text[:last_think_end]
|
|
52
|
+
|
|
53
|
+
@lru_cache(maxsize=None)
|
|
54
|
+
def cal_tokens(self, text: str):
|
|
55
|
+
return len(self.tokenizer.encode(text, add_special_tokens=False))
|
|
56
|
+
|
|
57
|
+
def process_choice(self, choice, problem):
|
|
58
|
+
think_part = self.get_think_part(choice['message'])
|
|
59
|
+
answer = choice['review']['gold']
|
|
60
|
+
tokens = self.cal_tokens(think_part)
|
|
61
|
+
switch_count = sum(think_part.lower().count(token) for token in self.switch_tokens)
|
|
62
|
+
useful_tokens = self.cal_tokens(self.get_first_correct(think_part, problem, answer))
|
|
63
|
+
reflection_tokens = tokens - useful_tokens
|
|
64
|
+
# score = choice['review']['result']
|
|
65
|
+
score = 0 if useful_tokens == 0 else 1
|
|
66
|
+
return tokens, switch_count, useful_tokens, reflection_tokens, score
|
|
67
|
+
|
|
68
|
+
def process_item(self, item):
|
|
69
|
+
problem = item['raw_input'].get('question') or item['raw_input'].get('problem') or ''
|
|
70
|
+
results = []
|
|
71
|
+
for choice in item['choices']:
|
|
72
|
+
results.append(self.process_choice(choice, problem))
|
|
73
|
+
break # only process the first choice
|
|
74
|
+
|
|
75
|
+
total_tokens, switch_counts, useful_tokens, reflection_tokens, scores = zip(*results)
|
|
76
|
+
|
|
77
|
+
avg_tokens = sum(total_tokens) / len(total_tokens)
|
|
78
|
+
avg_thought_num = sum(switch_counts) / len(switch_counts)
|
|
79
|
+
avg_token_efficiency = sum(useful_tokens) / sum(total_tokens)
|
|
80
|
+
avg_accuracy = sum(scores) / len(scores)
|
|
81
|
+
avg_useful_tokens = sum(useful_tokens) / len(useful_tokens)
|
|
82
|
+
avg_reflection_tokens = sum(reflection_tokens) / len(reflection_tokens)
|
|
83
|
+
return avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy, avg_useful_tokens, avg_reflection_tokens
|
|
84
|
+
|
|
85
|
+
def split_by_llm(self, response, problem) -> List[str]:
|
|
86
|
+
response = response.replace('\n', ' ') # remove newline characters
|
|
87
|
+
prompt = self.reformat_template.format(problem=problem, response=response)
|
|
88
|
+
llm_response = request_url(self.judge_config, prompt)
|
|
89
|
+
return llm_response.split('\n\n')
|
|
90
|
+
|
|
91
|
+
def split_by_keywords(self, text) -> List[str]:
|
|
92
|
+
pattern = r'(?=\b(?:{})\b)'.format('|'.join(map(re.escape, self.switch_tokens)))
|
|
93
|
+
segments = re.split(pattern, text)
|
|
94
|
+
# remove empty segments
|
|
95
|
+
segments = [segment.strip() for segment in segments if segment.strip()]
|
|
96
|
+
|
|
97
|
+
return segments if segments else [text]
|
|
98
|
+
|
|
99
|
+
def split_by_separator(self, text) -> List[str]:
|
|
100
|
+
return text.split('\n\n')
|
|
101
|
+
|
|
102
|
+
def get_answer_index(self, response: List[str], problem: str, answer: str) -> int:
|
|
103
|
+
tagged_response = ''
|
|
104
|
+
for sdx, step in enumerate(response):
|
|
105
|
+
tagged_response += f'<paragraph_{sdx}>\n{step}\n</paragraph_{sdx}>\n\n'
|
|
106
|
+
tagged_response = tagged_response.strip()
|
|
107
|
+
|
|
108
|
+
prompt = self.critique_template.format(problem=problem, answer=answer, tagged_response=tagged_response)
|
|
109
|
+
if prompt in self.model_parse_dict:
|
|
110
|
+
answer_index = self.model_parse_dict[prompt]
|
|
111
|
+
else:
|
|
112
|
+
llm_response = request_url(self.judge_config, prompt)
|
|
113
|
+
if not llm_response:
|
|
114
|
+
answer_index = -1
|
|
115
|
+
else:
|
|
116
|
+
answer_index = extract_answer(llm_response)
|
|
117
|
+
|
|
118
|
+
dump_jsonl_data({'prompt': prompt, 'response': llm_response, 'answer_index': answer_index},
|
|
119
|
+
self.model_parse_file_path, dump_mode='append')
|
|
120
|
+
try:
|
|
121
|
+
answer_index = int(answer_index)
|
|
122
|
+
except Exception:
|
|
123
|
+
answer_index = -1
|
|
124
|
+
return answer_index
|
|
125
|
+
|
|
126
|
+
def get_first_correct(self, response: str, problem: str, answer: str) -> str:
|
|
127
|
+
if self.split_strategies == 'llm':
|
|
128
|
+
text_list = self.split_by_llm(response, problem)
|
|
129
|
+
elif self.split_strategies == 'keywords':
|
|
130
|
+
text_list = self.split_by_keywords(response)
|
|
131
|
+
else:
|
|
132
|
+
text_list = self.split_by_separator(response)
|
|
133
|
+
|
|
134
|
+
answer_index = self.get_answer_index(text_list, problem, answer)
|
|
135
|
+
|
|
136
|
+
if answer_index == -1: # no correct answer found
|
|
137
|
+
first_correct = ''
|
|
138
|
+
else:
|
|
139
|
+
first_correct = '\n\n'.join(text_list[: answer_index])
|
|
140
|
+
return first_correct
|
|
141
|
+
|
|
142
|
+
    def plot_metrics(self, results, output_dir):
        # Change layout to 2x3
        fig = make_subplots(rows=2, cols=3,
                            subplot_titles=('Reasoning Tokens', 'First Correct Tokens', 'Reflection Tokens',
                                            'Token Efficiency', 'Thought Num', 'Accuracy'),
                            shared_xaxes=True, x_title='Subsets',
                            vertical_spacing=0.1,  # Decrease vertical spacing between subplots
                            horizontal_spacing=0.1)  # Decrease horizontal spacing between subplots

        metrics_order = ['reasoning_tokens', 'first_correct_tokens', 'reflection_tokens',
                         'token_efficiency', 'thought_num', 'accuracy']

        for i, metric in enumerate(metrics_order, start=1):
            y_values = [results[metric][subset] for subset in self.subsets]
            # Determine row and column for 2x3 layout
            row = (i - 1) // 3 + 1
            col = (i - 1) % 3 + 1
            fig.add_trace(
                go.Scatter(x=list(range(len(self.subsets))), y=y_values,
                           mode='lines+markers',
                           name=metric.replace('_', ' ').title()),
                row=row, col=col
            )
            # Add annotations for each data point
            for j, y in enumerate(y_values):
                fig.add_annotation(
                    x=j,
                    y=y,
                    text=f'{y:.2f}',
                    showarrow=False,
                    yshift=10,
                    row=row,
                    col=col
                )

        fig.update_layout(
            height=800,  # Adjust height for 2x3 layout
            width=1200,  # Adjust width for 2x3 layout
            title_text=f'Evaluation Metrics for {self.model_name} on {self.dataset_name}',
            legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1)
        )

        for i in range(1, len(metrics_order) + 1):
            row = (i - 1) // 3 + 1
            col = (i - 1) % 3 + 1
            fig.update_xaxes(
                ticktext=self.subsets,
                tickvals=list(range(len(self.subsets))),
                row=row, col=col
            )
            fig.update_yaxes(title_text=metrics_order[i - 1].replace('_', ' ').title(), row=row, col=col)

        # Update y-axis ranges
        fig.update_yaxes(range=[500, 5000], row=1, col=1)  # Reasoning Tokens
        fig.update_yaxes(range=[0, 3000], row=1, col=2)  # First Correct Tokens
        fig.update_yaxes(range=[0, 3000], row=1, col=3)  # Reflection Tokens
        fig.update_yaxes(range=[0, 1], row=2, col=1)  # Token Efficiency
        fig.update_yaxes(range=[0, 13], row=2, col=2)  # Thought Num
        fig.update_yaxes(range=[0, 1], row=2, col=3)  # Accuracy

        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f'{self.model_name}_{self.dataset_name}_metrics.png')
        fig.write_image(output_path)
        print(f'save figure to: {output_path}')

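    # For reference, plot_metrics expects `results` keyed first by metric and then by
    # subset, e.g. (shape only; the numbers here are invented):
    #
    #     results = {
    #         'reasoning_tokens': {'Level 1': 812.0, 'Level 2': 1304.5},
    #         'accuracy': {'Level 1': 0.98, 'Level 2': 0.95},
    #         ...
    #     }
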
    def filter_df(self, df, response_len: int = 8000, count: int = 10):
        def is_valid_row(row):
            return all(self.cal_tokens(choice['message']['content']) <= response_len for choice in row['choices'])

        bools = df.apply(is_valid_row, axis=1)

        return df[bools].head(count)

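    # Toy illustration of the filter above (whitespace word count stands in for
    # cal_tokens; the data is made up): only rows whose every choice fits the budget survive.
    #
    #     df = pd.DataFrame({'choices': [
    #         [{'message': {'content': 'short answer'}}],
    #         [{'message': {'content': 'word ' * 20000}}],   # over an 8000-token budget
    #     ]})
    #     keep = df.apply(lambda r: all(len(c['message']['content'].split()) <= 8000
    #                                   for c in r['choices']), axis=1)
    #     df[keep].head(10)   # -> only the first row
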
    def evaluate(self, output_dir, max_tokens=8000, count=50, workers=128):
        for subset in self.subsets:
            review_path = os.path.join(self.report_path, 'reviews', self.model_name, f'{self.dataset_name}_{subset}.jsonl')
            review_df = pd.read_json(review_path, lines=True)

            review_df = self.filter_df(review_df, response_len=max_tokens, count=count)

            results = thread_map(
                self.process_item,
                (item for _, item in review_df.iterrows()),
                desc=f'Evaluating {subset}',
                total=len(review_df),
                max_workers=workers
            )

            avg_tokens, avg_thought_num, avg_token_efficiency, avg_accuracy, avg_useful_tokens, avg_reflection_tokens = zip(*results)

            self.subset_dict[subset]['reasoning_tokens'] = sum(avg_tokens) / len(avg_tokens)
            self.subset_dict[subset]['thought_num'] = sum(avg_thought_num) / len(avg_thought_num)
            self.subset_dict[subset]['token_efficiency'] = sum(avg_token_efficiency) / len(avg_token_efficiency)
            self.subset_dict[subset]['accuracy'] = sum(avg_accuracy) / len(avg_accuracy)
            self.subset_dict[subset]['first_correct_tokens'] = sum(avg_useful_tokens) / len(avg_useful_tokens)
            self.subset_dict[subset]['reflection_tokens'] = sum(avg_reflection_tokens) / len(avg_reflection_tokens)

        results = {metric: {subset: self.subset_dict[subset][metric] for subset in self.subsets}
                   for metric in self.metrics}

        self.plot_metrics(results, output_dir)

        # save results to json
        dict_to_json(results, os.path.join(self.report_path, 'think_eval_results.json'))
        return results

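    # The zip(*results) above assumes process_item yields 6-tuples in this order
    # (inferred from the assignments that follow):
    #
    #     (reasoning_tokens, thought_num, token_efficiency,
    #      accuracy, first_correct_tokens, reflection_tokens)
    #
    # e.g. two toy items (1200, 4, 0.55, 1.0, 650, 550) and (900, 3, 0.60, 0.0, 540, 360)
    # average to reasoning_tokens 1050 and accuracy 0.5 for that subset.
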
def run_task(config, output_dir='outputs', max_tokens=8000, count=50, workers=128):
    evaluator = EvalThink(**config)
    results = evaluator.evaluate(output_dir, max_tokens, count, workers)
    print(results)

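# Usage sketch (hypothetical values; the real configs are defined below): run_task
# builds EvalThink from a plain dict and runs the evaluation end to end.
#
#     example_config = dict(
#         report_path='outputs/<run_timestamp>',     # evalscope output directory
#         model_name='my-model',
#         tokenizer_path='org/my-model',
#         dataset_name='math_500',
#         subsets=['Level 1', 'Level 2'],
#         split_strategies='separator',              # or 'llm' / 'keywords'
#         judge_config=judge_config,
#     )
#     run_task(example_config, output_dir='outputs', max_tokens=8000, count=50, workers=16)
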
def combine_results(configs: List[dict], output_path: str):
    """
    Combine evaluation results from multiple model configs into one plot.
    All models' results for the same metric are shown in the same subplot for easy comparison.

    Args:
        configs: List of model config dicts containing model_name and report_path
        output_path: Path of the comparison figure to write
    """
    # Combine results from different runs
    combined_results = defaultdict(lambda: defaultdict(dict))
    for config in configs:
        model_name = config['model_name']
        report_path = config['report_path']
        # Results is a dict keyed by metric, then by subset
        results = json_to_dict(os.path.join(report_path, 'think_eval_results.json'))
        combined_results[model_name] = results

    # Create a 2x3 subplot layout, one subplot per metric
    fig = make_subplots(rows=2, cols=3,
                        subplot_titles=('Reasoning Tokens', 'First Correct Tokens', 'Reflection Tokens',
                                        'Token Efficiency', 'Thought Num', 'Accuracy'),
                        shared_xaxes=True, x_title='Subsets',
                        vertical_spacing=0.08,  # Reduce vertical spacing between subplots
                        horizontal_spacing=0.05)  # Reduce horizontal spacing between subplots

    metrics_order = ['reasoning_tokens', 'first_correct_tokens', 'reflection_tokens',
                     'token_efficiency', 'thought_num', 'accuracy']

    # Assign a distinct color to each model
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']

    # Plot each metric in a separate subplot
    for i, metric in enumerate(metrics_order, start=1):
        row = (i - 1) // 3 + 1
        col = (i - 1) % 3 + 1

        # Get subsets from the first model (assuming all models share the same subsets)
        subsets = list(next(iter(combined_results.values()))[metric].keys())

        # Add every model's data for this metric to the same subplot
        for j, (model_name, results) in enumerate(combined_results.items()):
            y_values = [results[metric][subset] for subset in subsets]

            fig.add_trace(
                go.Scatter(x=subsets, y=y_values,
                           mode='lines+markers',
                           name=model_name,  # Just the model name; metrics are shown in subplot titles
                           line=dict(color=colors[j % len(colors)]),
                           showlegend=(i == 1)),  # Only show the legend for the first metric
                row=row, col=col
            )

            # Add value annotations
            for k, y in enumerate(y_values):
                fig.add_annotation(
                    x=subsets[k],
                    y=y,
                    text=f'{y:.2f}',
                    showarrow=False,
                    yshift=10,
                    font=dict(size=12, color=colors[j % len(colors)]),
                    row=row, col=col
                )

        # Per-metric y-axis ranges could be pinned here if needed, e.g.:
        # if metric == 'token_efficiency':
        #     fig.update_yaxes(range=[0.2, 0.7], row=row, col=col)
        # elif metric == 'accuracy':
        #     fig.update_yaxes(range=[0.8, 1], row=row, col=col)

        fig.update_yaxes(title_text=metric.replace('_', ' ').title(), row=row, col=col)

    # Update layout
    fig.update_layout(
        height=1000,  # Increase height
        width=1500,  # Increase width
        title_text='Model Comparison Across Evaluation Metrics on MATH-500',
        title=dict(font=dict(size=22)),  # Larger title font
        font=dict(size=14),  # Larger overall font
        legend=dict(
            orientation='h',
            yanchor='bottom',
            y=1.02,
            xanchor='right',
            x=1,
            font=dict(size=14)  # Larger legend font
        )
    )

    # Save plot (ensure the target directory exists)
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    fig.write_image(output_path)
    print(f'Model comparison plot saved to {output_path}')

    return combined_results

judge_config = dict(
    api_key='EMPTY',
    base_url='http://0.0.0.0:8801/v1',
    model_name='Qwen2.5-72B-Instruct',
)

distill_qwen_config = dict(
    report_path='../eval-scope/outputs/20250218_180219',
    model_name='DeepSeek-R1-Distill-Qwen-7B',
    tokenizer_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
    dataset_name='math_500',
    subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator',
    judge_config=judge_config
)

math_qwen_config = dict(
    report_path='../eval-scope/outputs/20250219_202358',
    model_name='Qwen2.5-Math-7B-Instruct',
    tokenizer_path='Qwen/Qwen2.5-Math-7B-Instruct',
    dataset_name='math_500',
    subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator',
    judge_config=judge_config
)

r1_config = dict(
    report_path='../eval-scope/outputs/20250307_000404',
    model_name='deepseek-r1',
    tokenizer_path='deepseek-ai/DeepSeek-R1',
    dataset_name='math_500',
    subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator',
    judge_config=judge_config
)

qwq_preview_config = dict(
    report_path='../eval-scope/outputs/20250221_105911',
    model_name='qwq-32b-preview',
    tokenizer_path='Qwen/QwQ-32B-Preview',
    dataset_name='math_500',
    subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator',
    judge_config=judge_config
)

qwq_config = dict(
    report_path='../eval-scope/outputs/20250306_181550',
    model_name='QwQ-32B',
    tokenizer_path='Qwen/QwQ-32B',
    dataset_name='math_500',
    subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator',
    judge_config=judge_config
)

distill_qwen_32b = dict(
    report_path='../eval-scope/outputs/20250306_235951',
    model_name='deepseek-r1-distill-qwen-32b',
    tokenizer_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
    dataset_name='math_500',
    subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator',
    judge_config=judge_config
)

qwen3_32b_think = dict(
    report_path='../eval-scope/outputs/20250428_151817',
    model_name='Qwen3-32B',
    tokenizer_path='Qwen/Qwen3-32B',
    dataset_name='math_500',
    subsets=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
    split_strategies='separator',
    judge_config=judge_config
)

if __name__ == '__main__':
    # run_task(distill_qwen_config, count=80)
    # run_task(math_qwen_config)
    # run_task(qwq_preview_config, max_tokens=20000, count=200, workers=128)
    # run_task(r1_config, max_tokens=20000, count=200, workers=128)
    # run_task(qwq_config, max_tokens=20000, count=200, workers=128)
    run_task(qwen3_32b_think, max_tokens=20000, count=200, workers=128)
    # run_task(distill_qwen_32b, max_tokens=20000, count=200, workers=128)

    # combine_results([qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics.png')
    # combine_results([qwq_config, r1_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_3models.png')
    # combine_results([distill_qwen_config, math_qwen_config, qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_6models.png')
    combine_results([qwq_config, r1_config, distill_qwen_32b, qwen3_32b_think], output_path='outputs/model_comparison_metrics_4models.png')