evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/third_party/thinkbench/infer.py
@@ -0,0 +1,130 @@
+import os
+
+from evalscope import TaskConfig, run_task
+
+DASHSCOPE_API_KEY = 'sk-723135c241x'
+
+def eval_distill_qwen():
+    model_name = 'DeepSeek-R1-Distill-Qwen-7B'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='http://0.0.0.0:8801/v1/chat/completions',
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=32,
+        generation_config={
+            'max_tokens': 20000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 1,
+        },
+    )
+    run_task(task_config)
+
+
+def eval_math_qwen():
+    model_name = 'Qwen2.5-Math-7B-Instruct'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='http://0.0.0.0:8801/v1/chat/completions',
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=32,
+        generation_config={
+            'max_tokens': 3000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 3,
+        },
+    )
+    run_task(task_config)
+
+def eval_r1():
+    model_name = 'deepseek-r1'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        api_key=DASHSCOPE_API_KEY,
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=8,
+        generation_config={
+            'max_tokens': 20000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 1,
+        },
+        use_cache='./outputs/20250307_000404',
+        timeout=36000,
+        stream=True
+    )
+    run_task(task_config)
+
+
+def eval_distill_32b():
+    model_name = 'deepseek-r1-distill-qwen-32b'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        api_key=DASHSCOPE_API_KEY,
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=5,
+        generation_config={
+            'max_tokens': 12000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 1,
+        },
+        use_cache='./outputs/20250306_235951',
+        timeout=32000,
+        stream=True
+
+    )
+    run_task(task_config)
+
+def eval_qwq():
+    model_name = 'qwq-32b-preview'
+    dataset_name = 'math_500'
+    subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5']
+
+    task_config = TaskConfig(
+        api_url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        api_key=os.environ['DASHSCOPE_API_KEY'],
+        model=model_name,
+        eval_type='service',
+        datasets=[dataset_name],
+        dataset_args={dataset_name: {'few_shot_num': 0, 'subset_list': subsets}},
+        eval_batch_size=32,
+        generation_config={
+            'max_tokens': 8000,  # avoid exceed max length
+            'temperature': 0.6,
+            'top_p': 0.95,
+            'n': 1,
+        },
+        use_cache='./outputs/20250221_105911'
+    )
+    run_task(task_config)
+
+if __name__ == '__main__':
+    # eval_distill_qwen()
+    # eval_math_qwen()
+    eval_r1()
+    # eval_qwq()
+    # eval_distill_32b()
evalscope/third_party/thinkbench/resources/critique_template.txt
@@ -0,0 +1,17 @@
+The following is a math problem and a solution (split into paragraphs, enclosed with tags and indexed from 0):
+
+[Math Problem]
+
+{problem}
+
+[Correct Answer]
+
+{answer}
+
+[Solution]
+
+{tagged_response}
+
+Your task is to review and critique the solution paragraph by paragraph. Once you identify a correct answer in a paragraph, return the index of the paragraph where the earliest correct answer occurs. Otherwise, return the index of -1 (which typically denotes "not found").
+
+Please put your final answer (i.e., the index) in \boxed{{}}.
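Since this template is rendered with Python's str.format, the {problem}, {answer}, and {tagged_response} placeholders are substituted directly, while the doubled braces in \boxed{{}} escape to a literal \boxed{}. A minimal rendering sketch (the loading code below is illustrative, not evalscope's own):

```python
# Illustrative rendering of the critique template above; the path is assumed.
with open('evalscope/third_party/thinkbench/resources/critique_template.txt', encoding='utf-8') as f:
    template = f.read()

prompt = template.format(
    problem='Compute 1 + 1.',
    answer='2',
    tagged_response='<paragraph_0>1 + 1 = 2, so the answer is \\boxed{2}.</paragraph_0>',
)
print(prompt)  # '{{}}' in the template survives formatting as a literal '{}'
```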
evalscope/third_party/thinkbench/resources/reformat_template.txt
@@ -0,0 +1,31 @@
+I will present you with a solution to a math problem. Unfortunately, the solution lacks proper paragraphing, making it hard to read. Your task is to improve readability by reformatting the solution into well-structured paragraphs. Follow these specific guidelines:
+
+* Insert \n\n for paragraph breaks within the original solution. Do **NOT** alter any content of the original solution (the only exception is for itemized lists; see below).
+
+- Each paragraph should represent a distinct, concise reasoning step that logically advances the solution.
+
+- Reasoning steps can include case discussions, formula simplifications, or formula derivations. Each of these should be treated as an individual reasoning step and paragraphed accordingly.
+
+- If an introductory analysis exists in the original solution, treat it as an initial reasoning step and place it as the first paragraph.
+
+- Do **NOT** place any mathematical formulas in their own separate paragraphs; instead, include them within the same paragraph as the preceding text to form a cohesive reasoning step.
+
+* For any itemized lists (ordered or unordered), convert them into a written format, such as "First/Second/Third." This is the **ONLY** content modification allowed.
+
+* Avoid making paragraphs too lengthy, as long paragraphs might contain multiple reasoning steps that should be paragraphed separately.
+
+* Disregard the accuracy of the solution content. Do **NOT** alter any of the original solution's content; focus solely on structuring it into logical, readable paragraphs.
+
+* Reply with the reformatted solution directly.
+
+--------------------------------------------------
+
+Here is the math problem, and the solution that needs to be reformatted:
+
+[Math Problem]
+
+{problem}
+
+[Solution]
+
+{response}
evalscope/third_party/thinkbench/tools/__init__.py
File without changes
evalscope/third_party/thinkbench/tools/llm.py
@@ -0,0 +1,48 @@
+import os
+from openai import OpenAI
+
+
+def request_url(llm_config, content):
+    try:
+        client = OpenAI(
+            api_key=llm_config['api_key'],
+            base_url=llm_config['base_url'],
+        )
+        completion = client.chat.completions.create(
+            model=llm_config['model_name'],
+            messages=[{'role': 'user', 'content': content}]
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        print(e)
+        return None
+
+def request_qwen(content):
+    try:
+        client = OpenAI(
+            api_key=os.getenv('DASHSCOPE_API_KEY'),
+            base_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+        )
+
+        completion = client.chat.completions.create(
+            model='qwen-max',
+            messages=[{'role': 'user', 'content': content}]
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        print(e)
+
+
+def request_local(content):
+    try:
+        client = OpenAI(
+            api_key='EMPTY',
+            base_url='http://0.0.0.0:8801/v1',
+        )
+        completion = client.chat.completions.create(
+            model='Qwen2.5-72B-Instruct',
+            messages=[{'role': 'user', 'content': content}]
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        print(e)
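All three helpers share the same OpenAI-compatible call shape and differ only in credentials, base URL, and model name; errors are printed and swallowed rather than raised. A hypothetical request_url call (endpoint, key, and model are placeholders):

```python
# Hypothetical usage of request_url; endpoint, key, and model are placeholders.
llm_config = {
    'api_key': 'EMPTY',
    'base_url': 'http://0.0.0.0:8801/v1',
    'model_name': 'Qwen2.5-72B-Instruct',
}
reply = request_url(llm_config, 'Say hello in one word.')
print(reply)  # None if the request failed, since errors are only printed
```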
evalscope/third_party/thinkbench/tools/utils.py
@@ -0,0 +1,13 @@
+import re
+
+
+def extract_answer(solution_text: str):
+    boxed_pattern = r'\\boxed\{([^}]*)\}'
+    matches = re.findall(boxed_pattern, solution_text)
+    if matches:
+        last_boxed_content = matches[-1]
+        number_pattern = r'-?\d+'
+        number_matches = re.findall(number_pattern, last_boxed_content)
+        if number_matches:
+            return number_matches[-1].strip()
+    return None
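extract_answer takes the last \boxed{...} group in the text and then the last integer inside it, so surrounding prose or punctuation inside the box is tolerated; non-integer or missing boxes yield None:

```python
# Behavior sketch for extract_answer as defined above.
print(extract_answer(r'First \boxed{3}; revised: the index is \boxed{12.}'))  # '12'
print(extract_answer(r'\boxed{-1} means not found'))                          # '-1'
print(extract_answer('no boxed content here'))                                # None
```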
evalscope/third_party/toolbench_static/llm/swift_infer.py
@@ -1,37 +1,63 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
 from dataclasses import dataclass
-from swift.llm import get_default_template_type, get_model_tokenizer, get_template, inference
-from swift.utils import seed_everything
-
-# TODO: Support custom model for swift infer
 
 
 @dataclass
 class SwiftInferArgs:
     model_id_or_path: str
     model_type: str
+    infer_backend: str = 'vllm'  # one of 'pt', 'vllm', 'lmdeploy'
     max_new_tokens: int = 2048
-
+    temperature: float = 0.1
+    max_batch_size: int = 16
 
 class SwiftInfer:
 
     def __init__(self, args: SwiftInferArgs):
-        model_type = args.model_type
-        template_type = get_default_template_type(model_type)
-        model, tokenizer = get_model_tokenizer(
-            model_type, model_id_or_path=args.model_id_or_path, model_kwargs={'device_map': 'auto'})
-        model.generation_config.max_new_tokens = args.max_new_tokens
-        print(f'** Generation config: {model.generation_config}')
+        # Initialize the model for the selected inference backend
+        if args.infer_backend == 'pt':
+            self.engine: InferEngine = PtEngine(args.model_id_or_path, max_batch_size=args.max_batch_size)
+        elif args.infer_backend == 'vllm':
+            from swift.llm import VllmEngine
+            self.engine: InferEngine = VllmEngine(args.model_id_or_path, max_model_len=8192)
+        elif args.infer_backend == 'lmdeploy':
+            from swift.llm import LmdeployEngine
+            self.engine: InferEngine = LmdeployEngine(args.model_id_or_path)
+        else:
+            raise ValueError(f'Unsupported infer_backend: {args.infer_backend}')
+
+        # Basic request configuration (optional)
+        self.request_config = RequestConfig(
+            max_tokens=args.max_new_tokens,
+            temperature=args.temperature,
+            stream=False  # can be switched to True for streaming inference
+        )
 
-        template = get_template(template_type, tokenizer)
-        seed_everything(42)
+    def predict(self, system: str, query: str, history: list):
+        # In the Swift 3.0 standard interface, messages are passed in the format:
+        # messages: [{"role": "system", "content": "<SYSTEM_PROMPT>"},
+        #            {"role": "user", "content": "user question"},
+        #            {"role": "assistant", "content": "assistant answer"}, ...]
 
-        self.tokenizer = tokenizer
-        self.model = model
-        self.template = template
+        messages = []
+        if system.strip():
+            messages.append({'role': 'system', 'content': system})
 
-    def predict(self, system: str, query: str, history: list):
+        # Splice the conversation history into the messages
+        for qa_pair in history:
+            # Assumes each element of history looks like ("user input", "model response"); adjust to your data format.
+            user_answer, model_response = qa_pair
+            messages.append({'role': 'user', 'content': user_answer})
+            messages.append({'role': 'assistant', 'content': model_response})
+
+        # Add the current user question
+        messages.append({'role': 'user', 'content': query})
+
+        infer_request = InferRequest(messages=messages)
+
+        # Run inference
+        response = self.engine.infer([infer_request], self.request_config)
 
-        response, history = inference(self.model, self.template, query=query, system=system, history=history)
+        # Extract the text result returned by the model (assumes non-stream mode)
+        result_text = response[0].choices[0].message.content.strip()
 
-        return response
+        return result_text
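A hedged usage sketch of the rewritten class, assuming ms-swift 3.x is installed and that InferEngine, InferRequest, PtEngine, and RequestConfig are imported from swift.llm at the top of the full file (the hunk above does not show those import lines); the model path and type are placeholders:

```python
# Hypothetical usage; model path/type are placeholders, not evalscope defaults.
args = SwiftInferArgs(
    model_id_or_path='Qwen/Qwen2.5-7B-Instruct',
    model_type='qwen2_5',
    infer_backend='pt',  # sidesteps the vllm/lmdeploy extras
)
engine = SwiftInfer(args)
answer = engine.predict(
    system='You are a helpful assistant.',
    query='What is 2 + 2?',
    history=[('Hi', 'Hello! How can I help?')],
)
print(answer)
```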
evalscope/third_party/toolbench_static/toolbench_static.py
@@ -6,11 +6,12 @@ from typing import Union
 from evalscope.third_party.toolbench_static.eval import EvalArgs, run_eval
 from evalscope.third_party.toolbench_static.infer import InferArgs, run_infer
 from evalscope.utils import get_logger
+from evalscope.utils.deprecation_utils import deprecated
 from evalscope.utils.io_utils import json_to_dict, yaml_to_dict
 
 logger = get_logger()
 
-
+@deprecated(since='0.15.1', remove_in='0.18.0', alternative='Native implementation of ToolBench')
 def run_task(task_cfg: Union[str, dict]):
 
     if isinstance(task_cfg, str):
evalscope/utils/__init__.py
@@ -1,4 +1,84 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from evalscope.utils.model_utils import EvalBackend
-from evalscope.utils.utils import *
+from typing import TYPE_CHECKING
+
+from .import_utils import _LazyModule
+
+if TYPE_CHECKING:
+    from .argument_utils import BaseArgument, get_supported_params, parse_int_or_float
+    from .deprecation_utils import deprecated
+    from .function_utils import run_once, thread_safe
+    from .import_utils import get_module_path, is_module_installed
+    from .io_utils import (
+        OutputsStructure,
+        csv_to_jsonl,
+        csv_to_list,
+        dict_to_yaml,
+        gen_hash,
+        get_latest_folder_path,
+        get_valid_list,
+        json_to_dict,
+        jsonl_to_csv,
+        jsonl_to_list,
+        safe_filename,
+        yaml_to_dict,
+    )
+    from .logger import configure_logging, get_logger
+    from .model_utils import EvalBackend, dict_torch_dtype_to_str, fix_do_sample_warning, get_device, seed_everything
+
+else:
+    _import_structure = {
+        'argument_utils': [
+            'BaseArgument',
+            'parse_int_or_float',
+            'get_supported_params',
+        ],
+        'model_utils': [
+            'EvalBackend',
+            'get_device',
+            'seed_everything',
+            'dict_torch_dtype_to_str',
+            'fix_do_sample_warning',
+        ],
+        'import_utils': [
+            'is_module_installed',
+            'get_module_path',
+        ],
+        'function_utils': [
+            'thread_safe',
+            'run_once',
+        ],
+        'io_utils': [
+            'OutputsStructure',
+            'csv_to_list',
+            'json_to_dict',
+            'yaml_to_dict',
+            'get_latest_folder_path',
+            'gen_hash',
+            'dict_to_yaml',
+            'csv_to_jsonl',
+            'jsonl_to_csv',
+            'jsonl_to_list',
+            'gen_hash',
+            'get_valid_list',
+            'safe_filename',
+            'thread_safe',
+        ],
+        'deprecation_utils': [
+            'deprecated',
+        ],
+        'logger': [
+            'get_logger',
+            'configure_logging',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
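The _import_structure map drives a lazy loader: at import time evalscope.utils replaces itself in sys.modules with a _LazyModule, which resolves each exported name to its defining submodule only on first attribute access, while the TYPE_CHECKING branch keeps static analyzers and IDEs accurate. A minimal sketch of the same pattern (not evalscope's _LazyModule):

```python
# Minimal lazy-module sketch; illustrates the pattern, not evalscope's class.
import importlib
import types


class LazyModule(types.ModuleType):

    def __init__(self, name: str, import_structure: dict):
        super().__init__(name)
        # Map each exported name to the submodule that defines it.
        self._name_to_module = {
            attr: sub for sub, attrs in import_structure.items() for attr in attrs
        }

    def __getattr__(self, attr: str):
        # Called only when normal lookup fails, i.e. on first access.
        sub = self._name_to_module.get(attr)
        if sub is None:
            raise AttributeError(f'module {self.__name__!r} has no attribute {attr!r}')
        value = getattr(importlib.import_module(f'{self.__name__}.{sub}'), attr)
        setattr(self, attr, value)  # cache so later accesses skip __getattr__
        return value
```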
evalscope/utils/argument_utils.py
@@ -0,0 +1,64 @@
+import json
+from argparse import Namespace
+from inspect import signature
+
+from evalscope.utils.io_utils import json_to_dict, yaml_to_dict
+
+
+class BaseArgument:
+    """
+    BaseArgument is a base class designed to facilitate the creation and manipulation
+    of argument classes in the evalscope framework. It provides utility methods for
+    instantiating objects from various data formats and converting objects back into
+    dictionary representations.
+    """
+
+    @classmethod
+    def from_dict(cls, d: dict):
+        """Instantiate the class from a dictionary."""
+        return cls(**d)
+
+    @classmethod
+    def from_json(cls, json_file: str):
+        """Instantiate the class from a JSON file."""
+        return cls.from_dict(json_to_dict(json_file))
+
+    @classmethod
+    def from_yaml(cls, yaml_file: str):
+        """Instantiate the class from a YAML file."""
+        return cls.from_dict(yaml_to_dict(yaml_file))
+
+    @classmethod
+    def from_args(cls, args: Namespace):
+        """
+        Instantiate the class from an argparse.Namespace object.
+        Filters out None values and removes 'func' if present.
+        """
+        args_dict = {k: v for k, v in vars(args).items() if v is not None}
+
+        if 'func' in args_dict:
+            del args_dict['func']  # Note: compat CLI arguments
+
+        return cls.from_dict(args_dict)
+
+    def to_dict(self):
+        """Convert the instance to a dictionary."""
+        result = self.__dict__.copy()
+        return result
+
+    def __str__(self):
+        """Return a JSON-formatted string representation of the instance."""
+        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
+
+
+def parse_int_or_float(num):
+    number = float(num)
+    if number.is_integer():
+        return int(number)
+    return number
+
+
+def get_supported_params(func):
+    """Get the supported parameters of a function."""
+    sig = signature(func)
+    return set(sig.parameters.keys())
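BaseArgument is intended to be mixed into dataclasses, so cls(**d) in from_dict maps dictionary keys straight onto fields. A hedged sketch (the DemoArgs class is hypothetical, not an evalscope type):

```python
# Hypothetical dataclass built on BaseArgument as defined above.
from dataclasses import dataclass


@dataclass
class DemoArgs(BaseArgument):
    model: str
    parallel: int = 1


args = DemoArgs.from_dict({'model': 'qwen-max', 'parallel': 4})
print(args.to_dict())             # {'model': 'qwen-max', 'parallel': 4}
print(parse_int_or_float('4'))    # 4 (an int, since 4.0.is_integer())
print(parse_int_or_float('0.5'))  # 0.5
```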
evalscope/utils/chat_service.py
@@ -1,6 +1,5 @@
 import os
 import time
-import torch
 from contextlib import contextmanager
 from functools import partial
 from pydantic import BaseModel, Field
@@ -32,6 +31,7 @@ class ModelList(BaseModel):
 class ChatMessage(BaseModel):
     role: Literal['user', 'assistant', 'system']
     content: str
+    reasoning_content: Optional[str] = None
 
 
 class DeltaMessage(BaseModel):
@@ -63,10 +63,10 @@ class ChatCompletionResponseStreamChoice(BaseModel):
 
 class ChatCompletionResponse(BaseModel):
     model: str
-    object: Literal['chat.completion', 'chat.completion.chunk']
+    object: Literal['chat.completion', 'chat.completion.chunk', 'images.generations']
     choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, Any]]
     created: Optional[int] = Field(default_factory=lambda: int(time.time()))
-    usage: Optional[Usage]
+    usage: Optional[Usage] = None
 
 
 class TextCompletionRequest(BaseModel):
@@ -94,6 +94,7 @@ class TextCompletionResponse(BaseModel):
 class ChatService:
 
     def __init__(self, model_path, attn_implementation):
+        import torch
         from modelscope import AutoModelForCausalLM, AutoTokenizer
         from transformers import TextIteratorStreamer
 
@@ -174,7 +175,7 @@ class ChatService:
        )
 
     def _prepare_text_inputs(self, request: TextCompletionRequest):
-        inputs = self.tokenizer(request.prompt, return_tensors='pt', padding=True).to(self.device)
+        inputs = self.tokenizer(request.prompt, return_tensors='pt', padding=False).to(self.device)
         prompt_tokens = len(inputs['input_ids'][0])
         return inputs, prompt_tokens
 
@@ -203,8 +204,9 @@ class ChatService:
 
     def _prepare_chat_inputs(self, request: ChatCompletionRequest):
         formatted_prompt = self.tokenizer.apply_chat_template(
-            request.messages, tokenize=False, add_generation_prompt=True)
-        inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=True).to(self.device)
+            request.messages, tokenize=False, add_generation_prompt=True
+        )
+        inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=False).to(self.device)
         prompt_tokens = len(inputs['input_ids'][0])
         return formatted_prompt, inputs, prompt_tokens
 
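The usage: Optional[Usage] = None change is significant under pydantic v2, where Optional[...] no longer implies a default: without = None the field is still required (it may only be set explicitly to None). A standalone illustration of that rule, not evalscope code:

```python
# Pydantic v2 behavior: Optional[...] without a default is still required.
from typing import Optional

from pydantic import BaseModel, ValidationError


class WithoutDefault(BaseModel):
    usage: Optional[int]  # required; may be None, but must be provided


class WithDefault(BaseModel):
    usage: Optional[int] = None  # genuinely omittable


print(WithDefault())  # usage=None
try:
    WithoutDefault()
except ValidationError as e:
    print('missing field:', e.errors()[0]['type'])  # 'missing'
```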
evalscope/utils/deprecation_utils.py
@@ -0,0 +1,53 @@
+import functools
+import inspect
+import os
+from typing import Callable, Optional
+
+from .logger import get_logger
+
+logger = get_logger()
+
+
+def deprecated(since: str, remove_in: Optional[str] = None, alternative: Optional[str] = None) -> Callable:
+    """
+    Decorator to mark functions as deprecated.
+
+    :param since: String indicating the version since deprecation
+    :param remove_in: Optional string indicating the version when it will be removed
+    :param alternative: Optional string suggesting an alternative
+    :return: Decorated function
+    """
+
+    def decorator(func: Callable) -> Callable:
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            # Get the file name where the function is defined
+            file_name = os.path.basename(inspect.getfile(func))
+
+            # Construct the warning message
+            warning_parts = [
+                f'{func.__name__} in {file_name} has been deprecated since version {since}',
+                f'and will be removed in version {remove_in}' if remove_in else None,
+                f'Use {alternative} instead' if alternative else None
+            ]
+            warning_message = '. '.join(filter(None, warning_parts))
+
+            # Log the warning
+            logger.warning(warning_message)
+
+            return func(*args, **kwargs)
+
+        return wrapper
+
+    return decorator
+
+
+def deprecated_warning(logger, message: str):
+    """
+    Log a deprecation warning.
+
+    :param logger: Logger instance to log the warning
+    :param message: Warning message to log
+    """
+    logger.warning(f'Deprecated: {message}')
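Applied to a function (as it is to run_task in the toolbench_static hunk above), the decorator logs a warning on every call and then delegates unchanged. A quick sketch with a throwaway function:

```python
# Quick sketch of the decorator defined above; old_sum is a throwaway example.
@deprecated(since='1.0.0', remove_in='2.0.0', alternative='math.fsum')
def old_sum(values):
    return sum(values)


print(old_sum([1, 2, 3]))  # 6
# Each call logs one warning, with the parts joined by '. ':
# old_sum in <file>.py has been deprecated since version 1.0.0. and will be
# removed in version 2.0.0. Use math.fsum instead
```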