evalscope 0.17.0__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- {evalscope-0.17.0 → evalscope-1.0.0}/PKG-INFO +120 -70
- evalscope-0.17.0/evalscope.egg-info/PKG-INFO → evalscope-1.0.0/README.md +114 -93
- evalscope-1.0.0/evalscope/__init__.py +8 -0
- evalscope-1.0.0/evalscope/api/benchmark/__init__.py +3 -0
- evalscope-1.0.0/evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope-1.0.0/evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope-1.0.0/evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope-1.0.0/evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope-1.0.0/evalscope/api/benchmark/benchmark.py +321 -0
- evalscope-1.0.0/evalscope/api/benchmark/meta.py +115 -0
- evalscope-1.0.0/evalscope/api/dataset/__init__.py +2 -0
- evalscope-1.0.0/evalscope/api/dataset/dataset.py +349 -0
- evalscope-1.0.0/evalscope/api/dataset/loader.py +261 -0
- evalscope-1.0.0/evalscope/api/dataset/utils.py +143 -0
- evalscope-1.0.0/evalscope/api/evaluator/__init__.py +3 -0
- evalscope-1.0.0/evalscope/api/evaluator/cache.py +355 -0
- evalscope-1.0.0/evalscope/api/evaluator/evaluator.py +56 -0
- evalscope-1.0.0/evalscope/api/evaluator/state.py +264 -0
- evalscope-1.0.0/evalscope/api/filter/__init__.py +1 -0
- evalscope-1.0.0/evalscope/api/filter/filter.py +72 -0
- evalscope-1.0.0/evalscope/api/messages/__init__.py +11 -0
- evalscope-1.0.0/evalscope/api/messages/chat_message.py +198 -0
- evalscope-1.0.0/evalscope/api/messages/content.py +102 -0
- evalscope-1.0.0/evalscope/api/messages/utils.py +35 -0
- evalscope-1.0.0/evalscope/api/metric/__init__.py +2 -0
- evalscope-1.0.0/evalscope/api/metric/metric.py +55 -0
- evalscope-1.0.0/evalscope/api/metric/scorer.py +105 -0
- evalscope-1.0.0/evalscope/api/mixin/__init__.py +2 -0
- evalscope-1.0.0/evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope-1.0.0/evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope-1.0.0/evalscope/api/model/__init__.py +12 -0
- evalscope-1.0.0/evalscope/api/model/generate_config.py +157 -0
- evalscope-1.0.0/evalscope/api/model/model.py +383 -0
- evalscope-1.0.0/evalscope/api/model/model_output.py +285 -0
- evalscope-1.0.0/evalscope/api/registry.py +182 -0
- evalscope-1.0.0/evalscope/api/tool/__init__.py +3 -0
- evalscope-1.0.0/evalscope/api/tool/tool_call.py +101 -0
- evalscope-1.0.0/evalscope/api/tool/tool_info.py +173 -0
- evalscope-1.0.0/evalscope/api/tool/utils.py +64 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/ui/app_ui.py +2 -1
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/ui/multi_model.py +50 -25
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/ui/single_model.py +23 -11
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/utils/data_utils.py +42 -26
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/utils/text_utils.py +0 -2
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/utils/visualization.py +9 -4
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/arguments.py +6 -7
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/opencompass/api_meta_template.py +2 -1
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/opencompass/backend_manager.py +6 -3
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/utils/embedding.py +2 -1
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/utils/llm.py +13 -12
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/__init__.py +0 -2
- evalscope-0.17.0/evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py → evalscope-1.0.0/evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +1 -15
- evalscope-1.0.0/evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +76 -0
- evalscope-1.0.0/evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +53 -0
- evalscope-1.0.0/evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +42 -0
- evalscope-1.0.0/evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +47 -0
- evalscope-1.0.0/evalscope/benchmarks/aigc/t2i/tifa_adapter.py +26 -0
- evalscope-1.0.0/evalscope/benchmarks/aime/aime24_adapter.py +50 -0
- evalscope-1.0.0/evalscope/benchmarks/aime/aime25_adapter.py +46 -0
- evalscope-1.0.0/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope-1.0.0/evalscope/benchmarks/arc/arc_adapter.py +46 -0
- evalscope-1.0.0/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +148 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope-1.0.0/evalscope/benchmarks/bbh/bbh_adapter.py +175 -0
- evalscope-1.0.0/evalscope/benchmarks/bfcl/bfcl_adapter.py +258 -0
- evalscope-1.0.0/evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope-1.0.0/evalscope/benchmarks/ceval/ceval_adapter.py +170 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope-1.0.0/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +122 -0
- evalscope-1.0.0/evalscope/benchmarks/competition_math/competition_math_adapter.py +73 -0
- evalscope-1.0.0/evalscope/benchmarks/data_collection/data_collection_adapter.py +210 -0
- evalscope-1.0.0/evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/docmath/utils.py +4 -5
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope-1.0.0/evalscope/benchmarks/frames/frames_adapter.py +174 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope-1.0.0/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope-1.0.0/evalscope/benchmarks/general_qa/general_qa_adapter.py +94 -0
- evalscope-1.0.0/evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope-0.17.0/evalscope/benchmarks/gpqa/chain_of_thought.txt → evalscope-1.0.0/evalscope/benchmarks/gpqa/prompt.py +12 -5
- evalscope-1.0.0/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +76 -0
- evalscope-1.0.0/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +62 -0
- evalscope-1.0.0/evalscope/benchmarks/hle/hle_adapter.py +152 -0
- evalscope-1.0.0/evalscope/benchmarks/humaneval/humaneval_adapter.py +124 -0
- evalscope-1.0.0/evalscope/benchmarks/ifeval/ifeval_adapter.py +83 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/ifeval/instructions.py +109 -64
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope-1.0.0/evalscope/benchmarks/iquiz/iquiz_adapter.py +35 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope-1.0.0/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +138 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope-1.0.0/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope-1.0.0/evalscope/benchmarks/math_500/math_500_adapter.py +51 -0
- evalscope-1.0.0/evalscope/benchmarks/mmlu/mmlu_adapter.py +107 -0
- evalscope-1.0.0/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +94 -0
- evalscope-1.0.0/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope-1.0.0/evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope-1.0.0/evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +388 -0
- evalscope-1.0.0/evalscope/benchmarks/process_bench/process_bench_adapter.py +170 -0
- evalscope-1.0.0/evalscope/benchmarks/race/race_adapter.py +49 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope-0.17.0/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt → evalscope-1.0.0/evalscope/benchmarks/super_gpqa/prompt.py +14 -16
- evalscope-1.0.0/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope-1.0.0/evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope-1.0.0/evalscope/benchmarks/tau_bench/tau_bench_adapter.py +168 -0
- evalscope-1.0.0/evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope-1.0.0/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +74 -0
- evalscope-1.0.0/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +92 -0
- evalscope-1.0.0/evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/cli/cli.py +2 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/cli/start_server.py +6 -3
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/collections/__init__.py +2 -10
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/collections/sampler.py +10 -10
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/collections/schema.py +13 -11
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/config.py +95 -54
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/constants.py +34 -82
- evalscope-1.0.0/evalscope/evaluator/__init__.py +3 -0
- evalscope-1.0.0/evalscope/evaluator/evaluator.py +337 -0
- evalscope-1.0.0/evalscope/filters/__init__.py +2 -0
- evalscope-1.0.0/evalscope/filters/extraction.py +126 -0
- evalscope-1.0.0/evalscope/filters/selection.py +57 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/__init__.py +16 -14
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/llm_judge.py +37 -34
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/math_parser.py +27 -22
- evalscope-1.0.0/evalscope/metrics/metric.py +307 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/metrics.py +41 -25
- {evalscope-0.17.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models → evalscope-1.0.0/evalscope/metrics/t2v_metrics}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope-1.0.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope-1.0.0/evalscope/models/__init__.py +26 -0
- evalscope-1.0.0/evalscope/models/mockllm.py +65 -0
- evalscope-1.0.0/evalscope/models/model_apis.py +47 -0
- evalscope-1.0.0/evalscope/models/modelscope.py +455 -0
- evalscope-1.0.0/evalscope/models/openai_compatible.py +123 -0
- evalscope-1.0.0/evalscope/models/text2image_model.py +124 -0
- evalscope-1.0.0/evalscope/models/utils/openai.py +698 -0
- evalscope-1.0.0/evalscope/perf/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/arguments.py +13 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/benchmark.py +39 -39
- evalscope-1.0.0/evalscope/perf/http_client.py +122 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/main.py +2 -2
- evalscope-1.0.0/evalscope/perf/plugin/__init__.py +3 -0
- evalscope-1.0.0/evalscope/perf/plugin/api/__init__.py +4 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/plugin/api/base.py +22 -4
- evalscope-1.0.0/evalscope/perf/plugin/api/custom_api.py +250 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope-1.0.0/evalscope/perf/plugin/api/default_api.py +105 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/plugin/api/openai_api.py +28 -28
- evalscope-1.0.0/evalscope/perf/plugin/datasets/__init__.py +10 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/plugin/datasets/base.py +22 -1
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/plugin/datasets/custom.py +4 -2
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope-1.0.0/evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/plugin/datasets/line_by_line.py +4 -2
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/plugin/datasets/longalpaca.py +4 -2
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/plugin/datasets/openqa.py +6 -3
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope-1.0.0/evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope-1.0.0/evalscope/perf/plugin/registry.py +74 -0
- evalscope-1.0.0/evalscope/perf/utils/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/utils/benchmark_util.py +18 -22
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/utils/db_util.py +81 -60
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/utils/local_server.py +8 -3
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/utils/rich_display.py +16 -10
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/report/__init__.py +2 -2
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/report/combinator.py +18 -12
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/report/generator.py +101 -6
- evalscope-0.17.0/evalscope/report/utils.py → evalscope-1.0.0/evalscope/report/report.py +8 -6
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/run.py +26 -44
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/summarizer.py +1 -1
- evalscope-1.0.0/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/utils/__init__.py +21 -2
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/utils/chat_service.py +2 -1
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/utils/deprecation_utils.py +12 -1
- evalscope-1.0.0/evalscope/utils/function_utils.py +29 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/utils/io_utils.py +110 -5
- evalscope-1.0.0/evalscope/utils/json_schema.py +208 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/utils/logger.py +51 -12
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/utils/model_utils.py +10 -7
- evalscope-1.0.0/evalscope/utils/multi_choices.py +271 -0
- evalscope-1.0.0/evalscope/utils/url_utils.py +65 -0
- evalscope-1.0.0/evalscope/version.py +4 -0
- evalscope-0.17.0/README.md → evalscope-1.0.0/evalscope.egg-info/PKG-INFO +143 -66
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope.egg-info/SOURCES.txt +67 -42
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope.egg-info/requires.txt +30 -9
- {evalscope-0.17.0 → evalscope-1.0.0}/requirements/aigc.txt +1 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/requirements/app.txt +1 -1
- evalscope-1.0.0/requirements/dev.txt +5 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/requirements/framework.txt +7 -4
- {evalscope-0.17.0 → evalscope-1.0.0}/setup.cfg +15 -6
- {evalscope-0.17.0 → evalscope-1.0.0}/setup.py +33 -15
- {evalscope-0.17.0 → evalscope-1.0.0}/tests/aigc/test_t2i.py +22 -4
- evalscope-1.0.0/tests/benchmark/test_eval.py +386 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/tests/cli/test_all.py +21 -7
- {evalscope-0.17.0 → evalscope-1.0.0}/tests/cli/test_collection.py +13 -4
- {evalscope-0.17.0 → evalscope-1.0.0}/tests/cli/test_custom.py +22 -15
- {evalscope-0.17.0 → evalscope-1.0.0}/tests/perf/test_perf.py +29 -2
- evalscope-1.0.0/tests/rag/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/tests/rag/test_clip_benchmark.py +1 -0
- evalscope-1.0.0/tests/vlm/__init__.py +1 -0
- evalscope-0.17.0/evalscope/__init__.py +0 -5
- evalscope-0.17.0/evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope-0.17.0/evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope-0.17.0/evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope-0.17.0/evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope-0.17.0/evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope-0.17.0/evalscope/benchmarks/aime/aime24_adapter.py +0 -52
- evalscope-0.17.0/evalscope/benchmarks/aime/aime25_adapter.py +0 -52
- evalscope-0.17.0/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -107
- evalscope-0.17.0/evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope-0.17.0/evalscope/benchmarks/arc/arc_adapter.py +0 -159
- evalscope-0.17.0/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +0 -122
- evalscope-0.17.0/evalscope/benchmarks/bbh/bbh_adapter.py +0 -247
- evalscope-0.17.0/evalscope/benchmarks/benchmark.py +0 -81
- evalscope-0.17.0/evalscope/benchmarks/bfcl/bfcl_adapter.py +0 -237
- evalscope-0.17.0/evalscope/benchmarks/ceval/ceval_adapter.py +0 -238
- evalscope-0.17.0/evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope-0.17.0/evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope-0.17.0/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -213
- evalscope-0.17.0/evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope-0.17.0/evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope-0.17.0/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -125
- evalscope-0.17.0/evalscope/benchmarks/data_adapter.py +0 -523
- evalscope-0.17.0/evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -72
- evalscope-0.17.0/evalscope/benchmarks/docmath/docmath_adapter.py +0 -85
- evalscope-0.17.0/evalscope/benchmarks/filters.py +0 -59
- evalscope-0.17.0/evalscope/benchmarks/frames/frames_adapter.py +0 -91
- evalscope-0.17.0/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +0 -118
- evalscope-0.17.0/evalscope/benchmarks/general_qa/general_qa_adapter.py +0 -154
- evalscope-0.17.0/evalscope/benchmarks/gpqa/gpqa_adapter.py +0 -129
- evalscope-0.17.0/evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope-0.17.0/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -156
- evalscope-0.17.0/evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope-0.17.0/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -151
- evalscope-0.17.0/evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope-0.17.0/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -109
- evalscope-0.17.0/evalscope/benchmarks/ifeval/ifeval_adapter.py +0 -54
- evalscope-0.17.0/evalscope/benchmarks/iquiz/iquiz_adapter.py +0 -70
- evalscope-0.17.0/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -88
- evalscope-0.17.0/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +0 -82
- evalscope-0.17.0/evalscope/benchmarks/math_500/math_500_adapter.py +0 -58
- evalscope-0.17.0/evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope-0.17.0/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -280
- evalscope-0.17.0/evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope-0.17.0/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +0 -113
- evalscope-0.17.0/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +0 -185
- evalscope-0.17.0/evalscope/benchmarks/musr/musr_adapter.py +0 -74
- evalscope-0.17.0/evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +0 -348
- evalscope-0.17.0/evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope-0.17.0/evalscope/benchmarks/process_bench/process_bench_adapter.py +0 -102
- evalscope-0.17.0/evalscope/benchmarks/race/race.py +0 -104
- evalscope-0.17.0/evalscope/benchmarks/race/race_adapter.py +0 -135
- evalscope-0.17.0/evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope-0.17.0/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +0 -209
- evalscope-0.17.0/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope-0.17.0/evalscope/benchmarks/tool_bench/tool_bench_adapter.py +0 -75
- evalscope-0.17.0/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope-0.17.0/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -142
- evalscope-0.17.0/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope-0.17.0/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -287
- evalscope-0.17.0/evalscope/benchmarks/utils.py +0 -59
- evalscope-0.17.0/evalscope/benchmarks/winogrande/winogrande_adapter.py +0 -60
- evalscope-0.17.0/evalscope/collections/evaluator.py +0 -375
- evalscope-0.17.0/evalscope/evaluator/__init__.py +0 -3
- evalscope-0.17.0/evalscope/evaluator/evaluator.py +0 -481
- evalscope-0.17.0/evalscope/metrics/completion_parsers.py +0 -220
- evalscope-0.17.0/evalscope/metrics/named_metrics.py +0 -55
- evalscope-0.17.0/evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope-0.17.0/evalscope/models/__init__.py +0 -53
- evalscope-0.17.0/evalscope/models/adapters/__init__.py +0 -19
- evalscope-0.17.0/evalscope/models/adapters/base_adapter.py +0 -80
- evalscope-0.17.0/evalscope/models/adapters/bfcl_adapter.py +0 -244
- evalscope-0.17.0/evalscope/models/adapters/chat_adapter.py +0 -204
- evalscope-0.17.0/evalscope/models/adapters/choice_adapter.py +0 -218
- evalscope-0.17.0/evalscope/models/adapters/custom_adapter.py +0 -67
- evalscope-0.17.0/evalscope/models/adapters/server_adapter.py +0 -234
- evalscope-0.17.0/evalscope/models/adapters/t2i_adapter.py +0 -76
- evalscope-0.17.0/evalscope/models/custom/__init__.py +0 -4
- evalscope-0.17.0/evalscope/models/custom/custom_model.py +0 -50
- evalscope-0.17.0/evalscope/models/custom/dummy_model.py +0 -99
- evalscope-0.17.0/evalscope/models/local_model.py +0 -128
- evalscope-0.17.0/evalscope/models/model.py +0 -189
- evalscope-0.17.0/evalscope/models/register.py +0 -55
- evalscope-0.17.0/evalscope/perf/http_client.py +0 -176
- evalscope-0.17.0/evalscope/perf/plugin/__init__.py +0 -2
- evalscope-0.17.0/evalscope/perf/plugin/api/__init__.py +0 -3
- evalscope-0.17.0/evalscope/perf/plugin/api/custom_api.py +0 -92
- evalscope-0.17.0/evalscope/perf/plugin/datasets/__init__.py +0 -7
- evalscope-0.17.0/evalscope/perf/plugin/registry.py +0 -54
- evalscope-0.17.0/evalscope/version.py +0 -4
- evalscope-0.17.0/requirements/dev.txt +0 -5
- evalscope-0.17.0/tests/cli/test_run.py +0 -501
- {evalscope-0.17.0 → evalscope-1.0.0}/LICENSE +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/MANIFEST.in +0 -0
- {evalscope-0.17.0/evalscope/backend → evalscope-1.0.0/evalscope/api}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/app.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/arguments.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/constants.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/ui/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/ui/sidebar.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/ui/visualization.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/utils/localization.py +0 -0
- {evalscope-0.17.0/evalscope/backend/rag_eval/clip_benchmark/tasks → evalscope-1.0.0/evalscope/backend}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/base.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/backend_manager.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
- {evalscope-0.17.0/evalscope/backend/rag_eval/utils → evalscope-1.0.0/evalscope/backend/rag_eval/clip_benchmark/tasks}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/aigc → evalscope-1.0.0/evalscope/backend/rag_eval/utils}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/utils/clip.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/utils/tools.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/aigc/t2i → evalscope-1.0.0/evalscope/benchmarks/aigc}/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/aime → evalscope-1.0.0/evalscope/benchmarks/aigc/i2i}/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/alpaca_eval → evalscope-1.0.0/evalscope/benchmarks/aigc/t2i}/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/arena_hard → evalscope-1.0.0/evalscope/benchmarks/aime}/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/bfcl → evalscope-1.0.0/evalscope/benchmarks/alpaca_eval}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/chinese_simple_qa → evalscope-1.0.0/evalscope/benchmarks/arena_hard}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/data_collection → evalscope-1.0.0/evalscope/benchmarks/bfcl}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/docmath → evalscope-1.0.0/evalscope/benchmarks/chinese_simple_qa}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/drop → evalscope-1.0.0/evalscope/benchmarks/data_collection}/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/frames → evalscope-1.0.0/evalscope/benchmarks/docmath}/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/general_arena → evalscope-1.0.0/evalscope/benchmarks/drop}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/drop/utils.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/general_mcq → evalscope-1.0.0/evalscope/benchmarks/frames}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/frames/utils.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/gpqa → evalscope-1.0.0/evalscope/benchmarks/general_arena}/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/ifeval → evalscope-1.0.0/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/iquiz → evalscope-1.0.0/evalscope/benchmarks/gpqa}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/live_code_bench → evalscope-1.0.0/evalscope/benchmarks/hle}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/maritime_bench → evalscope-1.0.0/evalscope/benchmarks/ifeval}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/math_500 → evalscope-1.0.0/evalscope/benchmarks/iquiz}/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/mmlu_pro → evalscope-1.0.0/evalscope/benchmarks/live_code_bench}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/mmlu_redux → evalscope-1.0.0/evalscope/benchmarks/maritime_bench}/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/musr → evalscope-1.0.0/evalscope/benchmarks/math_500}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/needle_haystack → evalscope-1.0.0/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/process_bench → evalscope-1.0.0/evalscope/benchmarks/mmlu_redux}/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/simple_qa → evalscope-1.0.0/evalscope/benchmarks/musr}/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/super_gpqa → evalscope-1.0.0/evalscope/benchmarks/needle_haystack}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/needle_haystack/utils.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/tool_bench → evalscope-1.0.0/evalscope/benchmarks/process_bench}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/winogrande → evalscope-1.0.0/evalscope/benchmarks/simple_qa}/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/metrics/t2v_metrics/models → evalscope-1.0.0/evalscope/benchmarks/super_gpqa}/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model → evalscope-1.0.0/evalscope/benchmarks/tau_bench}/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward → evalscope-1.0.0/evalscope/benchmarks/tool_bench}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/tool_bench/utils.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5 → evalscope-1.0.0/evalscope/benchmarks/winogrande}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/cli/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/cli/base.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/cli/start_app.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/cli/start_eval.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/rouge_metric.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/clipscore.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/constants.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/itmscore.py +0 -0
- {evalscope-0.17.0/evalscope/perf → evalscope-1.0.0/evalscope/metrics/t2v_metrics/models}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/perf/utils → evalscope-1.0.0/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/third_party/thinkbench/tools → evalscope-1.0.0/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/model.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/utils.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +0 -0
- {evalscope-0.17.0/tests/rag → evalscope-1.0.0/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/score.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/vqascore.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/utils/analysis_result.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/utils/handler.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/utils/log_utils.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/README.md +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/default_task.json +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/eval.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/infer.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/utils.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/thinkbench/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/thinkbench/eval.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/thinkbench/infer.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/toolbench_static/README.md +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/toolbench_static/config_default.json +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/utils/argument_utils.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/utils/import_utils.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/requirements/docs.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/requirements/opencompass.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/requirements/perf.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/requirements/rag.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/requirements/vlmeval.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/requirements.txt +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/tests/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/tests/aigc/__init__.py +0 -0
- {evalscope-0.17.0/tests/cli → evalscope-1.0.0/tests/benchmark}/__init__.py +0 -0
- {evalscope-0.17.0/tests/perf → evalscope-1.0.0/tests/cli}/__init__.py +0 -0
- {evalscope-0.17.0/tests/swift → evalscope-1.0.0/tests/perf}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/tests/rag/test_mteb.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/tests/rag/test_ragas.py +0 -0
- {evalscope-0.17.0/tests/vlm → evalscope-1.0.0/tests/swift}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/tests/swift/test_run_swift_eval.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/tests/swift/test_run_swift_vlm_eval.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/tests/test_run_all.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/tests/utils.py +0 -0
- {evalscope-0.17.0 → evalscope-1.0.0}/tests/vlm/test_vlmeval.py +0 -0
@@ -1,19 +1,20 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.17.0
+Version: 1.0.0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
 Author-email: contact@modelscope.cn
+License: Apache License 2.0
 Keywords: python,llm,evaluation
 Classifier: Development Status :: 4 - Beta
-Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
-
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 Provides-Extra: opencompass
 Provides-Extra: vlmeval
@@ -22,6 +23,7 @@ Provides-Extra: perf
 Provides-Extra: app
 Provides-Extra: aigc
 Provides-Extra: dev
+Provides-Extra: docs
 Provides-Extra: all
 License-File: LICENSE

@@ -55,25 +57,26 @@ License-File: LICENSE
 - [📝 Introduction](#-introduction)
 - [☎ User Groups](#-user-groups)
 - [🎉 News](#-news)
-- [🛠️
-- [Method 1
-- [Method 2
+- [🛠️ Environment Setup](#️-environment-setup)
+- [Method 1. Install via pip](#method-1-install-via-pip)
+- [Method 2. Install from source](#method-2-install-from-source)
 - [🚀 Quick Start](#-quick-start)
 - [Method 1. Using Command Line](#method-1-using-command-line)
 - [Method 2. Using Python Code](#method-2-using-python-code)
 - [Basic Parameter](#basic-parameter)
 - [Output Results](#output-results)
 - [📈 Visualization of Evaluation Results](#-visualization-of-evaluation-results)
-- [🌐 Evaluation of
+- [🌐 Evaluation of Model API](#-evaluation-of-model-api)
 - [⚙️ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
-- [Parameter](#parameter)
-- [Evaluation
+- [Parameter Description](#parameter-description)
+- [🧪 Other Evaluation Backends](#-other-evaluation-backends)
 - [📈 Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
 - [🖊️ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
-- [
+- [⚔️ Arena Mode](#️-arena-mode)
 - [👷♂️ Contribution](#️-contribution)
+- [📚 Citation](#-citation)
 - [🔜 Roadmap](#-roadmap)
-- [Star History](
+- [⭐ Star History](#-star-history)


 ## 📝 Introduction
@@ -138,6 +141,15 @@ Please scan the QR code below to join our community groups:
|
|
|
138
141
|
|
|
139
142
|
## 🎉 News
|
|
140
143
|
|
|
144
|
+
> [!IMPORTANT]
|
|
145
|
+
> **Version 1.0 Refactoring**
|
|
146
|
+
>
|
|
147
|
+
> Version 1.0 introduces a major overhaul of the evaluation framework, establishing a new, more modular and extensible API layer under `evalscope/api`. Key improvements include standardized data models for benchmarks, samples, and results; a registry-based design for components such as benchmarks and metrics; and a rewritten core evaluator that orchestrates the new architecture. Existing benchmark adapters have been migrated to this API, resulting in cleaner, more consistent, and easier-to-maintain implementations.
|
|
148
|
+
|
|
149
|
+
- 🔥 **[2025.08.22]** Version 1.0 Refactoring.
|
|
150
|
+
- 🔥 **[2025.07.18]** The model stress testing now supports randomly generating image-text data for multimodal model evaluation. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#id4).
|
|
151
|
+
- 🔥 **[2025.07.16]** Support for [τ-bench](https://github.com/sierra-research/tau-bench) has been added, enabling the evaluation of AI Agent performance and reliability in real-world scenarios involving dynamic user and tool interactions. For usage instructions, please refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/llm.html#bench).
|
|
152
|
+
- 🔥 **[2025.07.14]** Support for "Humanity's Last Exam" ([Humanity's-Last-Exam](https://modelscope.cn/datasets/cais/hle)), a highly challenging evaluation benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/llm.html#humanity-s-last-exam).
|
|
141
153
|
- 🔥 **[2025.07.03]** Refactored Arena Mode: now supports custom model battles, outputs a model leaderboard, and provides battle result visualization. See [reference](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html) for details.
|
|
142
154
|
- 🔥 **[2025.06.28]** Optimized custom dataset evaluation: now supports evaluation without reference answers. Enhanced LLM judge usage, with built-in modes for "scoring directly without reference answers" and "checking answer consistency with reference answers". See [reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#qa) for details.
|
|
143
155
|
- 🔥 **[2025.06.19]** Added support for the [BFCL-v3](https://modelscope.cn/datasets/AI-ModelScope/bfcl_v3) benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
|
|
@@ -145,6 +157,8 @@ Please scan the QR code below to join our community groups:
|
|
|
145
157
|
- 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
|
|
146
158
|
- 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
|
|
147
159
|
- 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
|
|
160
|
+
<details><summary>More</summary>
|
|
161
|
+
|
|
148
162
|
- 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
|
|
149
163
|
- 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
|
|
150
164
|
- 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
|
|
@@ -158,8 +172,6 @@ Please scan the QR code below to join our community groups:
|
|
|
158
172
|
- 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
|
|
159
173
|
- 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/en/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
|
|
160
174
|
- 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
|
|
161
|
-
<details><summary>More</summary>
|
|
162
|
-
|
|
163
175
|
- 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
|
|
164
176
|
- 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
|
|
165
177
|
- 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
|
|
@@ -183,58 +195,87 @@ Please scan the QR code below to join our community groups:
|
|
|
183
195
|
|
|
184
196
|
</details>
|
|
185
197
|
|
|
186
|
-
## 🛠️
|
|
187
|
-
|
|
188
|
-
|
|
198
|
+
## 🛠️ Environment Setup
|
|
199
|
+
|
|
200
|
+
### Method 1. Install via pip
|
|
201
|
+
|
|
202
|
+
We recommend using conda to manage your environment and pip to install dependencies. This allows you to use the latest evalscope PyPI package.
|
|
189
203
|
|
|
190
204
|
1. Create a conda environment (optional)
|
|
205
|
+
```shell
|
|
206
|
+
# Python 3.10 is recommended
|
|
207
|
+
conda create -n evalscope python=3.10
|
|
208
|
+
|
|
209
|
+
# Activate the conda environment
|
|
210
|
+
conda activate evalscope
|
|
211
|
+
```
|
|
212
|
+
2. Install dependencies via pip
|
|
213
|
+
```shell
|
|
214
|
+
pip install evalscope
|
|
215
|
+
```
|
|
216
|
+
3. Install additional dependencies (optional)
|
|
217
|
+
- To use model service inference benchmarking features, install the perf dependency:
|
|
191
218
|
```shell
|
|
192
|
-
|
|
193
|
-
conda create -n evalscope python=3.10
|
|
194
|
-
# Activate the conda environment
|
|
195
|
-
conda activate evalscope
|
|
219
|
+
pip install 'evalscope[perf]'
|
|
196
220
|
```
|
|
197
|
-
|
|
198
|
-
|
|
221
|
+
- To use visualization features, install the app dependency:
|
|
222
|
+
```shell
|
|
223
|
+
pip install 'evalscope[app]'
|
|
224
|
+
```
|
|
225
|
+
- If you need to use other evaluation backends, you can install OpenCompass, VLMEvalKit, or RAGEval as needed:
|
|
199
226
|
```shell
|
|
200
|
-
pip install evalscope
|
|
201
|
-
|
|
202
|
-
pip install 'evalscope[
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
pip install 'evalscope[
|
|
207
|
-
pip install 'evalscope[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
|
|
227
|
+
pip install 'evalscope[opencompass]'
|
|
228
|
+
pip install 'evalscope[vlmeval]'
|
|
229
|
+
pip install 'evalscope[rag]'
|
|
230
|
+
```
|
|
231
|
+
- To install all dependencies:
|
|
232
|
+
```shell
|
|
233
|
+
pip install 'evalscope[all]'
|
|
208
234
|
```
|
|
209
235
|
|
|
210
|
-
> [!
|
|
211
|
-
>
|
|
236
|
+
> [!NOTE]
|
|
237
|
+
> The project has been renamed to `evalscope`. For version `v0.4.3` or earlier, you can install it with:
|
|
212
238
|
> ```shell
|
|
213
|
-
>
|
|
239
|
+
> pip install llmuses<=0.4.3
|
|
214
240
|
> ```
|
|
215
|
-
>
|
|
216
|
-
> ```
|
|
241
|
+
> Then, import related dependencies using `llmuses`:
|
|
242
|
+
> ```python
|
|
217
243
|
> from llmuses import ...
|
|
218
244
|
> ```
|
|
219
245
|
|
|
220
|
-
### Method 2
|
|
221
|
-
1. Download the source code
|
|
222
|
-
```shell
|
|
223
|
-
git clone https://github.com/modelscope/evalscope.git
|
|
224
|
-
```
|
|
246
|
+
### Method 2. Install from source
|
|
225
247
|
|
|
248
|
+
Installing from source allows you to use the latest code and makes it easier for further development and debugging.
|
|
249
|
+
|
|
250
|
+
1. Clone the source code
|
|
251
|
+
```shell
|
|
252
|
+
git clone https://github.com/modelscope/evalscope.git
|
|
253
|
+
```
|
|
226
254
|
2. Install dependencies
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
255
|
+
```shell
|
|
256
|
+
cd evalscope/
|
|
257
|
+
|
|
258
|
+
pip install -e .
|
|
259
|
+
```
|
|
260
|
+
3. Install additional dependencies
|
|
261
|
+
- To use model service inference benchmarking features, install the perf dependency:
|
|
262
|
+
```shell
|
|
263
|
+
pip install '.[perf]'
|
|
264
|
+
```
|
|
265
|
+
- To use visualization features, install the app dependency:
|
|
266
|
+
```shell
|
|
267
|
+
pip install '.[app]'
|
|
268
|
+
```
|
|
269
|
+
- If you need to use other evaluation backends, you can install OpenCompass, VLMEvalKit, or RAGEval as needed:
|
|
270
|
+
```shell
|
|
271
|
+
pip install '.[opencompass]'
|
|
272
|
+
pip install '.[vlmeval]'
|
|
273
|
+
pip install '.[rag]'
|
|
274
|
+
```
|
|
275
|
+
- To install all dependencies:
|
|
276
|
+
```shell
|
|
277
|
+
pip install '.[all]'
|
|
278
|
+
```
|
|
238
279
|
|
|
239
280
|
|
|
240
281
|
## 🚀 Quick Start
|
|
@@ -255,33 +296,31 @@ evalscope eval \
|
|
|
255
296
|
|
|
256
297
|
When using Python code for evaluation, you need to submit the evaluation task using the `run_task` function, passing a `TaskConfig` as a parameter. It can also be a Python dictionary, yaml file path, or json file path, for example:
|
|
257
298
|
|
|
258
|
-
**Using
|
|
299
|
+
**Using `TaskConfig`**
|
|
259
300
|
|
|
260
301
|
```python
|
|
261
|
-
from evalscope
|
|
302
|
+
from evalscope import run_task, TaskConfig
|
|
262
303
|
|
|
263
|
-
task_cfg =
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
304
|
+
task_cfg = TaskConfig(
|
|
305
|
+
model='Qwen/Qwen2.5-0.5B-Instruct',
|
|
306
|
+
datasets=['gsm8k', 'arc'],
|
|
307
|
+
limit=5
|
|
308
|
+
)
|
|
268
309
|
|
|
269
310
|
run_task(task_cfg=task_cfg)
|
|
270
311
|
```
|
|
271
|
-
|
|
272
312
|
<details><summary>More Startup Methods</summary>
|
|
273
313
|
|
|
274
|
-
**Using
|
|
314
|
+
**Using Python Dictionary**
|
|
275
315
|
|
|
276
316
|
```python
|
|
277
317
|
from evalscope.run import run_task
|
|
278
|
-
from evalscope.config import TaskConfig
|
|
279
318
|
|
|
280
|
-
task_cfg =
|
|
281
|
-
model
|
|
282
|
-
datasets
|
|
283
|
-
limit
|
|
284
|
-
|
|
319
|
+
task_cfg = {
|
|
320
|
+
'model': 'Qwen/Qwen2.5-0.5B-Instruct',
|
|
321
|
+
'datasets': ['gsm8k', 'arc'],
|
|
322
|
+
'limit': 5
|
|
323
|
+
}
|
|
285
324
|
|
|
286
325
|
run_task(task_cfg=task_cfg)
|
|
287
326
|
```
|
|
@@ -384,7 +423,7 @@ To create a public link, set `share=True` in `launch()`.
|
|
|
384
423
|
|
|
385
424
|
For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)
|
|
386
425
|
|
|
387
|
-
## 🌐 Evaluation of
|
|
426
|
+
## 🌐 Evaluation of Model API
|
|
388
427
|
|
|
389
428
|
Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:
|
|
390
429
|
|
|
@@ -435,7 +474,7 @@ evalscope eval \
|
|
|
435
474
|
Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)
|
|
436
475
|
|
|
437
476
|
|
|
438
|
-
## Evaluation
|
|
477
|
+
## 🧪 Other Evaluation Backends
|
|
439
478
|
EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
|
|
440
479
|
- **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
|
|
441
480
|
- [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
|
|
@@ -508,6 +547,17 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
|
|
|
508
547
|
</table>
|
|
509
548
|
</a>
|
|
510
549
|
|
|
550
|
+
## 📚 Citation
|
|
551
|
+
|
|
552
|
+
```bibtex
|
|
553
|
+
@misc{evalscope_2024,
|
|
554
|
+
title={{EvalScope}: Evaluation Framework for Large Models},
|
|
555
|
+
author={ModelScope Team},
|
|
556
|
+
year={2024},
|
|
557
|
+
url={https://github.com/modelscope/evalscope}
|
|
558
|
+
}
|
|
559
|
+
```
|
|
560
|
+
|
|
511
561
|
## 🔜 Roadmap
|
|
512
562
|
- [x] Support for better evaluation report visualization
|
|
513
563
|
- [x] Support for mixed evaluations across multiple datasets
|
|
@@ -523,6 +573,6 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
|
|
|
523
573
|
- [x] MBPP
|
|
524
574
|
|
|
525
575
|
|
|
526
|
-
## Star History
|
|
576
|
+
## ⭐ Star History
|
|
527
577
|
|
|
528
578
|
[](https://star-history.com/#modelscope/evalscope&Date)
|
|
@@ -1,30 +1,3 @@
-Metadata-Version: 2.1
-Name: evalscope
-Version: 0.17.0
-Summary: EvalScope: Lightweight LLMs Evaluation Framework
-Home-page: https://github.com/modelscope/evalscope
-Author: ModelScope team
-Author-email: contact@modelscope.cn
-Keywords: python,llm,evaluation
-Classifier: Development Status :: 4 - Beta
-Classifier: License :: OSI Approved :: Apache Software License
-Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.8
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Requires-Python: >=3.8
-Description-Content-Type: text/markdown
-Provides-Extra: opencompass
-Provides-Extra: vlmeval
-Provides-Extra: rag
-Provides-Extra: perf
-Provides-Extra: app
-Provides-Extra: aigc
-Provides-Extra: dev
-Provides-Extra: all
-License-File: LICENSE
-
 <p align="center">
 <br>
 <img src="docs/en/_static/images/evalscope_logo.png"/>
@@ -55,25 +28,26 @@ License-File: LICENSE
 - [📝 Introduction](#-introduction)
 - [☎ User Groups](#-user-groups)
 - [🎉 News](#-news)
-- [🛠️
-- [Method 1
-- [Method 2
+- [🛠️ Environment Setup](#️-environment-setup)
+- [Method 1. Install via pip](#method-1-install-via-pip)
+- [Method 2. Install from source](#method-2-install-from-source)
 - [🚀 Quick Start](#-quick-start)
 - [Method 1. Using Command Line](#method-1-using-command-line)
 - [Method 2. Using Python Code](#method-2-using-python-code)
 - [Basic Parameter](#basic-parameter)
 - [Output Results](#output-results)
 - [📈 Visualization of Evaluation Results](#-visualization-of-evaluation-results)
-- [🌐 Evaluation of
+- [🌐 Evaluation of Model API](#-evaluation-of-model-api)
 - [⚙️ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
-- [Parameter](#parameter)
-- [Evaluation
+- [Parameter Description](#parameter-description)
+- [🧪 Other Evaluation Backends](#-other-evaluation-backends)
 - [📈 Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
 - [🖊️ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
-- [
+- [⚔️ Arena Mode](#️-arena-mode)
 - [👷♂️ Contribution](#️-contribution)
+- [📚 Citation](#-citation)
 - [🔜 Roadmap](#-roadmap)
-- [Star History](
+- [⭐ Star History](#-star-history)
 
 
 ## 📝 Introduction
@@ -138,6 +112,15 @@ Please scan the QR code below to join our community groups:
 
 ## 🎉 News
 
+> [!IMPORTANT]
+> **Version 1.0 Refactoring**
+>
+> Version 1.0 introduces a major overhaul of the evaluation framework, establishing a new, more modular and extensible API layer under `evalscope/api`. Key improvements include standardized data models for benchmarks, samples, and results; a registry-based design for components such as benchmarks and metrics; and a rewritten core evaluator that orchestrates the new architecture. Existing benchmark adapters have been migrated to this API, resulting in cleaner, more consistent, and easier-to-maintain implementations.
+
+- 🔥 **[2025.08.22]** Version 1.0 Refactoring.
+- 🔥 **[2025.07.18]** The model stress testing now supports randomly generating image-text data for multimodal model evaluation. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#id4).
+- 🔥 **[2025.07.16]** Support for [τ-bench](https://github.com/sierra-research/tau-bench) has been added, enabling the evaluation of AI Agent performance and reliability in real-world scenarios involving dynamic user and tool interactions. For usage instructions, please refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/llm.html#bench).
+- 🔥 **[2025.07.14]** Support for "Humanity's Last Exam" ([Humanity's-Last-Exam](https://modelscope.cn/datasets/cais/hle)), a highly challenging evaluation benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/llm.html#humanity-s-last-exam).
 - 🔥 **[2025.07.03]** Refactored Arena Mode: now supports custom model battles, outputs a model leaderboard, and provides battle result visualization. See [reference](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html) for details.
 - 🔥 **[2025.06.28]** Optimized custom dataset evaluation: now supports evaluation without reference answers. Enhanced LLM judge usage, with built-in modes for "scoring directly without reference answers" and "checking answer consistency with reference answers". See [reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#qa) for details.
 - 🔥 **[2025.06.19]** Added support for the [BFCL-v3](https://modelscope.cn/datasets/AI-ModelScope/bfcl_v3) benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
@@ -145,6 +128,8 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
 - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
 - 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
+<details><summary>More</summary>
+
 - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
 - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
 - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
@@ -158,8 +143,6 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
 - 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/en/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
 - 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
-<details><summary>More</summary>
-
 - 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
 - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
@@ -183,58 +166,87 @@ Please scan the QR code below to join our community groups:
 
 </details>
 
-## 🛠️
-
-
+## 🛠️ Environment Setup
+
+### Method 1. Install via pip
+
+We recommend using conda to manage your environment and pip to install dependencies. This allows you to use the latest evalscope PyPI package.
 
 1. Create a conda environment (optional)
+```shell
+# Python 3.10 is recommended
+conda create -n evalscope python=3.10
+
+# Activate the conda environment
+conda activate evalscope
+```
+2. Install dependencies via pip
+```shell
+pip install evalscope
+```
+3. Install additional dependencies (optional)
+- To use model service inference benchmarking features, install the perf dependency:
 ```shell
-
-conda create -n evalscope python=3.10
-# Activate the conda environment
-conda activate evalscope
+pip install 'evalscope[perf]'
 ```
-
-2. Install dependencies using pip
+- To use visualization features, install the app dependency:
 ```shell
-pip install evalscope
-
-
-
-pip install 'evalscope[
-pip install 'evalscope[
-pip install 'evalscope[
-
+pip install 'evalscope[app]'
+```
+- If you need to use other evaluation backends, you can install OpenCompass, VLMEvalKit, or RAGEval as needed:
+```shell
+pip install 'evalscope[opencompass]'
+pip install 'evalscope[vlmeval]'
+pip install 'evalscope[rag]'
+```
+- To install all dependencies:
+```shell
+pip install 'evalscope[all]'
 ```
 
-> [!
->
+> [!NOTE]
+> The project has been renamed to `evalscope`. For version `v0.4.3` or earlier, you can install it with:
 > ```shell
->
+> pip install llmuses<=0.4.3
 > ```
->
-> ```
+> Then, import related dependencies using `llmuses`:
+> ```python
 > from llmuses import ...
 > ```
 
-### Method 2
-
-
-git clone https://github.com/modelscope/evalscope.git
-```
+### Method 2. Install from source
+
+Installing from source allows you to use the latest code and makes it easier for further development and debugging.
 
+1. Clone the source code
+```shell
+git clone https://github.com/modelscope/evalscope.git
+```
 2. Install dependencies
-
-
-
-
-
-
-
-
-
-
-
+```shell
+cd evalscope/
+
+pip install -e .
+```
+3. Install additional dependencies
+- To use model service inference benchmarking features, install the perf dependency:
+```shell
+pip install '.[perf]'
+```
+- To use visualization features, install the app dependency:
+```shell
+pip install '.[app]'
+```
+- If you need to use other evaluation backends, you can install OpenCompass, VLMEvalKit, or RAGEval as needed:
+```shell
+pip install '.[opencompass]'
+pip install '.[vlmeval]'
+pip install '.[rag]'
+```
+- To install all dependencies:
+```shell
+pip install '.[all]'
+```
 
 
 ## 🚀 Quick Start
@@ -255,33 +267,31 @@ evalscope eval \
 
 When using Python code for evaluation, you need to submit the evaluation task using the `run_task` function, passing a `TaskConfig` as a parameter. It can also be a Python dictionary, yaml file path, or json file path, for example:
 
-**Using
+**Using `TaskConfig`**
 
 ```python
-from evalscope
+from evalscope import run_task, TaskConfig
 
-task_cfg =
-
-
-
-
+task_cfg = TaskConfig(
+    model='Qwen/Qwen2.5-0.5B-Instruct',
+    datasets=['gsm8k', 'arc'],
+    limit=5
+)
 
 run_task(task_cfg=task_cfg)
 ```
-
 <details><summary>More Startup Methods</summary>
 
-**Using
+**Using Python Dictionary**
 
 ```python
 from evalscope.run import run_task
-from evalscope.config import TaskConfig
 
-task_cfg =
-model
-datasets
-limit
-
+task_cfg = {
+    'model': 'Qwen/Qwen2.5-0.5B-Instruct',
+    'datasets': ['gsm8k', 'arc'],
+    'limit': 5
+}
 
 run_task(task_cfg=task_cfg)
 ```
@@ -384,7 +394,7 @@ To create a public link, set `share=True` in `launch()`.
 
 For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)
 
-## 🌐 Evaluation of
+## 🌐 Evaluation of Model API
 
 Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:
 
@@ -435,7 +445,7 @@ evalscope eval \
 Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)
 
 
-## Evaluation
+## 🧪 Other Evaluation Backends
 EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
 - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
 - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
@@ -508,6 +518,17 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 </table>
 </a>
 
+## 📚 Citation
+
+```bibtex
+@misc{evalscope_2024,
+    title={{EvalScope}: Evaluation Framework for Large Models},
+    author={ModelScope Team},
+    year={2024},
+    url={https://github.com/modelscope/evalscope}
+}
+```
+
 ## 🔜 Roadmap
 - [x] Support for better evaluation report visualization
 - [x] Support for mixed evaluations across multiple datasets
@@ -523,6 +544,6 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 - [x] MBPP
 
 
-## Star History
+## ⭐ Star History
 
 [![Star History Chart](https://api.star-history.com/svg?repos=modelscope/evalscope&type=Date)](https://star-history.com/#modelscope/evalscope&Date)
@@ -0,0 +1,8 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from evalscope.benchmarks import *  # registered benchmarks
+from evalscope.config import TaskConfig
+from evalscope.filters import extraction, selection  # registered filters
+from evalscope.metrics import metric  # registered metrics
+from evalscope.models import model_apis  # need for register model apis
+from evalscope.run import run_task
+from .version import __release_datetime__, __version__
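The new top-level `evalscope/__init__.py` above re-exports `TaskConfig` and `run_task` at the package root, which is the entry point the updated README's Quick Start relies on. Below is a minimal usage sketch assembled from the Quick Start hunks in the diff; the model ID `Qwen/Qwen2.5-0.5B-Instruct`, the dataset names `gsm8k`/`arc`, and `limit=5` are simply the example values shown in that diff, not requirements.

```python
# Minimal sketch of the evalscope 1.0 entry points, mirroring the Quick Start
# in the README diff above (example values only; adjust for your own setup).
from evalscope import TaskConfig, run_task  # re-exported by the new evalscope/__init__.py

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # model to evaluate (example value from the diff)
    datasets=['gsm8k', 'arc'],           # registered benchmark names (example values from the diff)
    limit=5,                             # evaluate only the first 5 samples per dataset
)

run_task(task_cfg=task_cfg)
```

As the README's "More Startup Methods" section in the diff notes, the same configuration can also be passed to `run_task` as a plain Python dictionary, a yaml file path, or a json file path.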