evalscope 0.16.2__tar.gz → 0.17.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- {evalscope-0.16.2/evalscope.egg-info → evalscope-0.17.0}/PKG-INFO +40 -123
- {evalscope-0.16.2 → evalscope-0.17.0}/README.md +37 -19
- evalscope-0.17.0/evalscope/app/app.py +35 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/app/constants.py +1 -0
- evalscope-0.17.0/evalscope/app/ui/__init__.py +20 -0
- evalscope-0.17.0/evalscope/app/ui/app_ui.py +52 -0
- evalscope-0.17.0/evalscope/app/ui/multi_model.py +323 -0
- evalscope-0.17.0/evalscope/app/ui/sidebar.py +42 -0
- evalscope-0.17.0/evalscope/app/ui/single_model.py +202 -0
- evalscope-0.17.0/evalscope/app/ui/visualization.py +36 -0
- evalscope-0.17.0/evalscope/app/utils/data_utils.py +178 -0
- evalscope-0.17.0/evalscope/app/utils/localization.py +221 -0
- evalscope-0.17.0/evalscope/app/utils/text_utils.py +119 -0
- evalscope-0.17.0/evalscope/app/utils/visualization.py +91 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/opencompass/backend_manager.py +2 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/backend_manager.py +2 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/utils/embedding.py +1 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/__init__.py +15 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/arc/arc_adapter.py +1 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/arena_hard/utils.py +0 -12
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/data_adapter.py +20 -5
- evalscope-0.17.0/evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope-0.17.0/evalscope/benchmarks/general_arena/utils.py +226 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/general_qa/general_qa_adapter.py +42 -29
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/musr/musr_adapter.py +1 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/race/race_adapter.py +1 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/utils.py +1 -2
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/config.py +8 -123
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/evaluator/evaluator.py +15 -12
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/__init__.py +6 -0
- evalscope-0.16.2/evalscope/utils/utils.py → evalscope-0.17.0/evalscope/metrics/completion_parsers.py +68 -180
- evalscope-0.17.0/evalscope/metrics/llm_judge.py +196 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/metrics.py +1 -1
- {evalscope-0.16.2/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models → evalscope-0.17.0/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5}/__init__.py +0 -0
- {evalscope-0.16.2/evalscope/perf → evalscope-0.17.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/models/adapters/base_adapter.py +0 -2
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/models/adapters/server_adapter.py +2 -2
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/models/custom/dummy_model.py +3 -3
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/arguments.py +2 -16
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/main.py +1 -1
- evalscope-0.17.0/evalscope/perf/utils/analysis_result.py +30 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/utils/benchmark_util.py +1 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/report/__init__.py +1 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/report/utils.py +34 -15
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/run.py +1 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/summarizer.py +1 -2
- evalscope-0.17.0/evalscope/utils/__init__.py +65 -0
- evalscope-0.17.0/evalscope/utils/argument_utils.py +64 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/utils/import_utils.py +16 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/utils/io_utils.py +45 -4
- evalscope-0.17.0/evalscope/utils/model_utils.py +76 -0
- evalscope-0.17.0/evalscope/version.py +4 -0
- {evalscope-0.16.2 → evalscope-0.17.0/evalscope.egg-info}/PKG-INFO +40 -123
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope.egg-info/SOURCES.txt +19 -29
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope.egg-info/requires.txt +18 -6
- {evalscope-0.16.2 → evalscope-0.17.0}/requirements/app.txt +1 -1
- evalscope-0.17.0/requirements/dev.txt +5 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/requirements/framework.txt +2 -2
- {evalscope-0.16.2 → evalscope-0.17.0}/setup.py +2 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/tests/aigc/test_t2i.py +1 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/tests/cli/test_all.py +50 -2
- {evalscope-0.16.2 → evalscope-0.17.0}/tests/cli/test_collection.py +1 -1
- evalscope-0.17.0/tests/cli/test_custom.py +261 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/tests/cli/test_run.py +13 -37
- {evalscope-0.16.2 → evalscope-0.17.0}/tests/perf/test_perf.py +2 -2
- evalscope-0.17.0/tests/rag/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/tests/rag/test_clip_benchmark.py +2 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/tests/rag/test_mteb.py +3 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/tests/rag/test_ragas.py +3 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/tests/swift/test_run_swift_eval.py +2 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/tests/swift/test_run_swift_vlm_eval.py +2 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- evalscope-0.17.0/tests/utils.py +13 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/tests/vlm/test_vlmeval.py +8 -2
- evalscope-0.16.2/evalscope/app/app.py +0 -788
- evalscope-0.16.2/evalscope/evaluator/rating_eval.py +0 -157
- evalscope-0.16.2/evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope-0.16.2/evalscope/metrics/llm_judge.py +0 -111
- evalscope-0.16.2/evalscope/perf/utils/analysis_result.py +0 -29
- evalscope-0.16.2/evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope-0.16.2/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope-0.16.2/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope-0.16.2/evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope-0.16.2/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope-0.16.2/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope-0.16.2/evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope-0.16.2/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope-0.16.2/evalscope/registry/data/question.jsonl +0 -80
- evalscope-0.16.2/evalscope/registry/tasks/arc.yaml +0 -28
- evalscope-0.16.2/evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope-0.16.2/evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope-0.16.2/evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope-0.16.2/evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope-0.16.2/evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope-0.16.2/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope-0.16.2/evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope-0.16.2/evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope-0.16.2/evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope-0.16.2/evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope-0.16.2/evalscope/run_arena.py +0 -202
- evalscope-0.16.2/evalscope/utils/__init__.py +0 -4
- evalscope-0.16.2/evalscope/utils/arena_utils.py +0 -217
- evalscope-0.16.2/evalscope/utils/completion_parsers.py +0 -82
- evalscope-0.16.2/evalscope/utils/model_utils.py +0 -40
- evalscope-0.16.2/evalscope/version.py +0 -4
- evalscope-0.16.2/tests/swift/__init__.py +0 -1
- evalscope-0.16.2/tests/vlm/__init__.py +0 -1
- {evalscope-0.16.2 → evalscope-0.17.0}/LICENSE +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/MANIFEST.in +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/app/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/app/arguments.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/arguments.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/base.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/utils/clip.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/utils/llm.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/rag_eval/utils/tools.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/aigc/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/aigc/t2i/base.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/aime/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/arena_hard/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/benchmark.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bfcl/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/bfcl/bfcl_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/data_collection/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/docmath/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/docmath/docmath_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/docmath/utils.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/drop/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/drop/drop_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/drop/utils.py +0 -0
- {evalscope-0.16.2/evalscope/utils → evalscope-0.17.0/evalscope/benchmarks}/filters.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/frames/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/frames/frames_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/frames/utils.py +0 -0
- {evalscope-0.16.2/evalscope/benchmarks/general_mcq → evalscope-0.17.0/evalscope/benchmarks/general_arena}/__init__.py +0 -0
- {evalscope-0.16.2/evalscope/benchmarks/gpqa → evalscope-0.17.0/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-0.16.2/evalscope/benchmarks/ifeval → evalscope-0.17.0/evalscope/benchmarks/gpqa}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/gpqa/gpqa_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
- {evalscope-0.16.2/evalscope/benchmarks/iquiz → evalscope-0.17.0/evalscope/benchmarks/ifeval}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/ifeval/instructions.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/ifeval/utils.py +0 -0
- {evalscope-0.16.2/evalscope/benchmarks/live_code_bench → evalscope-0.17.0/evalscope/benchmarks/iquiz}/__init__.py +0 -0
- {evalscope-0.16.2/evalscope/benchmarks/maritime_bench → evalscope-0.17.0/evalscope/benchmarks/live_code_bench}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/live_code_bench/load_utils.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/live_code_bench/testing_util.py +0 -0
- {evalscope-0.16.2/evalscope/benchmarks/math_500 → evalscope-0.17.0/evalscope/benchmarks/maritime_bench}/__init__.py +0 -0
- {evalscope-0.16.2/evalscope/benchmarks/mmlu_pro → evalscope-0.17.0/evalscope/benchmarks/math_500}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
- {evalscope-0.16.2/evalscope/benchmarks/mmlu_redux → evalscope-0.17.0/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
- {evalscope-0.16.2/evalscope/benchmarks/musr → evalscope-0.17.0/evalscope/benchmarks/mmlu_redux}/__init__.py +0 -0
- {evalscope-0.16.2/evalscope/benchmarks/needle_haystack → evalscope-0.17.0/evalscope/benchmarks/musr}/__init__.py +0 -0
- {evalscope-0.16.2/evalscope/benchmarks/process_bench → evalscope-0.17.0/evalscope/benchmarks/needle_haystack}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/needle_haystack/utils.py +0 -0
- {evalscope-0.16.2/evalscope/benchmarks/simple_qa → evalscope-0.17.0/evalscope/benchmarks/process_bench}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/process_bench/critique_template.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/process_bench/process_bench_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/race/race.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/race/samples.jsonl +0 -0
- {evalscope-0.16.2/evalscope/benchmarks/super_gpqa → evalscope-0.17.0/evalscope/benchmarks/simple_qa}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +0 -0
- {evalscope-0.16.2/evalscope/benchmarks/tool_bench → evalscope-0.17.0/evalscope/benchmarks/super_gpqa}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/super_gpqa/utils.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -0
- {evalscope-0.16.2/evalscope/benchmarks/winogrande → evalscope-0.17.0/evalscope/benchmarks/tool_bench}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/tool_bench/tool_bench_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/tool_bench/utils.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
- {evalscope-0.16.2/evalscope/metrics/t2v_metrics/models → evalscope-0.17.0/evalscope/benchmarks/winogrande}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/cli/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/cli/base.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/cli/cli.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/cli/start_app.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/cli/start_eval.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/cli/start_server.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/collections/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/collections/evaluator.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/collections/sampler.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/collections/schema.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/constants.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/evaluator/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/math_parser.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/named_metrics.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/rouge_metric.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/clipscore.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/constants.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/itmscore.py +0 -0
- {evalscope-0.16.2/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model → evalscope-0.17.0/evalscope/metrics/t2v_metrics/models}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +0 -0
- {evalscope-0.16.2/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward → evalscope-0.17.0/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +0 -0
- {evalscope-0.16.2/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5 → evalscope-0.17.0/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/model.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/utils.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/score.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/metrics/t2v_metrics/vqascore.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/models/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/models/adapters/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/models/adapters/bfcl_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/models/adapters/chat_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/models/adapters/choice_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/models/adapters/custom_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/models/adapters/t2i_adapter.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/models/custom/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/models/custom/custom_model.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/models/local_model.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/models/model.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/models/register.py +0 -0
- {evalscope-0.16.2/evalscope/perf/utils → evalscope-0.17.0/evalscope/perf}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/benchmark.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/http_client.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/plugin/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/plugin/api/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/plugin/api/base.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/plugin/api/custom_api.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/plugin/api/openai_api.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/plugin/datasets/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/plugin/datasets/base.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/plugin/datasets/custom.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/plugin/datasets/openqa.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/plugin/datasets/random_dataset.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/plugin/registry.py +0 -0
- {evalscope-0.16.2/evalscope/third_party/thinkbench/tools → evalscope-0.17.0/evalscope/perf/utils}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/utils/db_util.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/utils/handler.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/utils/local_server.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/utils/log_utils.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/perf/utils/rich_display.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/report/combinator.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/report/generator.py +0 -0
- {evalscope-0.16.2/evalscope/evaluator/reviewer → evalscope-0.17.0/evalscope/third_party}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/longbench_write/README.md +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/longbench_write/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/longbench_write/default_task.json +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/longbench_write/eval.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/longbench_write/infer.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
- {evalscope-0.16.2/evalscope/registry → evalscope-0.17.0/evalscope/third_party/longbench_write/resources}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
- {evalscope-0.16.2/evalscope/third_party → evalscope-0.17.0/evalscope/third_party/longbench_write/tools}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/longbench_write/utils.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/thinkbench/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/thinkbench/eval.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/thinkbench/infer.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
- {evalscope-0.16.2/tests/rag → evalscope-0.17.0/evalscope/third_party/thinkbench/tools}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/toolbench_static/README.md +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/toolbench_static/config_default.json +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.16.2/evalscope/third_party/longbench_write/resources → evalscope-0.17.0/evalscope/third_party/toolbench_static/llm}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/utils/chat_service.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/utils/deprecation_utils.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope/utils/logger.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/requirements/aigc.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/requirements/docs.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/requirements/opencompass.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/requirements/perf.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/requirements/rag.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/requirements/vlmeval.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/requirements.txt +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/setup.cfg +0 -0
- {evalscope-0.16.2/evalscope/third_party/longbench_write/tools → evalscope-0.17.0/tests}/__init__.py +0 -0
- {evalscope-0.16.2/evalscope/third_party/toolbench_static/llm → evalscope-0.17.0/tests/aigc}/__init__.py +0 -0
- {evalscope-0.16.2/tests → evalscope-0.17.0/tests/cli}/__init__.py +0 -0
- {evalscope-0.16.2/tests/aigc → evalscope-0.17.0/tests/perf}/__init__.py +0 -0
- {evalscope-0.16.2/tests/cli → evalscope-0.17.0/tests/swift}/__init__.py +0 -0
- {evalscope-0.16.2 → evalscope-0.17.0}/tests/test_run_all.py +0 -0
- {evalscope-0.16.2/tests/perf → evalscope-0.17.0/tests/vlm}/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: evalscope
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.17.0
|
|
4
4
|
Summary: EvalScope: Lightweight LLMs Evaluation Framework
|
|
5
5
|
Home-page: https://github.com/modelscope/evalscope
|
|
6
6
|
Author: ModelScope team
|
|
@@ -15,116 +15,15 @@ Classifier: Programming Language :: Python :: 3.9
|
|
|
15
15
|
Classifier: Programming Language :: Python :: 3.10
|
|
16
16
|
Requires-Python: >=3.8
|
|
17
17
|
Description-Content-Type: text/markdown
|
|
18
|
-
License-File: LICENSE
|
|
19
|
-
Requires-Dist: accelerate
|
|
20
|
-
Requires-Dist: datasets>=3.0
|
|
21
|
-
Requires-Dist: immutabledict
|
|
22
|
-
Requires-Dist: jieba
|
|
23
|
-
Requires-Dist: jsonlines
|
|
24
|
-
Requires-Dist: langdetect
|
|
25
|
-
Requires-Dist: latex2sympy2_extended
|
|
26
|
-
Requires-Dist: matplotlib
|
|
27
|
-
Requires-Dist: modelscope[framework]
|
|
28
|
-
Requires-Dist: nltk>=3.9
|
|
29
|
-
Requires-Dist: openai
|
|
30
|
-
Requires-Dist: pandas
|
|
31
|
-
Requires-Dist: pillow
|
|
32
|
-
Requires-Dist: pyarrow
|
|
33
|
-
Requires-Dist: pyyaml>=5.1
|
|
34
|
-
Requires-Dist: requests
|
|
35
|
-
Requires-Dist: rouge-chinese
|
|
36
|
-
Requires-Dist: rouge-score>=0.1.0
|
|
37
|
-
Requires-Dist: sacrebleu
|
|
38
|
-
Requires-Dist: scikit-learn
|
|
39
|
-
Requires-Dist: seaborn
|
|
40
|
-
Requires-Dist: sympy
|
|
41
|
-
Requires-Dist: tabulate
|
|
42
|
-
Requires-Dist: torch
|
|
43
|
-
Requires-Dist: tqdm
|
|
44
|
-
Requires-Dist: transformers>=4.33
|
|
45
|
-
Requires-Dist: word2number
|
|
46
18
|
Provides-Extra: opencompass
|
|
47
|
-
Requires-Dist: ms-opencompass>=0.1.6; extra == "opencompass"
|
|
48
19
|
Provides-Extra: vlmeval
|
|
49
|
-
Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
|
|
50
20
|
Provides-Extra: rag
|
|
51
|
-
Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
|
|
52
|
-
Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
|
|
53
|
-
Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
|
|
54
|
-
Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
|
|
55
|
-
Requires-Dist: mteb==1.38.20; extra == "rag"
|
|
56
|
-
Requires-Dist: ragas==0.2.14; extra == "rag"
|
|
57
|
-
Requires-Dist: webdataset>0.2.0; extra == "rag"
|
|
58
21
|
Provides-Extra: perf
|
|
59
|
-
Requires-Dist: aiohttp; extra == "perf"
|
|
60
|
-
Requires-Dist: fastapi; extra == "perf"
|
|
61
|
-
Requires-Dist: numpy; extra == "perf"
|
|
62
|
-
Requires-Dist: rich; extra == "perf"
|
|
63
|
-
Requires-Dist: sse_starlette; extra == "perf"
|
|
64
|
-
Requires-Dist: transformers; extra == "perf"
|
|
65
|
-
Requires-Dist: uvicorn; extra == "perf"
|
|
66
22
|
Provides-Extra: app
|
|
67
|
-
Requires-Dist: gradio==5.4.0; extra == "app"
|
|
68
|
-
Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
|
|
69
23
|
Provides-Extra: aigc
|
|
70
|
-
|
|
71
|
-
Requires-Dist: iopath; extra == "aigc"
|
|
72
|
-
Requires-Dist: omegaconf; extra == "aigc"
|
|
73
|
-
Requires-Dist: open_clip_torch; extra == "aigc"
|
|
74
|
-
Requires-Dist: opencv-python; extra == "aigc"
|
|
75
|
-
Requires-Dist: torchvision; extra == "aigc"
|
|
24
|
+
Provides-Extra: dev
|
|
76
25
|
Provides-Extra: all
|
|
77
|
-
|
|
78
|
-
Requires-Dist: datasets>=3.0; extra == "all"
|
|
79
|
-
Requires-Dist: immutabledict; extra == "all"
|
|
80
|
-
Requires-Dist: jieba; extra == "all"
|
|
81
|
-
Requires-Dist: jsonlines; extra == "all"
|
|
82
|
-
Requires-Dist: langdetect; extra == "all"
|
|
83
|
-
Requires-Dist: latex2sympy2_extended; extra == "all"
|
|
84
|
-
Requires-Dist: matplotlib; extra == "all"
|
|
85
|
-
Requires-Dist: modelscope[framework]; extra == "all"
|
|
86
|
-
Requires-Dist: nltk>=3.9; extra == "all"
|
|
87
|
-
Requires-Dist: openai; extra == "all"
|
|
88
|
-
Requires-Dist: pandas; extra == "all"
|
|
89
|
-
Requires-Dist: pillow; extra == "all"
|
|
90
|
-
Requires-Dist: pyarrow; extra == "all"
|
|
91
|
-
Requires-Dist: pyyaml>=5.1; extra == "all"
|
|
92
|
-
Requires-Dist: requests; extra == "all"
|
|
93
|
-
Requires-Dist: rouge-chinese; extra == "all"
|
|
94
|
-
Requires-Dist: rouge-score>=0.1.0; extra == "all"
|
|
95
|
-
Requires-Dist: sacrebleu; extra == "all"
|
|
96
|
-
Requires-Dist: scikit-learn; extra == "all"
|
|
97
|
-
Requires-Dist: seaborn; extra == "all"
|
|
98
|
-
Requires-Dist: sympy; extra == "all"
|
|
99
|
-
Requires-Dist: tabulate; extra == "all"
|
|
100
|
-
Requires-Dist: torch; extra == "all"
|
|
101
|
-
Requires-Dist: tqdm; extra == "all"
|
|
102
|
-
Requires-Dist: transformers>=4.33; extra == "all"
|
|
103
|
-
Requires-Dist: word2number; extra == "all"
|
|
104
|
-
Requires-Dist: ms-opencompass>=0.1.6; extra == "all"
|
|
105
|
-
Requires-Dist: ms-vlmeval>=0.0.17; extra == "all"
|
|
106
|
-
Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
|
|
107
|
-
Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "all"
|
|
108
|
-
Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "all"
|
|
109
|
-
Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "all"
|
|
110
|
-
Requires-Dist: mteb==1.38.20; extra == "all"
|
|
111
|
-
Requires-Dist: ragas==0.2.14; extra == "all"
|
|
112
|
-
Requires-Dist: webdataset>0.2.0; extra == "all"
|
|
113
|
-
Requires-Dist: aiohttp; extra == "all"
|
|
114
|
-
Requires-Dist: fastapi; extra == "all"
|
|
115
|
-
Requires-Dist: numpy; extra == "all"
|
|
116
|
-
Requires-Dist: rich; extra == "all"
|
|
117
|
-
Requires-Dist: sse_starlette; extra == "all"
|
|
118
|
-
Requires-Dist: transformers; extra == "all"
|
|
119
|
-
Requires-Dist: uvicorn; extra == "all"
|
|
120
|
-
Requires-Dist: gradio==5.4.0; extra == "all"
|
|
121
|
-
Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
|
|
122
|
-
Requires-Dist: diffusers; extra == "all"
|
|
123
|
-
Requires-Dist: iopath; extra == "all"
|
|
124
|
-
Requires-Dist: omegaconf; extra == "all"
|
|
125
|
-
Requires-Dist: open_clip_torch; extra == "all"
|
|
126
|
-
Requires-Dist: opencv-python; extra == "all"
|
|
127
|
-
Requires-Dist: torchvision; extra == "all"
|
|
26
|
+
License-File: LICENSE
|
|
128
27
|
|
|
129
28
|
<p align="center">
|
|
130
29
|
<br>
|
|
@@ -198,24 +97,33 @@ EvalScope is not merely an evaluation tool; it is a valuable ally in your model
 Below is the overall architecture diagram of EvalScope:
 
 <p align="center">
-    <img src="
+    <img src="https://sail-moe.oss-cn-hangzhou.aliyuncs.com/yunlin/images/evalscope/doc/EvalScope%E6%9E%B6%E6%9E%84%E5%9B%BE.png" width="70%">
     <br>EvalScope Framework.
 </p>
 
 <details><summary>Framework Description</summary>
 
 The architecture includes the following modules:
-1.
-
-
-
-
-
-
-
-
-
-
+1. Input Layer
+   - **Model Sources**: API models (OpenAI API), local models (ModelScope)
+   - **Datasets**: Standard evaluation benchmarks (MMLU/GSM8k, etc.), custom data (MCQ/QA)
+
+2. Core Functions
+   - **Multi-backend Evaluation**
+     - Native backends: Unified evaluation for LLM/VLM/Embedding/T2I models
+     - Integrated frameworks: OpenCompass/MTEB/VLMEvalKit/RAGAS
+
+   - **Performance Monitoring**
+     - Model plugins: Supports various model service APIs
+     - Data plugins: Supports multiple data formats
+     - Metric tracking: TTFT/TPOP/Stability and other metrics
+
+   - **Tool Extensions**
+     - Integration: Tool-Bench/Needle-in-a-Haystack/BFCL-v3
+
+3. Output Layer
+   - **Structured Reports**: Supports JSON/Tables/Logs
+   - **Visualization Platforms**: Supports Gradio/Wandb/SwanLab
 
 </details>
 
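As a concrete companion to the Input Layer and native-backend items above, a minimal evaluation run with EvalScope's documented `TaskConfig`/`run_task` API looks roughly like the following sketch; the model and dataset names are illustrative choices, not part of this diff:

```python
# Minimal sketch of a native-backend evaluation; values are illustrative.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # local/ModelScope model; API endpoints are also supported
    datasets=['gsm8k', 'arc'],           # standard benchmarks from the Input Layer
    limit=5,                             # small sample count for a quick smoke test
)
run_task(task_cfg=task_cfg)
```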
@@ -230,7 +138,9 @@ Please scan the QR code below to join our community groups:
 
 ## 🎉 News
 
-- 🔥 **[2025.
+- 🔥 **[2025.07.03]** Refactored Arena Mode: now supports custom model battles, outputs a model leaderboard, and provides battle result visualization. See [reference](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html) for details.
+- 🔥 **[2025.06.28]** Optimized custom dataset evaluation: now supports evaluation without reference answers. Enhanced LLM judge usage, with built-in modes for "scoring directly without reference answers" and "checking answer consistency with reference answers". See [reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#qa) for details.
+- 🔥 **[2025.06.19]** Added support for the [BFCL-v3](https://modelscope.cn/datasets/AI-ModelScope/bfcl_v3) benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
 - 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
 - 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
 - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
@@ -248,12 +158,12 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
 - 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/en/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
 - 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
+<details><summary>More</summary>
+
 - 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
 - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
-<details><summary>More</summary>
-
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
@@ -572,10 +482,17 @@ Speed Benchmark Results:
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html)
 
 
-##
-The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.
+## ⚔️ Arena Mode
 
-Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)
+Arena mode allows you to configure multiple candidate models and specify a baseline model. Evaluation is performed by pairwise battles between each candidate model and the baseline model, with the final output including each model's win rate and ranking. This method is suitable for comparative evaluation among multiple models, providing an intuitive reflection of each model's strengths and weaknesses. Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)
+
+```text
+Model          WinRate (%)    CI (%)
+-------------  -------------  ---------------
+qwen2.5-72b    69.3           (-13.3 / +12.2)
+qwen2.5-7b     50             (+0.0 / +0.0)
+qwen2.5-0.5b   4.7            (-2.5 / +4.4)
+```
 
 ## 👷♂️ Contribution
 
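The WinRate/CI columns above summarize pairwise battles against the baseline model; an asymmetric interval such as `(-13.3 / +12.2)` is typically obtained by bootstrapping the battle outcomes. The snippet below is a generic illustration of that calculation with synthetic outcomes, not EvalScope's internal code:

```python
# Generic win-rate plus percentile-bootstrap CI over synthetic battle outcomes.
import numpy as np

rng = np.random.default_rng(0)
# 1.0 = candidate beats the baseline, 0.5 = tie, 0.0 = loss (synthetic data).
battles = rng.choice([0.0, 0.5, 1.0], size=200, p=[0.25, 0.10, 0.65])

point = battles.mean() * 100
boot = np.array([
    rng.choice(battles, size=battles.size, replace=True).mean() * 100
    for _ in range(1000)
])
lo, hi = np.percentile(boot, [2.5, 97.5])
print(f'WinRate {point:.1f}%  CI ({lo - point:+.1f} / {hi - point:+.1f})')
```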
@@ -601,7 +518,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 - [ ] Distributed evaluating
 - [x] Multi-modal evaluation
 - [ ] Benchmarks
-  - [
+  - [x] BFCL-v3
   - [x] GPQA
   - [x] MBPP
 
@@ -70,24 +70,33 @@ EvalScope is not merely an evaluation tool; it is a valuable ally in your model
 Below is the overall architecture diagram of EvalScope:
 
 <p align="center">
-    <img src="
+    <img src="https://sail-moe.oss-cn-hangzhou.aliyuncs.com/yunlin/images/evalscope/doc/EvalScope%E6%9E%B6%E6%9E%84%E5%9B%BE.png" width="70%">
     <br>EvalScope Framework.
 </p>
 
 <details><summary>Framework Description</summary>
 
 The architecture includes the following modules:
-1.
-
-
-
-
-
-
-
-
-
-
+1. Input Layer
+   - **Model Sources**: API models (OpenAI API), local models (ModelScope)
+   - **Datasets**: Standard evaluation benchmarks (MMLU/GSM8k, etc.), custom data (MCQ/QA)
+
+2. Core Functions
+   - **Multi-backend Evaluation**
+     - Native backends: Unified evaluation for LLM/VLM/Embedding/T2I models
+     - Integrated frameworks: OpenCompass/MTEB/VLMEvalKit/RAGAS
+
+   - **Performance Monitoring**
+     - Model plugins: Supports various model service APIs
+     - Data plugins: Supports multiple data formats
+     - Metric tracking: TTFT/TPOP/Stability and other metrics
+
+   - **Tool Extensions**
+     - Integration: Tool-Bench/Needle-in-a-Haystack/BFCL-v3
+
+3. Output Layer
+   - **Structured Reports**: Supports JSON/Tables/Logs
+   - **Visualization Platforms**: Supports Gradio/Wandb/SwanLab
 
 </details>
 
@@ -102,7 +111,9 @@ Please scan the QR code below to join our community groups:
 
 ## 🎉 News
 
-- 🔥 **[2025.
+- 🔥 **[2025.07.03]** Refactored Arena Mode: now supports custom model battles, outputs a model leaderboard, and provides battle result visualization. See [reference](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html) for details.
+- 🔥 **[2025.06.28]** Optimized custom dataset evaluation: now supports evaluation without reference answers. Enhanced LLM judge usage, with built-in modes for "scoring directly without reference answers" and "checking answer consistency with reference answers". See [reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#qa) for details.
+- 🔥 **[2025.06.19]** Added support for the [BFCL-v3](https://modelscope.cn/datasets/AI-ModelScope/bfcl_v3) benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
 - 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
 - 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
 - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
@@ -120,12 +131,12 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
 - 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/en/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
 - 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
+<details><summary>More</summary>
+
 - 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
 - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
-<details><summary>More</summary>
-
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
@@ -444,10 +455,17 @@ Speed Benchmark Results:
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html)
 
 
-##
-
+## ⚔️ Arena Mode
+
+Arena mode allows you to configure multiple candidate models and specify a baseline model. Evaluation is performed by pairwise battles between each candidate model and the baseline model, with the final output including each model's win rate and ranking. This method is suitable for comparative evaluation among multiple models, providing an intuitive reflection of each model's strengths and weaknesses. Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)
 
-
+```text
+Model          WinRate (%)    CI (%)
+-------------  -------------  ---------------
+qwen2.5-72b    69.3           (-13.3 / +12.2)
+qwen2.5-7b     50             (+0.0 / +0.0)
+qwen2.5-0.5b   4.7            (-2.5 / +4.4)
+```
 
 ## 👷♂️ Contribution
 
@@ -473,7 +491,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 - [ ] Distributed evaluating
 - [x] Multi-modal evaluation
 - [ ] Benchmarks
-  - [
+  - [x] BFCL-v3
   - [x] GPQA
   - [x] MBPP
 
@@ -0,0 +1,35 @@
+"""
+Main application module for the Evalscope dashboard.
+"""
+import argparse
+
+from evalscope.utils.logger import configure_logging
+from .arguments import add_argument
+from .ui import create_app_ui
+
+
+def create_app(args: argparse.Namespace):
+    """
+    Create and launch the Evalscope dashboard application.
+
+    Args:
+        args: Command line arguments.
+    """
+    configure_logging(debug=args.debug)
+
+    demo = create_app_ui(args)
+
+    demo.launch(
+        share=args.share,
+        server_name=args.server_name,
+        server_port=args.server_port,
+        debug=args.debug,
+        allowed_paths=args.allowed_paths,
+    )
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    add_argument(parser)
+    args = parser.parse_args()
+    create_app(args)
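The new `app.py` simply feeds parsed command-line arguments into the Gradio UI. Assuming the flags registered by `add_argument` mirror the attribute names `create_app` reads (`--outputs`, `--lang`, `--share`, and so on; this is an assumption, since `arguments.py` is not shown in this diff section), a programmatic launch would look roughly like this:

```python
# Hypothetical programmatic launch of the 0.17.0 dashboard.
import argparse

from evalscope.app.app import create_app
from evalscope.app.arguments import add_argument

parser = argparse.ArgumentParser()
add_argument(parser)
# Flag names below are assumed to match the attributes used in create_app().
args = parser.parse_args(['--outputs', './outputs', '--lang', 'en'])
create_app(args)  # builds the Blocks UI and calls demo.launch(...)
```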
@@ -0,0 +1,20 @@
+"""
+UI components for the Evalscope dashboard.
+"""
+from .app_ui import create_app_ui
+from .multi_model import MultiModelComponents, create_multi_model_tab
+from .sidebar import SidebarComponents, create_sidebar
+from .single_model import SingleModelComponents, create_single_model_tab
+from .visualization import VisualizationComponents, create_visualization
+
+__all__ = [
+    'create_app_ui',
+    'SidebarComponents',
+    'create_sidebar',
+    'VisualizationComponents',
+    'create_visualization',
+    'SingleModelComponents',
+    'create_single_model_tab',
+    'MultiModelComponents',
+    'create_multi_model_tab',
+]
@@ -0,0 +1,52 @@
+"""
+Main UI application for the Evalscope dashboard.
+"""
+import argparse
+import gradio as gr
+
+from evalscope.version import __version__
+from ..utils.localization import get_app_locale
+from .sidebar import create_sidebar
+from .visualization import create_visualization
+
+
+def create_app_ui(args: argparse.Namespace):
+    lang = args.lang
+    locale_dict = get_app_locale(lang)
+
+    with gr.Blocks(title='Evalscope Dashboard') as demo:
+        gr.HTML(f'<h1 style="text-align: left;">{locale_dict["title"]} (v{__version__})</h1>')
+        with gr.Row():
+            with gr.Column(scale=0, min_width=35):
+                toggle_btn = gr.Button('<')
+            with gr.Column(scale=1):
+                gr.HTML(f'<h3 style="text-align: left;">{locale_dict["star_beggar"]}</h3>')
+
+        with gr.Row():
+            with gr.Column(scale=1) as sidebar_column:
+                sidebar_visible = gr.State(True)
+                sidebar = create_sidebar(args.outputs, lang)
+
+            with gr.Column(scale=5):
+                visualization = create_visualization(sidebar, lang)
+
+        @sidebar.load_btn.click(
+            inputs=[sidebar.reports_dropdown],
+            outputs=[visualization.single_model.report_name, visualization.multi_model.multi_report_name])
+        def update_displays(reports_dropdown):
+            if not reports_dropdown:
+                gr.Warning(locale_dict['note'], duration=3)
+                return gr.skip()
+
+            return (
+                gr.update(choices=reports_dropdown, value=reports_dropdown[0]),  # update single model dropdown
+                gr.update(choices=reports_dropdown, value=reports_dropdown)  # update multi model dropdown
+            )
+
+        @toggle_btn.click(inputs=[sidebar_visible], outputs=[sidebar_column, sidebar_visible, toggle_btn])
+        def toggle_sidebar(visible):
+            new_visible = not visible
+            text = '<' if new_visible else '>'
+            return gr.update(visible=new_visible), new_visible, gr.update(value=text)
+
+    return demo
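`create_app_ui` binds events with Gradio's decorator form of event listeners (`@component.click(...)`) while the `Blocks` context is still open. A self-contained, stripped-down version of the sidebar toggle shows the pattern; the component names here are illustrative and not taken from EvalScope:

```python
# Standalone illustration of decorator-style Gradio event binding.
import gradio as gr

with gr.Blocks() as demo:
    visible = gr.State(True)
    toggle_btn = gr.Button('<')
    with gr.Column(visible=True) as panel:
        gr.Markdown('Sidebar content')

    @toggle_btn.click(inputs=[visible], outputs=[panel, visible, toggle_btn])
    def toggle(v):
        new_v = not v
        # Hide/show the column, persist the new state, and flip the button label.
        return gr.update(visible=new_v), new_v, gr.update(value='<' if new_v else '>')

if __name__ == '__main__':
    demo.launch()
```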