evalscope 0.16.3__tar.gz → 0.17.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {evalscope-0.16.3/evalscope.egg-info → evalscope-0.17.1}/PKG-INFO +81 -150
- {evalscope-0.16.3 → evalscope-0.17.1}/README.md +73 -43
- evalscope-0.17.1/evalscope/app/app.py +35 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/app/constants.py +1 -0
- evalscope-0.17.1/evalscope/app/ui/__init__.py +20 -0
- evalscope-0.17.1/evalscope/app/ui/app_ui.py +52 -0
- evalscope-0.17.1/evalscope/app/ui/multi_model.py +323 -0
- evalscope-0.17.1/evalscope/app/ui/sidebar.py +42 -0
- evalscope-0.17.1/evalscope/app/ui/single_model.py +202 -0
- evalscope-0.17.1/evalscope/app/ui/visualization.py +36 -0
- evalscope-0.17.1/evalscope/app/utils/data_utils.py +178 -0
- evalscope-0.17.1/evalscope/app/utils/localization.py +221 -0
- evalscope-0.17.1/evalscope/app/utils/text_utils.py +119 -0
- evalscope-0.17.1/evalscope/app/utils/visualization.py +91 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/opencompass/backend_manager.py +2 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/backend_manager.py +2 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/utils/embedding.py +1 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/__init__.py +15 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/arc/arc_adapter.py +1 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/arena_hard/utils.py +0 -12
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/data_adapter.py +29 -9
- evalscope-0.17.1/evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope-0.17.1/evalscope/benchmarks/general_arena/utils.py +226 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope-0.17.1/evalscope/benchmarks/hle/hle_adapter.py +118 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/musr/musr_adapter.py +1 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope-0.17.1/evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/utils.py +2 -2
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/config.py +8 -123
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/constants.py +5 -21
- evalscope-0.17.1/evalscope/evaluator/__init__.py +3 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/evaluator/evaluator.py +20 -15
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/__init__.py +9 -1
- evalscope-0.16.3/evalscope/utils/utils.py → evalscope-0.17.1/evalscope/metrics/completion_parsers.py +71 -176
- evalscope-0.17.1/evalscope/metrics/llm_judge.py +197 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/metrics.py +20 -8
- {evalscope-0.16.3/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models → evalscope-0.17.1/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model}/__init__.py +0 -0
- {evalscope-0.16.3/evalscope/third_party/thinkbench/tools → evalscope-0.17.1/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models}/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/__init__.py +4 -8
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/adapters/__init__.py +4 -9
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/adapters/base_adapter.py +4 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/adapters/bfcl_adapter.py +2 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/adapters/chat_adapter.py +3 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/adapters/choice_adapter.py +4 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/adapters/custom_adapter.py +7 -3
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/adapters/server_adapter.py +4 -2
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/adapters/t2i_adapter.py +3 -0
- evalscope-0.17.1/evalscope/models/adapters/tau_bench_adapter.py +189 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/custom/dummy_model.py +3 -3
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/register.py +0 -14
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/arguments.py +15 -16
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/benchmark.py +38 -39
- evalscope-0.17.1/evalscope/perf/http_client.py +120 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/main.py +3 -3
- evalscope-0.17.1/evalscope/perf/plugin/__init__.py +3 -0
- evalscope-0.17.1/evalscope/perf/plugin/api/__init__.py +4 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/plugin/api/base.py +22 -4
- evalscope-0.17.1/evalscope/perf/plugin/api/custom_api.py +249 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope-0.17.1/evalscope/perf/plugin/api/default_api.py +105 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/plugin/api/openai_api.py +17 -19
- evalscope-0.17.1/evalscope/perf/plugin/datasets/__init__.py +10 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/base.py +22 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/custom.py +2 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope-0.17.1/evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/openqa.py +2 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope-0.17.1/evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope-0.17.1/evalscope/perf/plugin/registry.py +74 -0
- evalscope-0.17.1/evalscope/perf/utils/__init__.py +0 -0
- evalscope-0.17.1/evalscope/perf/utils/analysis_result.py +30 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/utils/benchmark_util.py +14 -20
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/utils/db_util.py +79 -61
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/report/__init__.py +1 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/report/utils.py +34 -15
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/run.py +1 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/summarizer.py +1 -2
- evalscope-0.17.1/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope-0.17.1/evalscope/utils/__init__.py +65 -0
- evalscope-0.17.1/evalscope/utils/argument_utils.py +64 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/utils/import_utils.py +16 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/utils/io_utils.py +55 -4
- evalscope-0.17.1/evalscope/utils/model_utils.py +76 -0
- evalscope-0.17.1/evalscope/version.py +4 -0
- {evalscope-0.16.3 → evalscope-0.17.1/evalscope.egg-info}/PKG-INFO +81 -150
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope.egg-info/SOURCES.txt +27 -30
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope.egg-info/requires.txt +24 -4
- evalscope-0.17.1/requirements/dev.txt +5 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/requirements/framework.txt +2 -2
- {evalscope-0.16.3 → evalscope-0.17.1}/setup.cfg +1 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/setup.py +35 -15
- {evalscope-0.16.3 → evalscope-0.17.1}/tests/aigc/test_t2i.py +1 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/tests/cli/test_all.py +68 -4
- {evalscope-0.16.3 → evalscope-0.17.1}/tests/cli/test_collection.py +1 -1
- evalscope-0.17.1/tests/cli/test_custom.py +261 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/tests/cli/test_run.py +34 -70
- {evalscope-0.16.3 → evalscope-0.17.1}/tests/perf/test_perf.py +31 -4
- evalscope-0.17.1/tests/rag/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/tests/rag/test_clip_benchmark.py +2 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/tests/rag/test_mteb.py +3 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/tests/rag/test_ragas.py +3 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/tests/swift/test_run_swift_eval.py +2 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/tests/swift/test_run_swift_vlm_eval.py +2 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- evalscope-0.17.1/tests/utils.py +13 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/tests/vlm/test_vlmeval.py +8 -2
- evalscope-0.16.3/evalscope/app/app.py +0 -788
- evalscope-0.16.3/evalscope/evaluator/__init__.py +0 -3
- evalscope-0.16.3/evalscope/evaluator/rating_eval.py +0 -157
- evalscope-0.16.3/evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope-0.16.3/evalscope/metrics/llm_judge.py +0 -111
- evalscope-0.16.3/evalscope/models/model.py +0 -189
- evalscope-0.16.3/evalscope/perf/http_client.py +0 -176
- evalscope-0.16.3/evalscope/perf/plugin/__init__.py +0 -2
- evalscope-0.16.3/evalscope/perf/plugin/api/__init__.py +0 -3
- evalscope-0.16.3/evalscope/perf/plugin/api/custom_api.py +0 -92
- evalscope-0.16.3/evalscope/perf/plugin/datasets/__init__.py +0 -7
- evalscope-0.16.3/evalscope/perf/plugin/registry.py +0 -54
- evalscope-0.16.3/evalscope/perf/utils/analysis_result.py +0 -29
- evalscope-0.16.3/evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope-0.16.3/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope-0.16.3/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope-0.16.3/evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope-0.16.3/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope-0.16.3/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope-0.16.3/evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope-0.16.3/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope-0.16.3/evalscope/registry/data/question.jsonl +0 -80
- evalscope-0.16.3/evalscope/registry/tasks/arc.yaml +0 -28
- evalscope-0.16.3/evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope-0.16.3/evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope-0.16.3/evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope-0.16.3/evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope-0.16.3/evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope-0.16.3/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope-0.16.3/evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope-0.16.3/evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope-0.16.3/evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope-0.16.3/evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope-0.16.3/evalscope/run_arena.py +0 -202
- evalscope-0.16.3/evalscope/utils/__init__.py +0 -4
- evalscope-0.16.3/evalscope/utils/arena_utils.py +0 -217
- evalscope-0.16.3/evalscope/utils/completion_parsers.py +0 -82
- evalscope-0.16.3/evalscope/utils/model_utils.py +0 -40
- evalscope-0.16.3/evalscope/version.py +0 -4
- evalscope-0.16.3/tests/swift/__init__.py +0 -1
- evalscope-0.16.3/tests/vlm/__init__.py +0 -1
- {evalscope-0.16.3 → evalscope-0.17.1}/LICENSE +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/MANIFEST.in +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/app/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/app/arguments.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/arguments.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/base.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/utils/clip.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/utils/llm.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/utils/tools.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/aigc/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/base.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/aime/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/arena_hard/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/benchmark.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bfcl/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/data_collection/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/docmath/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/docmath/docmath_adapter.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/docmath/utils.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/drop/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/drop/drop_adapter.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/drop/utils.py +0 -0
- {evalscope-0.16.3/evalscope/utils → evalscope-0.17.1/evalscope/benchmarks}/filters.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/frames/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/frames/frames_adapter.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/frames/utils.py +0 -0
- {evalscope-0.16.3/evalscope/benchmarks/general_mcq → evalscope-0.17.1/evalscope/benchmarks/general_arena}/__init__.py +0 -0
- {evalscope-0.16.3/evalscope/benchmarks/gpqa → evalscope-0.17.1/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-0.16.3/evalscope/benchmarks/ifeval → evalscope-0.17.1/evalscope/benchmarks/gpqa}/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/gpqa/gpqa_adapter.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
- {evalscope-0.16.3/evalscope/benchmarks/iquiz → evalscope-0.17.1/evalscope/benchmarks/hle}/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
- {evalscope-0.16.3/evalscope/benchmarks/live_code_bench → evalscope-0.17.1/evalscope/benchmarks/ifeval}/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/ifeval/instructions.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/ifeval/utils.py +0 -0
- {evalscope-0.16.3/evalscope/benchmarks/maritime_bench → evalscope-0.17.1/evalscope/benchmarks/iquiz}/__init__.py +0 -0
- {evalscope-0.16.3/evalscope/benchmarks/math_500 → evalscope-0.17.1/evalscope/benchmarks/live_code_bench}/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/load_utils.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/testing_util.py +0 -0
- {evalscope-0.16.3/evalscope/benchmarks/mmlu_pro → evalscope-0.17.1/evalscope/benchmarks/maritime_bench}/__init__.py +0 -0
- {evalscope-0.16.3/evalscope/benchmarks/mmlu_redux → evalscope-0.17.1/evalscope/benchmarks/math_500}/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
- {evalscope-0.16.3/evalscope/benchmarks/musr → evalscope-0.17.1/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
- {evalscope-0.16.3/evalscope/benchmarks/needle_haystack → evalscope-0.17.1/evalscope/benchmarks/mmlu_redux}/__init__.py +0 -0
- {evalscope-0.16.3/evalscope/benchmarks/process_bench → evalscope-0.17.1/evalscope/benchmarks/musr}/__init__.py +0 -0
- {evalscope-0.16.3/evalscope/benchmarks/simple_qa → evalscope-0.17.1/evalscope/benchmarks/needle_haystack}/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/needle_haystack/utils.py +0 -0
- {evalscope-0.16.3/evalscope/benchmarks/super_gpqa → evalscope-0.17.1/evalscope/benchmarks/process_bench}/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/process_bench/critique_template.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/process_bench/process_bench_adapter.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/race/race.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/race/samples.jsonl +0 -0
- {evalscope-0.16.3/evalscope/benchmarks/tool_bench → evalscope-0.17.1/evalscope/benchmarks/simple_qa}/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +0 -0
- {evalscope-0.16.3/evalscope/benchmarks/winogrande → evalscope-0.17.1/evalscope/benchmarks/super_gpqa}/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/super_gpqa/utils.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -0
- {evalscope-0.16.3/evalscope/metrics/t2v_metrics/models → evalscope-0.17.1/evalscope/benchmarks/tau_bench}/__init__.py +0 -0
- {evalscope-0.16.3/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model → evalscope-0.17.1/evalscope/benchmarks/tool_bench}/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/tool_bench/utils.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
- {evalscope-0.16.3/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward → evalscope-0.17.1/evalscope/benchmarks/winogrande}/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/cli/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/cli/base.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/cli/cli.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/cli/start_app.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/cli/start_eval.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/cli/start_server.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/collections/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/collections/evaluator.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/collections/sampler.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/collections/schema.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/math_parser.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/named_metrics.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/rouge_metric.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/clipscore.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/constants.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/itmscore.py +0 -0
- {evalscope-0.16.3/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5 → evalscope-0.17.1/evalscope/metrics/t2v_metrics/models}/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +0 -0
- {evalscope-0.16.3/evalscope/perf → evalscope-0.17.1/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward}/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/model.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/utils.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +0 -0
- {evalscope-0.16.3/evalscope/perf/utils → evalscope-0.17.1/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5}/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/score.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/vqascore.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/custom/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/custom/custom_model.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/local_model.py +0 -0
- {evalscope-0.16.3/tests/rag → evalscope-0.17.1/evalscope/perf}/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/utils/handler.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/utils/local_server.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/utils/log_utils.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/utils/rich_display.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/report/combinator.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/report/generator.py +0 -0
- {evalscope-0.16.3/evalscope/evaluator/reviewer → evalscope-0.17.1/evalscope/third_party}/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/README.md +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/default_task.json +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/eval.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/infer.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
- {evalscope-0.16.3/evalscope/registry → evalscope-0.17.1/evalscope/third_party/longbench_write/resources}/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
- {evalscope-0.16.3/evalscope/third_party → evalscope-0.17.1/evalscope/third_party/longbench_write/tools}/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/utils.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/thinkbench/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/thinkbench/eval.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/thinkbench/infer.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/README.md +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/config_default.json +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.16.3/evalscope/third_party/longbench_write/resources → evalscope-0.17.1/evalscope/third_party/toolbench_static/llm}/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/utils/chat_service.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/utils/deprecation_utils.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/utils/logger.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/requirements/aigc.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/requirements/app.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/requirements/docs.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/requirements/opencompass.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/requirements/perf.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/requirements/rag.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/requirements/vlmeval.txt +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/requirements.txt +0 -0
- {evalscope-0.16.3/evalscope/third_party/longbench_write/tools → evalscope-0.17.1/tests}/__init__.py +0 -0
- {evalscope-0.16.3/evalscope/third_party/toolbench_static/llm → evalscope-0.17.1/tests/aigc}/__init__.py +0 -0
- {evalscope-0.16.3/tests → evalscope-0.17.1/tests/cli}/__init__.py +0 -0
- {evalscope-0.16.3/tests/aigc → evalscope-0.17.1/tests/perf}/__init__.py +0 -0
- {evalscope-0.16.3/tests/cli → evalscope-0.17.1/tests/swift}/__init__.py +0 -0
- {evalscope-0.16.3 → evalscope-0.17.1}/tests/test_run_all.py +0 -0
- {evalscope-0.16.3/tests/perf → evalscope-0.17.1/tests/vlm}/__init__.py +0 -0
|
@@ -1,130 +1,31 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: evalscope
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.17.1
|
|
4
4
|
Summary: EvalScope: Lightweight LLMs Evaluation Framework
|
|
5
5
|
Home-page: https://github.com/modelscope/evalscope
|
|
6
6
|
Author: ModelScope team
|
|
7
7
|
Author-email: contact@modelscope.cn
|
|
8
|
+
License: Apache License 2.0
|
|
8
9
|
Keywords: python,llm,evaluation
|
|
9
10
|
Classifier: Development Status :: 4 - Beta
|
|
10
|
-
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
11
|
Classifier: Operating System :: OS Independent
|
|
12
12
|
Classifier: Programming Language :: Python :: 3
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
14
13
|
Classifier: Programming Language :: Python :: 3.9
|
|
15
14
|
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
-
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Requires-Python: >=3.9
|
|
17
18
|
Description-Content-Type: text/markdown
|
|
18
|
-
License-File: LICENSE
|
|
19
|
-
Requires-Dist: accelerate
|
|
20
|
-
Requires-Dist: datasets>=3.0
|
|
21
|
-
Requires-Dist: immutabledict
|
|
22
|
-
Requires-Dist: jieba
|
|
23
|
-
Requires-Dist: jsonlines
|
|
24
|
-
Requires-Dist: langdetect
|
|
25
|
-
Requires-Dist: latex2sympy2_extended
|
|
26
|
-
Requires-Dist: matplotlib
|
|
27
|
-
Requires-Dist: modelscope[framework]
|
|
28
|
-
Requires-Dist: nltk>=3.9
|
|
29
|
-
Requires-Dist: openai
|
|
30
|
-
Requires-Dist: pandas
|
|
31
|
-
Requires-Dist: pillow
|
|
32
|
-
Requires-Dist: pyarrow
|
|
33
|
-
Requires-Dist: pyyaml>=5.1
|
|
34
|
-
Requires-Dist: requests
|
|
35
|
-
Requires-Dist: rouge-chinese
|
|
36
|
-
Requires-Dist: rouge-score>=0.1.0
|
|
37
|
-
Requires-Dist: sacrebleu
|
|
38
|
-
Requires-Dist: scikit-learn
|
|
39
|
-
Requires-Dist: seaborn
|
|
40
|
-
Requires-Dist: sympy
|
|
41
|
-
Requires-Dist: tabulate
|
|
42
|
-
-Requires-Dist: torch
-Requires-Dist: tqdm
-Requires-Dist: transformers>=4.33
-Requires-Dist: word2number
 Provides-Extra: opencompass
-Requires-Dist: ms-opencompass>=0.1.6; extra == "opencompass"
 Provides-Extra: vlmeval
-Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
 Provides-Extra: rag
-Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
-Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
-Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
-Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
-Requires-Dist: mteb==1.38.20; extra == "rag"
-Requires-Dist: ragas==0.2.14; extra == "rag"
-Requires-Dist: webdataset>0.2.0; extra == "rag"
 Provides-Extra: perf
-Requires-Dist: aiohttp; extra == "perf"
-Requires-Dist: fastapi; extra == "perf"
-Requires-Dist: numpy; extra == "perf"
-Requires-Dist: rich; extra == "perf"
-Requires-Dist: sse_starlette; extra == "perf"
-Requires-Dist: transformers; extra == "perf"
-Requires-Dist: uvicorn; extra == "perf"
 Provides-Extra: app
-Requires-Dist: gradio==5.4.0; extra == "app"
-Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
 Provides-Extra: aigc
-
-
-Requires-Dist: omegaconf; extra == "aigc"
-Requires-Dist: open_clip_torch; extra == "aigc"
-Requires-Dist: opencv-python; extra == "aigc"
-Requires-Dist: torchvision; extra == "aigc"
+Provides-Extra: dev
+Provides-Extra: docs
 Provides-Extra: all
-
-Requires-Dist: datasets>=3.0; extra == "all"
-Requires-Dist: immutabledict; extra == "all"
-Requires-Dist: jieba; extra == "all"
-Requires-Dist: jsonlines; extra == "all"
-Requires-Dist: langdetect; extra == "all"
-Requires-Dist: latex2sympy2_extended; extra == "all"
-Requires-Dist: matplotlib; extra == "all"
-Requires-Dist: modelscope[framework]; extra == "all"
-Requires-Dist: nltk>=3.9; extra == "all"
-Requires-Dist: openai; extra == "all"
-Requires-Dist: pandas; extra == "all"
-Requires-Dist: pillow; extra == "all"
-Requires-Dist: pyarrow; extra == "all"
-Requires-Dist: pyyaml>=5.1; extra == "all"
-Requires-Dist: requests; extra == "all"
-Requires-Dist: rouge-chinese; extra == "all"
-Requires-Dist: rouge-score>=0.1.0; extra == "all"
-Requires-Dist: sacrebleu; extra == "all"
-Requires-Dist: scikit-learn; extra == "all"
-Requires-Dist: seaborn; extra == "all"
-Requires-Dist: sympy; extra == "all"
-Requires-Dist: tabulate; extra == "all"
-Requires-Dist: torch; extra == "all"
-Requires-Dist: tqdm; extra == "all"
-Requires-Dist: transformers>=4.33; extra == "all"
-Requires-Dist: word2number; extra == "all"
-Requires-Dist: ms-opencompass>=0.1.6; extra == "all"
-Requires-Dist: ms-vlmeval>=0.0.17; extra == "all"
-Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
-Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "all"
-Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "all"
-Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "all"
-Requires-Dist: mteb==1.38.20; extra == "all"
-Requires-Dist: ragas==0.2.14; extra == "all"
-Requires-Dist: webdataset>0.2.0; extra == "all"
-Requires-Dist: aiohttp; extra == "all"
-Requires-Dist: fastapi; extra == "all"
-Requires-Dist: numpy; extra == "all"
-Requires-Dist: rich; extra == "all"
-Requires-Dist: sse_starlette; extra == "all"
-Requires-Dist: transformers; extra == "all"
-Requires-Dist: uvicorn; extra == "all"
-Requires-Dist: gradio==5.4.0; extra == "all"
-Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
-Requires-Dist: diffusers; extra == "all"
-Requires-Dist: iopath; extra == "all"
-Requires-Dist: omegaconf; extra == "all"
-Requires-Dist: open_clip_torch; extra == "all"
-Requires-Dist: opencv-python; extra == "all"
-Requires-Dist: torchvision; extra == "all"
+License-File: LICENSE
 
 <p align="center">
 <br>
@@ -165,16 +66,17 @@ Requires-Dist: torchvision; extra == "all"
 - [Basic Parameter](#basic-parameter)
 - [Output Results](#output-results)
 - [📈 Visualization of Evaluation Results](#-visualization-of-evaluation-results)
-- [🌐 Evaluation of
+- [🌐 Evaluation of Model API](#-evaluation-of-model-api)
 - [⚙️ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
-- [Parameter](#parameter)
-- [Evaluation
+- [Parameter Description](#parameter-description)
+- [🧪 Other Evaluation Backends](#-other-evaluation-backends)
 - [📈 Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
 - [🖊️ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
-- [
+- [⚔️ Arena Mode](#️-arena-mode)
 - [👷♂️ Contribution](#️-contribution)
+- [📚 Citation](#-citation)
 - [🔜 Roadmap](#-roadmap)
-- [Star History](
+- [⭐ Star History](#-star-history)


 ## 📝 Introduction
@@ -198,24 +100,33 @@ EvalScope is not merely an evaluation tool; it is a valuable ally in your model
 Below is the overall architecture diagram of EvalScope:

 <p align="center">
-<img src="
+<img src="https://sail-moe.oss-cn-hangzhou.aliyuncs.com/yunlin/images/evalscope/doc/EvalScope%E6%9E%B6%E6%9E%84%E5%9B%BE.png" width="70%">
 <br>EvalScope Framework.
 </p>

 <details><summary>Framework Description</summary>

 The architecture includes the following modules:
-1.
-
-
-
-
-
-
-
-
-
+1. Input Layer
+- **Model Sources**: API models (OpenAI API), local models (ModelScope)
+- **Datasets**: Standard evaluation benchmarks (MMLU/GSM8k, etc.), custom data (MCQ/QA)
+
+2. Core Functions
+- **Multi-backend Evaluation**
+- Native backends: Unified evaluation for LLM/VLM/Embedding/T2I models
+- Integrated frameworks: OpenCompass/MTEB/VLMEvalKit/RAGAS
+
+- **Performance Monitoring**
+- Model plugins: Supports various model service APIs
+- Data plugins: Supports multiple data formats
+- Metric tracking: TTFT/TPOP/Stability and other metrics
+
+- **Tool Extensions**
+- Integration: Tool-Bench/Needle-in-a-Haystack/BFCL-v3
+
+3. Output Layer
+- **Structured Reports**: Supports JSON/Tables/Logs
+- **Visualization Platforms**: Supports Gradio/Wandb/SwanLab

 </details>

@@ -229,8 +140,12 @@ Please scan the QR code below to join our community groups:


 ## 🎉 News
-
-- 🔥 **[2025.
+- 🔥 **[2025.07.18]** The model stress testing now supports randomly generating image-text data for multimodal model evaluation. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#id4).
+- 🔥 **[2025.07.16]** Support for [τ-bench](https://github.com/sierra-research/tau-bench) has been added, enabling the evaluation of AI Agent performance and reliability in real-world scenarios involving dynamic user and tool interactions. For usage instructions, please refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/llm.html#bench).
+- 🔥 **[2025.07.14]** Support for "Humanity's Last Exam" ([Humanity's-Last-Exam](https://modelscope.cn/datasets/cais/hle)), a highly challenging evaluation benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/llm.html#humanity-s-last-exam).
+- 🔥 **[2025.07.03]** Refactored Arena Mode: now supports custom model battles, outputs a model leaderboard, and provides battle result visualization. See [reference](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html) for details.
+- 🔥 **[2025.06.28]** Optimized custom dataset evaluation: now supports evaluation without reference answers. Enhanced LLM judge usage, with built-in modes for "scoring directly without reference answers" and "checking answer consistency with reference answers". See [reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#qa) for details.
+- 🔥 **[2025.06.19]** Added support for the [BFCL-v3](https://modelscope.cn/datasets/AI-ModelScope/bfcl_v3) benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
 - 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
 - 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
 - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
@@ -239,6 +154,8 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
 - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
 - 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
+<details><summary>More</summary>
+
 - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
 - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
 - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
@@ -252,8 +169,6 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
-<details><summary>More</summary>
-
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
@@ -345,33 +260,31 @@ evalscope eval \

 When using Python code for evaluation, you need to submit the evaluation task using the `run_task` function, passing a `TaskConfig` as a parameter. It can also be a Python dictionary, yaml file path, or json file path, for example:

-**Using
+**Using `TaskConfig`**

 ```python
-from evalscope
+from evalscope import run_task, TaskConfig

-task_cfg =
-
-
-
-
+task_cfg = TaskConfig(
+    model='Qwen/Qwen2.5-0.5B-Instruct',
+    datasets=['gsm8k', 'arc'],
+    limit=5
+)

 run_task(task_cfg=task_cfg)
 ```
-
 <details><summary>More Startup Methods</summary>

-**Using
+**Using Python Dictionary**

 ```python
 from evalscope.run import run_task
-from evalscope.config import TaskConfig

-task_cfg =
-model
-datasets
-limit
-
+task_cfg = {
+    'model': 'Qwen/Qwen2.5-0.5B-Instruct',
+    'datasets': ['gsm8k', 'arc'],
+    'limit': 5
+}

 run_task(task_cfg=task_cfg)
 ```
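The paragraph in the hunk above also allows a YAML or JSON file path to be passed to `run_task`, but the diff only shows the `TaskConfig` and dictionary variants. As an illustration only, with a hypothetical file name and keys that simply mirror the dictionary example, the file-path form would look like this:

```python
# Hypothetical config file mirroring the dictionary example above (eval_config.yaml):
#   model: Qwen/Qwen2.5-0.5B-Instruct
#   datasets: [gsm8k, arc]
#   limit: 5
from evalscope.run import run_task

run_task(task_cfg='eval_config.yaml')  # a .json path is stated to work the same way
```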
@@ -474,7 +387,7 @@ To create a public link, set `share=True` in `launch()`.

 For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)

-## 🌐 Evaluation of
+## 🌐 Evaluation of Model API

 Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:

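For the API-service case described in the hunk above, a minimal Python sketch is shown below. The prose names `api_url`, `api_key`, and an `eval-type` of `service`; the keyword spellings (`eval_type`, `api_url`, `api_key`) and the endpoint, key, model, and dataset values here are assumptions made purely for illustration.

```python
from evalscope import run_task, TaskConfig

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',          # model name exposed by the deployed service (placeholder)
    eval_type='service',                  # evaluate an API service instead of local weights
    api_url='http://127.0.0.1:8801/v1',   # OpenAI-compatible endpoint (placeholder)
    api_key='EMPTY',                      # placeholder key
    datasets=['gsm8k'],
    limit=5,
)
run_task(task_cfg=task_cfg)
```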
@@ -525,7 +438,7 @@ evalscope eval \
 Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)


-## Evaluation
+## 🧪 Other Evaluation Backends
 EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
 - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
 - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
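To route a task to one of these backends from Python, the configuration presumably selects the backend and carries backend-specific settings. The sketch below is illustrative only: the `eval_backend` and `eval_config` keywords and the keys inside `eval_config` are assumptions, so the linked backend user guides should be treated as authoritative.

```python
from evalscope import run_task, TaskConfig

task_cfg = TaskConfig(
    eval_backend='OpenCompass',   # hand the task off to a third-party backend
    eval_config={                 # backend-specific settings (hypothetical keys)
        'datasets': ['gsm8k'],
        'models': [{'path': 'Qwen/Qwen2.5-0.5B-Instruct'}],
    },
)
run_task(task_cfg=task_cfg)
```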
@@ -572,10 +485,17 @@ Speed Benchmark Results:
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html)


-##
-The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.
+## ⚔️ Arena Mode

-Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)
+Arena mode allows you to configure multiple candidate models and specify a baseline model. Evaluation is performed by pairwise battles between each candidate model and the baseline model, with the final output including each model's win rate and ranking. This method is suitable for comparative evaluation among multiple models, providing an intuitive reflection of each model's strengths and weaknesses. Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)
+
+```text
+Model        WinRate (%)   CI (%)
+------------ ------------- ---------------
+qwen2.5-72b  69.3          (-13.3 / +12.2)
+qwen2.5-7b   50            (+0.0 / +0.0)
+qwen2.5-0.5b 4.7           (-2.5 / +4.4)
+```

 ## 👷♂️ Contribution

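The CI column above reports lower/upper offsets around each win rate. One common way to obtain such intervals is to bootstrap over the per-battle outcomes; the sketch below is a generic illustration of that idea, not EvalScope's actual implementation.

```python
# Generic bootstrap confidence interval for a win rate (illustration only).
import random

def winrate_ci(outcomes, n_boot=2000, alpha=0.05, seed=0):
    """outcomes: per-battle scores vs. the baseline (1 = win, 0.5 = tie, 0 = loss)."""
    rng = random.Random(seed)
    point = sum(outcomes) / len(outcomes)
    samples = sorted(
        sum(rng.choices(outcomes, k=len(outcomes))) / len(outcomes)
        for _ in range(n_boot)
    )
    lower = samples[int(n_boot * alpha / 2)]
    upper = samples[int(n_boot * (1 - alpha / 2)) - 1]
    # point estimate plus (lower, upper) offsets, in the same shape as the CI (%) column
    return point, lower - point, upper - point

print(winrate_ci([1, 1, 0.5, 0, 1, 1, 0, 1]))
```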
@@ -591,6 +511,17 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 </table>
 </a>

+## 📚 Citation
+
+```bibtex
+@misc{evalscope_2024,
+    title={{EvalScope}: Evaluation Framework for Large Models},
+    author={ModelScope Team},
+    year={2024},
+    url={https://github.com/modelscope/evalscope}
+}
+```
+
 ## 🔜 Roadmap
 - [x] Support for better evaluation report visualization
 - [x] Support for mixed evaluations across multiple datasets
@@ -601,11 +532,11 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 - [ ] Distributed evaluating
 - [x] Multi-modal evaluation
 - [ ] Benchmarks
-- [
+- [x] BFCL-v3
 - [x] GPQA
 - [x] MBPP


-## Star History
+## ⭐ Star History

 [](https://star-history.com/#modelscope/evalscope&Date)
@@ -37,16 +37,17 @@
 - [Basic Parameter](#basic-parameter)
 - [Output Results](#output-results)
 - [📈 Visualization of Evaluation Results](#-visualization-of-evaluation-results)
-- [🌐 Evaluation of
+- [🌐 Evaluation of Model API](#-evaluation-of-model-api)
 - [⚙️ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
-- [Parameter](#parameter)
-- [Evaluation
+- [Parameter Description](#parameter-description)
+- [🧪 Other Evaluation Backends](#-other-evaluation-backends)
 - [📈 Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
 - [🖊️ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
-- [
+- [⚔️ Arena Mode](#️-arena-mode)
 - [👷♂️ Contribution](#️-contribution)
+- [📚 Citation](#-citation)
 - [🔜 Roadmap](#-roadmap)
-- [Star History](
+- [⭐ Star History](#-star-history)


 ## 📝 Introduction
@@ -70,24 +71,33 @@ EvalScope is not merely an evaluation tool; it is a valuable ally in your model
 Below is the overall architecture diagram of EvalScope:

 <p align="center">
-<img src="
+<img src="https://sail-moe.oss-cn-hangzhou.aliyuncs.com/yunlin/images/evalscope/doc/EvalScope%E6%9E%B6%E6%9E%84%E5%9B%BE.png" width="70%">
 <br>EvalScope Framework.
 </p>

 <details><summary>Framework Description</summary>

 The architecture includes the following modules:
-1.
-
-
-
-
-
-
-
-
-
+1. Input Layer
+- **Model Sources**: API models (OpenAI API), local models (ModelScope)
+- **Datasets**: Standard evaluation benchmarks (MMLU/GSM8k, etc.), custom data (MCQ/QA)
+
+2. Core Functions
+- **Multi-backend Evaluation**
+- Native backends: Unified evaluation for LLM/VLM/Embedding/T2I models
+- Integrated frameworks: OpenCompass/MTEB/VLMEvalKit/RAGAS
+
+- **Performance Monitoring**
+- Model plugins: Supports various model service APIs
+- Data plugins: Supports multiple data formats
+- Metric tracking: TTFT/TPOP/Stability and other metrics
+
+- **Tool Extensions**
+- Integration: Tool-Bench/Needle-in-a-Haystack/BFCL-v3
+
+3. Output Layer
+- **Structured Reports**: Supports JSON/Tables/Logs
+- **Visualization Platforms**: Supports Gradio/Wandb/SwanLab

 </details>

@@ -101,8 +111,12 @@ Please scan the QR code below to join our community groups:


 ## 🎉 News
-
-- 🔥 **[2025.
+- 🔥 **[2025.07.18]** The model stress testing now supports randomly generating image-text data for multimodal model evaluation. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#id4).
+- 🔥 **[2025.07.16]** Support for [τ-bench](https://github.com/sierra-research/tau-bench) has been added, enabling the evaluation of AI Agent performance and reliability in real-world scenarios involving dynamic user and tool interactions. For usage instructions, please refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/llm.html#bench).
+- 🔥 **[2025.07.14]** Support for "Humanity's Last Exam" ([Humanity's-Last-Exam](https://modelscope.cn/datasets/cais/hle)), a highly challenging evaluation benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/llm.html#humanity-s-last-exam).
+- 🔥 **[2025.07.03]** Refactored Arena Mode: now supports custom model battles, outputs a model leaderboard, and provides battle result visualization. See [reference](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html) for details.
+- 🔥 **[2025.06.28]** Optimized custom dataset evaluation: now supports evaluation without reference answers. Enhanced LLM judge usage, with built-in modes for "scoring directly without reference answers" and "checking answer consistency with reference answers". See [reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#qa) for details.
+- 🔥 **[2025.06.19]** Added support for the [BFCL-v3](https://modelscope.cn/datasets/AI-ModelScope/bfcl_v3) benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
 - 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
 - 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
 - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
@@ -111,6 +125,8 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
 - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
 - 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
+<details><summary>More</summary>
+
 - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
 - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
 - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
@@ -124,8 +140,6 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
-<details><summary>More</summary>
-
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
@@ -217,33 +231,31 @@ evalscope eval \

 When using Python code for evaluation, you need to submit the evaluation task using the `run_task` function, passing a `TaskConfig` as a parameter. It can also be a Python dictionary, yaml file path, or json file path, for example:

-**Using
+**Using `TaskConfig`**

 ```python
-from evalscope
+from evalscope import run_task, TaskConfig

-task_cfg =
-
-
-
-
+task_cfg = TaskConfig(
+    model='Qwen/Qwen2.5-0.5B-Instruct',
+    datasets=['gsm8k', 'arc'],
+    limit=5
+)

 run_task(task_cfg=task_cfg)
 ```
-
 <details><summary>More Startup Methods</summary>

-**Using
+**Using Python Dictionary**

 ```python
 from evalscope.run import run_task
-from evalscope.config import TaskConfig

-task_cfg =
-model
-datasets
-limit
-
+task_cfg = {
+    'model': 'Qwen/Qwen2.5-0.5B-Instruct',
+    'datasets': ['gsm8k', 'arc'],
+    'limit': 5
+}

 run_task(task_cfg=task_cfg)
 ```
@@ -346,7 +358,7 @@ To create a public link, set `share=True` in `launch()`.

 For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)

-## 🌐 Evaluation of
+## 🌐 Evaluation of Model API

 Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:

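Before launching a `service` evaluation against the address and key described above, it can help to confirm that the endpoint actually answers. This pre-flight check is an editorial illustration rather than part of EvalScope; it uses the `openai` client (listed among the project's dependencies), and the endpoint and key below are placeholders.

```python
from openai import OpenAI

client = OpenAI(base_url='http://127.0.0.1:8801/v1', api_key='EMPTY')  # placeholders
print([m.id for m in client.models.list().data])  # served model names to pass as `model`
```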
@@ -397,7 +409,7 @@ evalscope eval \
 Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)


-## Evaluation
+## 🧪 Other Evaluation Backends
 EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
 - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
 - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
@@ -444,10 +456,17 @@ Speed Benchmark Results:
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html)


-##
-The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.
+## ⚔️ Arena Mode

-Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)
+Arena mode allows you to configure multiple candidate models and specify a baseline model. Evaluation is performed by pairwise battles between each candidate model and the baseline model, with the final output including each model's win rate and ranking. This method is suitable for comparative evaluation among multiple models, providing an intuitive reflection of each model's strengths and weaknesses. Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)
+
+```text
+Model        WinRate (%)   CI (%)
+------------ ------------- ---------------
+qwen2.5-72b  69.3          (-13.3 / +12.2)
+qwen2.5-7b   50            (+0.0 / +0.0)
+qwen2.5-0.5b 4.7           (-2.5 / +4.4)
+```

 ## 👷♂️ Contribution

@@ -463,6 +482,17 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 </table>
 </a>

+## 📚 Citation
+
+```bibtex
+@misc{evalscope_2024,
+    title={{EvalScope}: Evaluation Framework for Large Models},
+    author={ModelScope Team},
+    year={2024},
+    url={https://github.com/modelscope/evalscope}
+}
+```
+
 ## 🔜 Roadmap
 - [x] Support for better evaluation report visualization
 - [x] Support for mixed evaluations across multiple datasets
@@ -473,11 +503,11 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 - [ ] Distributed evaluating
 - [x] Multi-modal evaluation
 - [ ] Benchmarks
-- [
+- [x] BFCL-v3
 - [x] GPQA
 - [x] MBPP


-## Star History
+## ⭐ Star History

 [](https://star-history.com/#modelscope/evalscope&Date)
@@ -0,0 +1,35 @@
+"""
+Main application module for the Evalscope dashboard.
+"""
+import argparse
+
+from evalscope.utils.logger import configure_logging
+from .arguments import add_argument
+from .ui import create_app_ui
+
+
+def create_app(args: argparse.Namespace):
+    """
+    Create and launch the Evalscope dashboard application.
+
+    Args:
+        args: Command line arguments.
+    """
+    configure_logging(debug=args.debug)
+
+    demo = create_app_ui(args)
+
+    demo.launch(
+        share=args.share,
+        server_name=args.server_name,
+        server_port=args.server_port,
+        debug=args.debug,
+        allowed_paths=args.allowed_paths,
+    )
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    add_argument(parser)
+    args = parser.parse_args()
+    create_app(args)
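Judging from the new module above, the dashboard can also be started programmatically by reusing the same argument parser. The sketch below is a guess at such a smoke test: it assumes every option registered by `add_argument` carries a default, and that the import paths follow the file locations (`evalscope/app/app.py` and the `from .arguments import add_argument` shown in the diff).

```python
import argparse

from evalscope.app.app import create_app          # path inferred from the new file above
from evalscope.app.arguments import add_argument  # implied by `from .arguments import add_argument`

parser = argparse.ArgumentParser()
add_argument(parser)
create_app(parser.parse_args([]))  # [] -> rely on the registered defaults
```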