evalscope 0.17.0__tar.gz → 0.17.1__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of evalscope might be problematic.
- {evalscope-0.17.0/evalscope.egg-info → evalscope-0.17.1}/PKG-INFO +44 -30
- {evalscope-0.17.0 → evalscope-0.17.1}/README.md +38 -26
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/data_adapter.py +9 -4
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -1
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/general_qa/general_qa_adapter.py +2 -1
- evalscope-0.17.1/evalscope/benchmarks/hle/hle_adapter.py +118 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
- evalscope-0.17.1/evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/utils.py +1 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/constants.py +5 -21
- evalscope-0.17.1/evalscope/evaluator/__init__.py +3 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/evaluator/evaluator.py +5 -3
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/__init__.py +3 -1
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/completion_parsers.py +7 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/llm_judge.py +6 -5
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/metrics.py +19 -7
- {evalscope-0.17.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models → evalscope-0.17.1/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward}/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/perf/utils → evalscope-0.17.1/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/__init__.py +4 -8
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/adapters/__init__.py +4 -9
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/adapters/base_adapter.py +4 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/adapters/bfcl_adapter.py +2 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/adapters/chat_adapter.py +3 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/adapters/choice_adapter.py +4 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/adapters/custom_adapter.py +7 -3
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/adapters/server_adapter.py +2 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/adapters/t2i_adapter.py +3 -0
- evalscope-0.17.1/evalscope/models/adapters/tau_bench_adapter.py +189 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/register.py +0 -14
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/arguments.py +13 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/benchmark.py +38 -39
- evalscope-0.17.1/evalscope/perf/http_client.py +120 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/main.py +2 -2
- evalscope-0.17.1/evalscope/perf/plugin/__init__.py +3 -0
- evalscope-0.17.1/evalscope/perf/plugin/api/__init__.py +4 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/plugin/api/base.py +22 -4
- evalscope-0.17.1/evalscope/perf/plugin/api/custom_api.py +249 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope-0.17.1/evalscope/perf/plugin/api/default_api.py +105 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/plugin/api/openai_api.py +17 -19
- evalscope-0.17.1/evalscope/perf/plugin/datasets/__init__.py +10 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/base.py +22 -1
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/custom.py +2 -1
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope-0.17.1/evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/openqa.py +2 -1
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope-0.17.1/evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope-0.17.1/evalscope/perf/plugin/registry.py +74 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/utils/benchmark_util.py +14 -20
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/utils/db_util.py +79 -61
- evalscope-0.17.1/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/utils/io_utils.py +10 -0
- evalscope-0.17.1/evalscope/version.py +4 -0
- {evalscope-0.17.0 → evalscope-0.17.1/evalscope.egg-info}/PKG-INFO +44 -30
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope.egg-info/SOURCES.txt +8 -1
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope.egg-info/requires.txt +12 -4
- {evalscope-0.17.0 → evalscope-0.17.1}/requirements/app.txt +1 -1
- {evalscope-0.17.0 → evalscope-0.17.1}/requirements/dev.txt +1 -1
- {evalscope-0.17.0 → evalscope-0.17.1}/setup.cfg +1 -1
- {evalscope-0.17.0 → evalscope-0.17.1}/setup.py +33 -15
- {evalscope-0.17.0 → evalscope-0.17.1}/tests/cli/test_all.py +18 -2
- {evalscope-0.17.0 → evalscope-0.17.1}/tests/cli/test_run.py +25 -37
- {evalscope-0.17.0 → evalscope-0.17.1}/tests/perf/test_perf.py +29 -2
- evalscope-0.17.1/tests/rag/__init__.py +0 -0
- evalscope-0.17.0/evalscope/evaluator/__init__.py +0 -3
- evalscope-0.17.0/evalscope/models/model.py +0 -189
- evalscope-0.17.0/evalscope/perf/http_client.py +0 -176
- evalscope-0.17.0/evalscope/perf/plugin/__init__.py +0 -2
- evalscope-0.17.0/evalscope/perf/plugin/api/__init__.py +0 -3
- evalscope-0.17.0/evalscope/perf/plugin/api/custom_api.py +0 -92
- evalscope-0.17.0/evalscope/perf/plugin/datasets/__init__.py +0 -7
- evalscope-0.17.0/evalscope/perf/plugin/registry.py +0 -54
- evalscope-0.17.0/evalscope/version.py +0 -4
- {evalscope-0.17.0 → evalscope-0.17.1}/LICENSE +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/MANIFEST.in +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/app.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/arguments.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/constants.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/ui/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/ui/app_ui.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/ui/multi_model.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/ui/sidebar.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/ui/single_model.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/ui/visualization.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/utils/data_utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/utils/localization.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/utils/text_utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/utils/visualization.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/arguments.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/base.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/opencompass/backend_manager.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/backend_manager.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/utils/clip.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/utils/llm.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/utils/tools.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/aigc/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/base.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/aime/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/aime/aime24_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/aime/aime25_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/arena_hard/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/arena_hard/utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/benchmark.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bfcl/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/data_collection/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/docmath/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/docmath/docmath_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/docmath/utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/drop/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/drop/drop_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/drop/utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/filters.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/frames/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/frames/frames_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/frames/utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/general_arena/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/general_arena/general_arena_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/general_arena/utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/general_mcq/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/gpqa/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/gpqa/gpqa_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/ifeval → evalscope-0.17.1/evalscope/benchmarks/hle}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/iquiz → evalscope-0.17.1/evalscope/benchmarks/ifeval}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/ifeval/ifeval_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/ifeval/instructions.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/ifeval/utils.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/live_code_bench → evalscope-0.17.1/evalscope/benchmarks/iquiz}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/iquiz/iquiz_adapter.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/maritime_bench → evalscope-0.17.1/evalscope/benchmarks/live_code_bench}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/load_utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/testing_util.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/math_500 → evalscope-0.17.1/evalscope/benchmarks/maritime_bench}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/mmlu_pro → evalscope-0.17.1/evalscope/benchmarks/math_500}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/math_500/math_500_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/mmlu_redux → evalscope-0.17.1/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/musr → evalscope-0.17.1/evalscope/benchmarks/mmlu_redux}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/needle_haystack → evalscope-0.17.1/evalscope/benchmarks/musr}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/musr/musr_adapter.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/process_bench → evalscope-0.17.1/evalscope/benchmarks/needle_haystack}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/needle_haystack/utils.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/simple_qa → evalscope-0.17.1/evalscope/benchmarks/process_bench}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/process_bench/critique_template.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/process_bench/process_bench_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/race/race.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/race/race_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/race/samples.jsonl +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/super_gpqa → evalscope-0.17.1/evalscope/benchmarks/simple_qa}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/tool_bench → evalscope-0.17.1/evalscope/benchmarks/super_gpqa}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/super_gpqa/utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -0
- {evalscope-0.17.0/evalscope/benchmarks/winogrande → evalscope-0.17.1/evalscope/benchmarks/tau_bench}/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/metrics/t2v_metrics/models → evalscope-0.17.1/evalscope/benchmarks/tool_bench}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/tool_bench/utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
- {evalscope-0.17.0/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model → evalscope-0.17.1/evalscope/benchmarks/winogrande}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/winogrande/winogrande_adapter.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/cli/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/cli/base.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/cli/cli.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/cli/start_app.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/cli/start_eval.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/cli/start_server.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/collections/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/collections/evaluator.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/collections/sampler.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/collections/schema.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/config.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/math_parser.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/named_metrics.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/rouge_metric.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/clipscore.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/constants.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/itmscore.py +0 -0
- {evalscope-0.17.0/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward → evalscope-0.17.1/evalscope/metrics/t2v_metrics/models}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5 → evalscope-0.17.1/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/model.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +0 -0
- {evalscope-0.17.0/evalscope/perf → evalscope-0.17.1/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/score.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/vqascore.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/custom/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/custom/custom_model.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/custom/dummy_model.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/local_model.py +0 -0
- {evalscope-0.17.0/evalscope/third_party/thinkbench/tools → evalscope-0.17.1/evalscope/perf}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
- {evalscope-0.17.0/tests/rag → evalscope-0.17.1/evalscope/perf/utils}/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/utils/analysis_result.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/utils/handler.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/utils/local_server.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/utils/log_utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/utils/rich_display.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/report/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/report/combinator.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/report/generator.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/report/utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/run.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/summarizer.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/README.md +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/default_task.json +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/eval.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/infer.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/thinkbench/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/thinkbench/eval.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/thinkbench/infer.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/README.md +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/config_default.json +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/utils/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/utils/argument_utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/utils/chat_service.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/utils/deprecation_utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/utils/import_utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/utils/logger.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/utils/model_utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/requirements/aigc.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/requirements/docs.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/requirements/framework.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/requirements/opencompass.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/requirements/perf.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/requirements/rag.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/requirements/vlmeval.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/requirements.txt +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/tests/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/tests/aigc/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/tests/aigc/test_t2i.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/tests/cli/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/tests/cli/test_collection.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/tests/cli/test_custom.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/tests/perf/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/tests/rag/test_clip_benchmark.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/tests/rag/test_mteb.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/tests/rag/test_ragas.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/tests/swift/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/tests/swift/test_run_swift_eval.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/tests/swift/test_run_swift_vlm_eval.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/tests/test_run_all.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/tests/utils.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/tests/vlm/__init__.py +0 -0
- {evalscope-0.17.0 → evalscope-0.17.1}/tests/vlm/test_vlmeval.py +0 -0
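The file list above adds two new benchmark adapters (evalscope/benchmarks/hle/hle_adapter.py and evalscope/benchmarks/tau_bench/tau_bench_adapter.py). A minimal sketch of how they might be invoked through the existing `run_task`/`TaskConfig` API follows; the dataset names `hle` and `tau_bench` are assumptions inferred from the adapter file names and are not confirmed by this diff.

```python
# Hedged sketch: dataset names 'hle' and 'tau_bench' are assumed from the new
# adapter file names in this release; verify them against the evalscope docs.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # any locally loadable or API-served chat model
    datasets=['hle', 'tau_bench'],       # assumed registry names for the new benchmarks
    limit=5,                             # small sample for a quick smoke test
)
run_task(task_cfg=task_cfg)
```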
@@ -1,19 +1,20 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.17.0
+ Version: 0.17.1
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
  Author-email: contact@modelscope.cn
+ License: Apache License 2.0
  Keywords: python,llm,evaluation
  Classifier: Development Status :: 4 - Beta
- Classifier: License :: OSI Approved :: Apache Software License
  Classifier: Operating System :: OS Independent
  Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
-
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Requires-Python: >=3.9
  Description-Content-Type: text/markdown
  Provides-Extra: opencompass
  Provides-Extra: vlmeval
@@ -22,6 +23,7 @@ Provides-Extra: perf
  Provides-Extra: app
  Provides-Extra: aigc
  Provides-Extra: dev
+ Provides-Extra: docs
  Provides-Extra: all
  License-File: LICENSE

@@ -64,16 +66,17 @@ License-File: LICENSE
  - [Basic Parameter](#basic-parameter)
  - [Output Results](#output-results)
  - [📈 Visualization of Evaluation Results](#-visualization-of-evaluation-results)
- - [🌐 Evaluation of
+ - [🌐 Evaluation of Model API](#-evaluation-of-model-api)
  - [⚙️ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
- - [Parameter](#parameter)
- - [Evaluation
+ - [Parameter Description](#parameter-description)
+ - [🧪 Other Evaluation Backends](#-other-evaluation-backends)
  - [📈 Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
  - [🖊️ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
- - [
+ - [⚔️ Arena Mode](#️-arena-mode)
  - [👷♂️ Contribution](#️-contribution)
+ - [📚 Citation](#-citation)
  - [🔜 Roadmap](#-roadmap)
- - [Star History](
+ - [⭐ Star History](#-star-history)


  ## 📝 Introduction
@@ -137,7 +140,9 @@ Please scan the QR code below to join our community groups:


  ## 🎉 News
-
+ - 🔥 **[2025.07.18]** The model stress testing now supports randomly generating image-text data for multimodal model evaluation. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#id4).
+ - 🔥 **[2025.07.16]** Support for [τ-bench](https://github.com/sierra-research/tau-bench) has been added, enabling the evaluation of AI Agent performance and reliability in real-world scenarios involving dynamic user and tool interactions. For usage instructions, please refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/llm.html#bench).
+ - 🔥 **[2025.07.14]** Support for "Humanity's Last Exam" ([Humanity's-Last-Exam](https://modelscope.cn/datasets/cais/hle)), a highly challenging evaluation benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/llm.html#humanity-s-last-exam).
  - 🔥 **[2025.07.03]** Refactored Arena Mode: now supports custom model battles, outputs a model leaderboard, and provides battle result visualization. See [reference](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html) for details.
  - 🔥 **[2025.06.28]** Optimized custom dataset evaluation: now supports evaluation without reference answers. Enhanced LLM judge usage, with built-in modes for "scoring directly without reference answers" and "checking answer consistency with reference answers". See [reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#qa) for details.
  - 🔥 **[2025.06.19]** Added support for the [BFCL-v3](https://modelscope.cn/datasets/AI-ModelScope/bfcl_v3) benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
@@ -149,6 +154,8 @@ Please scan the QR code below to join our community groups:
  - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
  - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
  - 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
+ <details><summary>More</summary>
+
  - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
  - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
  - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
@@ -158,8 +165,6 @@ Please scan the QR code below to join our community groups:
  - 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
  - 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/en/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
  - 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
- <details><summary>More</summary>
-
  - 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
  - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
  - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
@@ -255,33 +260,31 @@ evalscope eval \

 When using Python code for evaluation, you need to submit the evaluation task using the `run_task` function, passing a `TaskConfig` as a parameter. It can also be a Python dictionary, yaml file path, or json file path, for example:

-**Using
+**Using `TaskConfig`**

 ```python
-from evalscope
+from evalscope import run_task, TaskConfig

-task_cfg =
-
-
-
-
+task_cfg = TaskConfig(
+    model='Qwen/Qwen2.5-0.5B-Instruct',
+    datasets=['gsm8k', 'arc'],
+    limit=5
+)

 run_task(task_cfg=task_cfg)
 ```
-
 <details><summary>More Startup Methods</summary>

-**Using
+**Using Python Dictionary**

 ```python
 from evalscope.run import run_task
-from evalscope.config import TaskConfig

-task_cfg =
-model
-datasets
-limit
-
+task_cfg = {
+    'model': 'Qwen/Qwen2.5-0.5B-Instruct',
+    'datasets': ['gsm8k', 'arc'],
+    'limit': 5
+}

 run_task(task_cfg=task_cfg)
 ```
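The surrounding prose also states that `run_task` accepts a YAML or JSON file path, which the hunk above does not illustrate. Below is a minimal sketch of that variant, assuming the YAML keys mirror the `TaskConfig` fields shown in the added lines; the file name `eval_config.yaml` is illustrative.

```python
# Minimal sketch: passing a config file path to run_task.
# Assumes the YAML keys mirror the TaskConfig fields used above;
# the file name 'eval_config.yaml' is illustrative.
from evalscope import run_task

# Contents of eval_config.yaml (assumed layout):
# model: Qwen/Qwen2.5-0.5B-Instruct
# datasets:
#   - gsm8k
#   - arc
# limit: 5

run_task(task_cfg='eval_config.yaml')
```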
@@ -384,7 +387,7 @@ To create a public link, set `share=True` in `launch()`.

 For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)

-## 🌐 Evaluation of
+## 🌐 Evaluation of Model API

 Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:

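The renamed section relies on three settings: the endpoint address, the API key, and `eval-type=service`. As a rough sketch of the equivalent Python invocation, assuming `TaskConfig` exposes `api_url`, `api_key`, and `eval_type` keywords that mirror those CLI flags; the endpoint address and key below are placeholders.

```python
# Sketch only: evaluating a deployed OpenAI-compatible model API service.
# Assumes TaskConfig accepts api_url / api_key / eval_type keywords that
# mirror the CLI flags mentioned above; URL and key are placeholders.
from evalscope import run_task, TaskConfig

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',  # model name exposed by the served endpoint
    api_url='http://127.0.0.1:8801/v1/chat/completions',  # placeholder address
    api_key='EMPTY',              # placeholder key
    eval_type='service',          # evaluate a model API service
    datasets=['gsm8k'],
    limit=5,
)
run_task(task_cfg=task_cfg)
```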
@@ -435,7 +438,7 @@ evalscope eval \
 Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)


-## Evaluation
+## 🧪 Other Evaluation Backends
 EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
 - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
 - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
@@ -508,6 +511,17 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 </table>
 </a>

+## 📚 Citation
+
+```bibtex
+@misc{evalscope_2024,
+    title={{EvalScope}: Evaluation Framework for Large Models},
+    author={ModelScope Team},
+    year={2024},
+    url={https://github.com/modelscope/evalscope}
+}
+```
+
 ## 🔜 Roadmap
 - [x] Support for better evaluation report visualization
 - [x] Support for mixed evaluations across multiple datasets
@@ -523,6 +537,6 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 - [x] MBPP


-## Star History
+## ⭐ Star History

 [](https://star-history.com/#modelscope/evalscope&Date)
{evalscope-0.17.0 → evalscope-0.17.1}/README.md
RENAMED
@@ -37,16 +37,17 @@
 - [Basic Parameter](#basic-parameter)
 - [Output Results](#output-results)
 - [📈 Visualization of Evaluation Results](#-visualization-of-evaluation-results)
-- [🌐 Evaluation of
+- [🌐 Evaluation of Model API](#-evaluation-of-model-api)
 - [⚙️ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
-- [Parameter](#parameter)
-- [Evaluation
+- [Parameter Description](#parameter-description)
+- [🧪 Other Evaluation Backends](#-other-evaluation-backends)
 - [📈 Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
 - [🖊️ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
-- [
+- [⚔️ Arena Mode](#️-arena-mode)
 - [👷♂️ Contribution](#️-contribution)
+- [📚 Citation](#-citation)
 - [🔜 Roadmap](#-roadmap)
-- [Star History](
+- [⭐ Star History](#-star-history)


 ## 📝 Introduction
@@ -110,7 +111,9 @@ Please scan the QR code below to join our community groups:


 ## 🎉 News
-
+- 🔥 **[2025.07.18]** The model stress testing now supports randomly generating image-text data for multimodal model evaluation. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#id4).
+- 🔥 **[2025.07.16]** Support for [τ-bench](https://github.com/sierra-research/tau-bench) has been added, enabling the evaluation of AI Agent performance and reliability in real-world scenarios involving dynamic user and tool interactions. For usage instructions, please refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/llm.html#bench).
+- 🔥 **[2025.07.14]** Support for "Humanity's Last Exam" ([Humanity's-Last-Exam](https://modelscope.cn/datasets/cais/hle)), a highly challenging evaluation benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/llm.html#humanity-s-last-exam).
 - 🔥 **[2025.07.03]** Refactored Arena Mode: now supports custom model battles, outputs a model leaderboard, and provides battle result visualization. See [reference](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html) for details.
 - 🔥 **[2025.06.28]** Optimized custom dataset evaluation: now supports evaluation without reference answers. Enhanced LLM judge usage, with built-in modes for "scoring directly without reference answers" and "checking answer consistency with reference answers". See [reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#qa) for details.
 - 🔥 **[2025.06.19]** Added support for the [BFCL-v3](https://modelscope.cn/datasets/AI-ModelScope/bfcl_v3) benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
@@ -122,6 +125,8 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
 - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
 - 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
+<details><summary>More</summary>
+
 - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
 - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
 - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
@@ -131,8 +136,6 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
 - 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/en/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
 - 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
-<details><summary>More</summary>
-
 - 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
 - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
@@ -228,33 +231,31 @@ evalscope eval \

 When using Python code for evaluation, you need to submit the evaluation task using the `run_task` function, passing a `TaskConfig` as a parameter. It can also be a Python dictionary, yaml file path, or json file path, for example:

-**Using
+**Using `TaskConfig`**

 ```python
-from evalscope
+from evalscope import run_task, TaskConfig

-task_cfg =
-
-
-
-
+task_cfg = TaskConfig(
+    model='Qwen/Qwen2.5-0.5B-Instruct',
+    datasets=['gsm8k', 'arc'],
+    limit=5
+)

 run_task(task_cfg=task_cfg)
 ```
-
 <details><summary>More Startup Methods</summary>

-**Using
+**Using Python Dictionary**

 ```python
 from evalscope.run import run_task
-from evalscope.config import TaskConfig

-task_cfg =
-model
-datasets
-limit
-
+task_cfg = {
+    'model': 'Qwen/Qwen2.5-0.5B-Instruct',
+    'datasets': ['gsm8k', 'arc'],
+    'limit': 5
+}

 run_task(task_cfg=task_cfg)
 ```
@@ -357,7 +358,7 @@ To create a public link, set `share=True` in `launch()`.

 For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)

-## 🌐 Evaluation of
+## 🌐 Evaluation of Model API

 Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:

@@ -408,7 +409,7 @@ evalscope eval \
 Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)


-## Evaluation
+## 🧪 Other Evaluation Backends
 EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
 - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
 - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
@@ -481,6 +482,17 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 </table>
 </a>

+## 📚 Citation
+
+```bibtex
+@misc{evalscope_2024,
+    title={{EvalScope}: Evaluation Framework for Large Models},
+    author={ModelScope Team},
+    year={2024},
+    url={https://github.com/modelscope/evalscope}
+}
+```
+
 ## 🔜 Roadmap
 - [x] Support for better evaluation report visualization
 - [x] Support for mixed evaluations across multiple datasets
@@ -496,6 +508,6 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 - [x] MBPP


-## Star History
+## ⭐ Star History

 [](https://star-history.com/#modelscope/evalscope&Date)
{evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bfcl/bfcl_adapter.py
RENAMED
@@ -35,7 +35,7 @@ SUBJECT_MAPPING = {
 @Benchmark.register(
     name='bfcl_v3',
     pretty_name='BFCL-v3',
-    tags=['Agent'],
+    tags=['Agent', 'Function Calling'],
     description=
     'Berkeley Function Calling Leaderboard (BFCL), the **first comprehensive and executable function call evaluation** '
     'dedicated to assessing Large Language Models\' (LLMs) ability to invoke functions. Unlike previous evaluations, '
{evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/data_adapter.py
RENAMED
@@ -168,6 +168,11 @@ class DataAdapter(ABC):
         If you want to support local dataset, please rewrite this method in xxx_data_adapter.
         Use modelscope.msdatasets.MsDataset.load to load the dataset from local by default.
         """
+        # remove dataset_infos.json file if exists, since MsDataset will occur an error if it exists.
+        dataset_infos_path = os.path.join(dataset_name_or_path, 'dataset_infos.json')
+        if os.path.exists(dataset_infos_path):
+            logger.info(f'Removing dataset_infos.json file at {dataset_infos_path} to avoid MsDataset errors.')
+            os.remove(dataset_infos_path)
         return self.load_from_hub(dataset_name_or_path, subset_list, None, **kwargs)

     def load_with_snapshot(self,
@@ -382,7 +387,7 @@ class DataAdapter(ABC):
         pass

     def gen_prompt_data(self,
-                        prompt: str,
+                        prompt: str = '',
                         system_prompt: Optional[str] = None,
                         choices: Optional[List[str]] = None,
                         index: Optional[Union[int, str]] = None,
@@ -413,7 +418,8 @@ class DataAdapter(ABC):
             system_prompt=system_prompt or self.system_prompt,
             index=index or 0,
             id=id,
-            messages=messages
+            messages=messages,
+            extra_data=kwargs.get('extra_data', None))
         return prompt_data.to_dict()

     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
@@ -477,7 +483,6 @@ class DataAdapter(ABC):
         """
         return result

-    @abstractmethod
     def match(self, gold: Any, pred: Any) -> Any:
         """
         Match the gold answer and the predicted answer.
@@ -491,7 +496,7 @@ class DataAdapter(ABC):
         Returns:
             The match result. Usually a score (float) for chat/multiple-choice-questions.
         """
-
+        return 1.0 if gold == pred else 0.0

     def llm_match(self, gold: Any, pred: Any, judge: Optional[LLMJudge] = None, **kwargs) -> float:
         """
{evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py
RENAMED
@@ -17,7 +17,8 @@ logger = get_logger()
 @Benchmark.register(
     name='general_mcq',
     pretty_name='General-MCQ',
-    description='A general multiple-choice question answering dataset.'
+    description='A general multiple-choice question answering dataset for custom evaluation. '
+    'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/zh-cn/latest/advanced_guides/custom_dataset/llm.html#mcq).',
     tags=['MCQ', 'Custom'],
     dataset_id='general_mcq',
     model_adapter=OutputType.GENERATION,
{evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/general_qa/general_qa_adapter.py
RENAMED
@@ -14,7 +14,8 @@ logger = get_logger()
 @Benchmark.register(
     name='general_qa',
     pretty_name='General-QA',
-    description='
+    description='A general question answering dataset for custom evaluation. '
+    'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/zh-cn/latest/advanced_guides/custom_dataset/llm.html#qa).',  # noqa: E501
     tags=['QA', 'Custom'],
     dataset_id='general_qa',
     subset_list=['default'],
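The updated description points custom QA users to the linked User Guide. Below is a rough sketch of wiring `general_qa` to a local dataset; the `dataset_args` / `local_path` / `subset_list` names follow my reading of that guide and should be treated as assumptions to verify there.

```python
# Rough sketch of pointing the general_qa benchmark at a local custom dataset.
# The dataset_args / local_path / subset_list names are assumptions drawn from
# the linked User Guide; verify the exact schema there before relying on this.
from evalscope import run_task, TaskConfig

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',
    datasets=['general_qa'],
    dataset_args={
        'general_qa': {
            'local_path': 'custom_eval/text/qa',  # illustrative folder holding the JSONL files
            'subset_list': ['example'],           # assumed to match JSONL file names without extension
        }
    },
)
run_task(task_cfg=task_cfg)
```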
evalscope-0.17.1/evalscope/benchmarks/hle/hle_adapter.py
ADDED
@@ -0,0 +1,118 @@
+import re
+from collections import defaultdict
+from typing import Any, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import DEFAULT_PROMPT_TEMPLATE, LLMJudge, exact_match, mean
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'Biology/Medicine',
+    'Chemistry',
+    'Computer Science/AI',
+    'Engineering',
+    'Humanities/Social Science',
+    'Math',
+    'Physics',
+    'Other',
+]
+
+
+@Benchmark.register(
+    name='hle',
+    pretty_name="Humanity's-Last-Exam",
+    tags=['Knowledge', 'QA'],
+    description=
+    'Humanity\'s Last Exam (HLE) is a language model benchmark consisting of 2,500 questions across a broad range of subjects. It was created jointly by the Center for AI Safety and Scale AI. The benchmark classifies the questions into the following broad subjects: mathematics (41%), physics (9%), biology/medicine (11%), humanities/social science (9%), computer science/artificial intelligence (10%), engineering (4%), chemistry (7%), and other (9%). Around 14% of the questions require the ability to understand both text and images, i.e., multi-modality. 24% of the questions are multiple-choice; the rest are short-answer, exact-match questions.',  # noqa: E501
+    dataset_id='cais/hle',
+    subset_list=SUBSET_LIST,
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template='{query}\n\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+)
+class HLEAdapter(DataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.llm_as_a_judge = True
+
+    def load(self, **kwargs):
+        kwargs['subset_list'] = ['default']
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='category', format='{}')
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        # remove image preview
+        input_d.pop('image_preview', None)
+        input_d.pop('rationale_image', None)
+        # generate prompt
+        question = input_d['question']
+        prompt = self.prompt_template.format(query=question)
+        image = input_d.get('image', None)
+        # build messages for multi-modal input
+        messages = []
+        if self.system_prompt:
+            messages.append({'role': 'system', 'content': self.system_prompt})
+        if image:
+            messages.append({
+                'role':
+                'user',
+                'content': [{
+                    'type': 'text',
+                    'text': prompt
+                }, {
+                    'type': 'image_url',
+                    'image_url': {
+                        'url': image
+                    }
+                }]
+            })
+        else:
+            messages.append({'role': 'user', 'content': prompt})
+        return self.gen_prompt_data(prompt='', messages=messages)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        return input_d['answer']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
+        # Extract the answer from the model output \boxed{answer}
+        match = re.search(r'\\boxed{([^}]*)}', result)
+        if match:
+            return match.group(1).strip()
+        else:
+            logger.warning(f'No answer found in the model output: {result}')
+            return ''
+
+    def llm_parse_pred_result(self, result, raw_input_d=None, **kwargs) -> str:
+        return result.strip()
+
+    def match(self, gold: str, pred: str) -> dict:
+        # simple match
+        return {
+            'AverageAccuracy': 1.0 if exact_match(gold, pred) else 0.0,
+        }
+
+    def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> dict:
+        raw_input = kwargs.get('raw_input', None)
+        question = raw_input['question']
+        # get grading response
+        prompt = judge.build_prompt(pred, gold, question)
+        judge_response = judge(prompt)
+        score = judge.get_score(judge_response)
+        return {
+            'AverageAccuracy': score,
+            'response': judge_response,
+        }
+
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
+        # zip dict answers
+        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
+
+        return super().compute_metric(res_dict, **kwargs)
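The new adapter registers the benchmark under the name `hle` and sets `llm_as_a_judge = True`, so grading normally runs through an LLM judge. Below is a sketch of invoking it; `judge_model_args` is an assumed parameter name for configuring the judge model, and the endpoint details are placeholders, so consult the EvalScope parameter docs for the authoritative spelling.

```python
# Sketch: running the newly added Humanity's Last Exam benchmark.
# The dataset name 'hle' comes from the registration above; judge_model_args
# is an assumed keyword for configuring the grading model (placeholder values).
from evalscope import run_task, TaskConfig

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',
    datasets=['hle'],          # registered name of Humanity's Last Exam
    limit=10,                  # small sample for a quick smoke test
    judge_model_args={         # assumed parameter name for the LLM judge
        'model_id': 'qwen2.5-72b-instruct',
        'api_url': 'http://127.0.0.1:8801/v1/chat/completions',
        'api_key': 'EMPTY',
    },
)
run_task(task_cfg=task_cfg)
```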
{evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/humaneval/humaneval_adapter.py
RENAMED
@@ -22,7 +22,8 @@ logger = get_logger()
     few_shot_num=0,
     train_split=None,
     eval_split='test',
-    prompt_template=
+    prompt_template=
+    'Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{query}',  # noqa: E501
     extra_params={
         'num_workers': 4,
         'timeout': 4
@@ -76,26 +77,9 @@ class HumanevalAdapter(DataAdapter):

     @classmethod
     def _postprocess(cls, text: str) -> str:
-
-
-
-            text = text.split('```')[1]  # fall back to default strategy
-        else:
-            text = blocks[0]  # fetch the first code block
-            if not text.startswith('\n'):  # in case starting with ```python
-                text = text[max(text.find('\n') + 1, 0):]
-        if text.strip().startswith('from') or text.strip().startswith('import'):
-            def_idx = text.find('def')
-            if def_idx != -1:
-                text = text[max(text.find('\n', def_idx) + 1, 0):]
-        text = text.split('\n\n')[0]
-        if text.strip().startswith('def'):
-            text = '\n'.join(text.split('\n')[1:])
-        if not text.startswith(' '):
-            if text.startswith(' '):
-                text = ' ' + text.lstrip()
-            else:
-                text = '\n'.join([' ' + line for line in text.split('\n')])
+        blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
+        if len(blocks) >= 1:
+            text = blocks[0]
         return text

     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
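The rewritten `_postprocess` drops the old multi-step cleanup heuristics in favour of a single fenced-code-block extraction. A small standalone check of what that regex captures is shown below; the sample completion is illustrative.

```python
# Standalone check of the new code-block extraction used by _postprocess:
# take the body of the first fenced code block, if any (sample text is illustrative).
import re

sample = (
    "Here is the implementation:\n"
    "```python\n"
    "def add(a, b):\n"
    "    return a + b\n"
    "```\n"
    "Hope this helps."
)

blocks = re.findall(r'```\w*\n(.*?)```', sample, re.DOTALL)
text = blocks[0] if blocks else sample  # fall back to the raw completion
print(text)
# def add(a, b):
#     return a + b
```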
{evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/mmlu/mmlu_adapter.py
RENAMED
@@ -144,7 +144,7 @@ SUBJECT_MAPPING = {
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
-    few_shot_num=
+    few_shot_num=0,
     train_split='train',
     eval_split='test',
     prompt_template=