evalscope 0.15.1__tar.gz → 0.16.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged as potentially problematic.
- {evalscope-0.15.1/evalscope.egg-info → evalscope-0.16.0}/PKG-INFO +45 -21
- {evalscope-0.15.1 → evalscope-0.16.0}/README.md +40 -18
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/arguments.py +10 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/utils/llm.py +1 -1
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/data_adapter.py +4 -2
- evalscope-0.16.0/evalscope/benchmarks/drop/drop_adapter.py +133 -0
- evalscope-0.16.0/evalscope/benchmarks/drop/utils.py +59 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
- evalscope-0.16.0/evalscope/benchmarks/tool_bench/tool_bench_adapter.py +67 -0
- evalscope-0.16.0/evalscope/benchmarks/tool_bench/utils.py +202 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/utils.py +3 -2
- evalscope-0.16.0/evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/collections/evaluator.py +76 -26
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/config.py +46 -15
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/evaluator/evaluator.py +43 -15
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/llm_judge.py +3 -3
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/rouge_metric.py +11 -13
- {evalscope-0.15.1/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models → evalscope-0.16.0/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model}/__init__.py +0 -0
- {evalscope-0.15.1/evalscope/third_party/thinkbench/tools → evalscope-0.16.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/models/adapters/chat_adapter.py +51 -34
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/models/adapters/server_adapter.py +15 -19
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/arguments.py +14 -5
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/benchmark.py +0 -6
- evalscope-0.16.0/evalscope/perf/main.py +96 -0
- evalscope-0.16.0/evalscope/perf/utils/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/utils/benchmark_util.py +33 -15
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/utils/db_util.py +25 -15
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/utils/log_utils.py +1 -1
- evalscope-0.16.0/evalscope/perf/utils/rich_display.py +186 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/report/app.py +47 -34
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/report/utils.py +1 -1
- evalscope-0.16.0/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope-0.16.0/evalscope/utils/deprecation_utils.py +42 -0
- evalscope-0.16.0/evalscope/version.py +4 -0
- {evalscope-0.15.1 → evalscope-0.16.0/evalscope.egg-info}/PKG-INFO +45 -21
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope.egg-info/SOURCES.txt +10 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope.egg-info/requires.txt +4 -2
- {evalscope-0.15.1 → evalscope-0.16.0}/requirements/perf.txt +2 -1
- {evalscope-0.15.1 → evalscope-0.16.0}/tests/cli/test_all.py +3 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/tests/cli/test_collection.py +2 -1
- {evalscope-0.15.1 → evalscope-0.16.0}/tests/cli/test_run.py +28 -12
- {evalscope-0.15.1 → evalscope-0.16.0}/tests/perf/test_perf.py +23 -0
- evalscope-0.16.0/tests/rag/__init__.py +0 -0
- evalscope-0.15.1/evalscope/perf/main.py +0 -46
- evalscope-0.15.1/evalscope/version.py +0 -4
- {evalscope-0.15.1 → evalscope-0.16.0}/LICENSE +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/MANIFEST.in +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/base.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/opencompass/backend_manager.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/backend_manager.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/utils/clip.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/utils/tools.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/aigc/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/aigc/t2i/base.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/aime/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/aime/aime24_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/aime/aime25_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/arena_hard/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/arena_hard/utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/benchmark.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/data_collection/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/general_mcq → evalscope-0.16.0/evalscope/benchmarks/drop}/__init__.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/gpqa → evalscope-0.16.0/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/ifeval → evalscope-0.16.0/evalscope/benchmarks/gpqa}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/gpqa/gpqa_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/iquiz → evalscope-0.16.0/evalscope/benchmarks/ifeval}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/ifeval/ifeval_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/ifeval/instructions.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/ifeval/utils.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/live_code_bench → evalscope-0.16.0/evalscope/benchmarks/iquiz}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/iquiz/iquiz_adapter.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/maritime_bench → evalscope-0.16.0/evalscope/benchmarks/live_code_bench}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/live_code_bench/load_utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/live_code_bench/testing_util.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/math_500 → evalscope-0.16.0/evalscope/benchmarks/maritime_bench}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/mmlu_pro → evalscope-0.16.0/evalscope/benchmarks/math_500}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/math_500/math_500_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/mmlu_redux → evalscope-0.16.0/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/musr → evalscope-0.16.0/evalscope/benchmarks/mmlu_redux}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/process_bench → evalscope-0.16.0/evalscope/benchmarks/musr}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/musr/musr_adapter.py +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/simple_qa → evalscope-0.16.0/evalscope/benchmarks/process_bench}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/process_bench/critique_template.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/process_bench/process_bench_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/race/race.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/race/race_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/race/samples.jsonl +0 -0
- {evalscope-0.15.1/evalscope/benchmarks/super_gpqa → evalscope-0.16.0/evalscope/benchmarks/simple_qa}/__init__.py +0 -0
- {evalscope-0.15.1/evalscope/metrics/t2v_metrics/models → evalscope-0.16.0/evalscope/benchmarks/super_gpqa}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/super_gpqa/utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -0
- {evalscope-0.15.1/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model → evalscope-0.16.0/evalscope/benchmarks/tool_bench}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
- {evalscope-0.15.1/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward → evalscope-0.16.0/evalscope/benchmarks/winogrande}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/cli/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/cli/base.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/cli/cli.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/cli/start_app.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/cli/start_eval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/cli/start_server.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/collections/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/collections/sampler.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/collections/schema.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/constants.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/evaluator/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/evaluator/rating_eval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/evaluator/reviewer/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/math_parser.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/metrics.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/named_metrics.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/clipscore.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/constants.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/itmscore.py +0 -0
- {evalscope-0.15.1/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5 → evalscope-0.16.0/evalscope/metrics/t2v_metrics/models}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +0 -0
- {evalscope-0.15.1/evalscope/perf → evalscope-0.16.0/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +0 -0
- {evalscope-0.15.1/evalscope/perf/utils → evalscope-0.16.0/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/score.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/metrics/t2v_metrics/vqascore.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/models/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/models/adapters/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/models/adapters/base_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/models/adapters/choice_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/models/adapters/custom_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/models/adapters/t2i_adapter.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/models/custom/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/models/custom/custom_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/models/custom/dummy_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/models/local_model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/models/model.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/models/register.py +0 -0
- {evalscope-0.15.1/tests/rag → evalscope-0.16.0/evalscope/perf}/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/http_client.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/plugin/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/plugin/api/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/plugin/api/base.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/plugin/api/custom_api.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/plugin/api/openai_api.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/plugin/datasets/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/plugin/datasets/base.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/plugin/datasets/custom.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/plugin/datasets/openqa.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/plugin/datasets/random_dataset.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/plugin/registry.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/utils/analysis_result.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/utils/handler.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/perf/utils/local_server.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/registry/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/registry/config/cfg_arena.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/registry/config/cfg_single.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/registry/data/question.jsonl +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/registry/tasks/arc.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/registry/tasks/bbh.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/registry/tasks/ceval.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/registry/tasks/cmmlu.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/registry/tasks/general_qa.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/registry/tasks/gsm8k.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/registry/tasks/mmlu.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/report/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/report/app_arguments.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/report/combinator.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/report/generator.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/run.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/run_arena.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/summarizer.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/longbench_write/README.md +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/longbench_write/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/longbench_write/default_task.json +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/longbench_write/eval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/longbench_write/infer.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/longbench_write/utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/thinkbench/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/thinkbench/eval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/thinkbench/infer.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/toolbench_static/README.md +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/toolbench_static/config_default.json +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/utils/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/utils/arena_utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/utils/chat_service.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/utils/completion_parsers.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/utils/filters.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/utils/import_utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/utils/io_utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/utils/logger.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/utils/model_utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope/utils/utils.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/requirements/aigc.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/requirements/app.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/requirements/docs.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/requirements/framework.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/requirements/opencompass.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/requirements/rag.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/requirements/vlmeval.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/requirements.txt +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/setup.cfg +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/setup.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/tests/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/tests/aigc/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/tests/aigc/test_t2i.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/tests/cli/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/tests/perf/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/tests/rag/test_clip_benchmark.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/tests/rag/test_mteb.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/tests/rag/test_ragas.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/tests/swift/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/tests/swift/test_run_swift_eval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/tests/swift/test_run_swift_vlm_eval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/tests/test_run_all.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/tests/vlm/__init__.py +0 -0
- {evalscope-0.15.1 → evalscope-0.16.0}/tests/vlm/test_vlmeval.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.15.1
+Version: 0.16.0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
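The version bump above is mirrored by the rewritten evalscope/version.py in the file list. A quick, hedged way to confirm which version is installed; the `__version__` attribute is an assumption based on the conventional contents of a version.py module:

```shell
pip show evalscope | grep -i '^version'
# Assumed: evalscope re-exports __version__ from evalscope/version.py
python -c "import evalscope; print(evalscope.__version__)"
```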
@@ -60,9 +60,10 @@ Provides-Extra: perf
 Requires-Dist: aiohttp; extra == "perf"
 Requires-Dist: fastapi; extra == "perf"
 Requires-Dist: numpy; extra == "perf"
+Requires-Dist: rich; extra == "perf"
 Requires-Dist: sse_starlette; extra == "perf"
 Requires-Dist: transformers; extra == "perf"
-Requires-Dist: …
+Requires-Dist: uvicorn; extra == "perf"
 Provides-Extra: app
 Requires-Dist: gradio==5.4.0; extra == "app"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
@@ -113,9 +114,10 @@ Requires-Dist: webdataset>0.2.0; extra == "all"
 Requires-Dist: aiohttp; extra == "all"
 Requires-Dist: fastapi; extra == "all"
 Requires-Dist: numpy; extra == "all"
+Requires-Dist: rich; extra == "all"
 Requires-Dist: sse_starlette; extra == "all"
 Requires-Dist: transformers; extra == "all"
-Requires-Dist: …
+Requires-Dist: uvicorn; extra == "all"
 Requires-Dist: gradio==5.4.0; extra == "all"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
 Requires-Dist: diffusers; extra == "all"
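`rich` is added to both the `perf` and `all` extras (matching the new entry in requirements/perf.txt), so installing either extra on 0.16.0 pulls it in. A minimal sketch using the standard extras syntax:

```shell
# Install the perf extra (aiohttp, fastapi, numpy, rich, sse_starlette, transformers, uvicorn)
pip install 'evalscope[perf]==0.16.0'

# Or install every optional dependency group
pip install 'evalscope[all]==0.16.0'
```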
@@ -177,9 +179,23 @@ Requires-Dist: opencv-python; extra == "all"
 
 ## 📝 Introduction
 
-EvalScope is [ModelScope](https://modelscope.cn/) …
+EvalScope is a comprehensive model evaluation and performance benchmarking framework meticulously crafted by the [ModelScope Community](https://modelscope.cn/), offering a one-stop solution for your model assessment needs. Regardless of the type of model you are developing, EvalScope is equipped to cater to your requirements:
 
-…
+- 🧠 Large Language Models
+- 🎨 Multimodal Models
+- 🔍 Embedding Models
+- 🏆 Reranker Models
+- 🖼️ CLIP Models
+- 🎭 AIGC Models (Image-to-Text/Video)
+- ...and more!
+
+EvalScope is not merely an evaluation tool; it is a valuable ally in your model optimization journey:
+
+- 🏅 Equipped with multiple industry-recognized benchmarks and evaluation metrics: MMLU, CMMLU, C-Eval, GSM8K, etc.
+- 📊 Model inference performance stress testing: Ensuring your model excels in real-world applications.
+- 🚀 Seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, enabling one-click evaluations and providing full-chain support from training to assessment for your model development.
+
+Below is the overall architecture diagram of EvalScope:
 
 <p align="center">
   <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
@@ -214,6 +230,8 @@ Please scan the QR code below to join our community groups:
 
 ## 🎉 News
 
+- 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
+- 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
 - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
 - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
 - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
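The 2025.05.16 entry corresponds to the rewritten evalscope/perf/main.py and the new evalscope/perf/utils/rich_display.py in the file list above. A hedged sketch of a multi-concurrency stress test against an OpenAI-compatible endpoint; the multi-value form of `--parallel`/`--number` is an assumption drawn from the release note and should be checked against the linked quick-start guide:

```shell
# Assumed: sweep several concurrency levels in one run and print a consolidated report
evalscope perf \
  --url http://127.0.0.1:8000/v1/chat/completions \
  --api openai \
  --model qwen2.5 \
  --dataset openqa \
  --parallel 1 5 10 \
  --number 20 100 200
```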
@@ -479,26 +497,27 @@ For more customized evaluations, such as customizing model parameters or dataset
 
 ```shell
 evalscope eval \
- --model Qwen/…
- --model-args revision…
- --generation-config do_sample…
+ --model Qwen/Qwen3-0.6B \
+ --model-args '{"revision": "master", "precision": "torch.float16", "device_map": "auto"}' \
+ --generation-config '{"do_sample":true,"temperature":0.6,"max_new_tokens":512,"chat_template_kwargs":{"enable_thinking": false}}' \
  --dataset-args '{"gsm8k": {"few_shot_num": 0, "few_shot_random": false}}' \
  --datasets gsm8k \
  --limit 10
 ```
 
-### Parameter…
-- `--model-args`: Model loading parameters, …
-  - `revision`: Model version
-  - `precision`: Model precision
-  - `device_map`: …
-- `--generation-config`: Generation parameters, …
-  - `do_sample`: Whether to use sampling
-  - `…
-  - `max_new_tokens`: Maximum length of …
-- …
+### Parameter Description
+- `--model-args`: Model loading parameters, passed as a JSON string:
+  - `revision`: Model version
+  - `precision`: Model precision
+  - `device_map`: Device allocation for the model
+- `--generation-config`: Generation parameters, passed as a JSON string and parsed as a dictionary:
+  - `do_sample`: Whether to use sampling
+  - `temperature`: Generation temperature
+  - `max_new_tokens`: Maximum length of generated tokens
+  - `chat_template_kwargs`: Model inference template parameters
+- `--dataset-args`: Settings for the evaluation dataset, passed as a JSON string where the key is the dataset name and the value is the parameters. Note that these need to correspond one-to-one with the values in the `--datasets` parameter:
   - `few_shot_num`: Number of few-shot examples
-  - `few_shot_random`: Whether to randomly sample few-shot data
+  - `few_shot_random`: Whether to randomly sample few-shot data; if not set, defaults to `true`
 
 Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)
 
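The same CLI presumably extends to the benchmarks added in this release (DROP, Winogrande, ToolBench-Static). A hedged sketch; the dataset keys `drop` and `winogrande` are assumptions inferred from the new adapter module names, not verified registry names:

```shell
# Assumed dataset keys, taken from drop_adapter.py and winogrande_adapter.py in the file list
evalscope eval \
 --model Qwen/Qwen3-0.6B \
 --datasets drop winogrande \
 --limit 10
```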
@@ -517,6 +536,11 @@ A stress testing tool focused on large language models, which can be customized
 
 Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
 
+**Output example**
+
+…
+
+
 **Supports wandb for recording results**
 
 …
@@ -565,7 +589,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 </a>
 
 ## 🔜 Roadmap
-- [ ] Support for better evaluation report visualization
+- [x] Support for better evaluation report visualization
 - [x] Support for mixed evaluations across multiple datasets
 - [x] RAG evaluation
 - [x] VLM evaluation

@@ -575,7 +599,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 - [x] Multi-modal evaluation
 - [ ] Benchmarks
   - [ ] GAIA
-  - [ ] GPQA
+  - [x] GPQA
   - [x] MBPP
 
 
README.md

@@ -51,9 +51,23 @@
 
 ## 📝 Introduction
 
-EvalScope is [ModelScope](https://modelscope.cn/) …
+EvalScope is a comprehensive model evaluation and performance benchmarking framework meticulously crafted by the [ModelScope Community](https://modelscope.cn/), offering a one-stop solution for your model assessment needs. Regardless of the type of model you are developing, EvalScope is equipped to cater to your requirements:
 
-…
+- 🧠 Large Language Models
+- 🎨 Multimodal Models
+- 🔍 Embedding Models
+- 🏆 Reranker Models
+- 🖼️ CLIP Models
+- 🎭 AIGC Models (Image-to-Text/Video)
+- ...and more!
+
+EvalScope is not merely an evaluation tool; it is a valuable ally in your model optimization journey:
+
+- 🏅 Equipped with multiple industry-recognized benchmarks and evaluation metrics: MMLU, CMMLU, C-Eval, GSM8K, etc.
+- 📊 Model inference performance stress testing: Ensuring your model excels in real-world applications.
+- 🚀 Seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, enabling one-click evaluations and providing full-chain support from training to assessment for your model development.
+
+Below is the overall architecture diagram of EvalScope:
 
 <p align="center">
   <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
@@ -88,6 +102,8 @@ Please scan the QR code below to join our community groups:
 
 ## 🎉 News
 
+- 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
+- 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate models' tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
 - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
 - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
 - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
@@ -353,26 +369,27 @@ For more customized evaluations, such as customizing model parameters or dataset
 
 ```shell
 evalscope eval \
---model Qwen/
---model-args revision
---generation-config do_sample
+--model Qwen/Qwen3-0.6B \
+--model-args '{"revision": "master", "precision": "torch.float16", "device_map": "auto"}' \
+--generation-config '{"do_sample":true,"temperature":0.6,"max_new_tokens":512,"chat_template_kwargs":{"enable_thinking": false}}' \
 --dataset-args '{"gsm8k": {"few_shot_num": 0, "few_shot_random": false}}' \
 --datasets gsm8k \
 --limit 10
 ```
 
-### Parameter
-- `--model-args`: Model loading parameters,
-  - `revision`: Model version
-  - `precision`: Model precision
-  - `device_map`:
-- `--generation-config`: Generation parameters,
-  - `do_sample`: Whether to use sampling
-  - `
-  - `max_new_tokens`: Maximum length of
-  -
+### Parameter Description
+- `--model-args`: Model loading parameters, passed as a JSON string:
+  - `revision`: Model version
+  - `precision`: Model precision
+  - `device_map`: Device allocation for the model
+- `--generation-config`: Generation parameters, passed as a JSON string and parsed as a dictionary:
+  - `do_sample`: Whether to use sampling
+  - `temperature`: Generation temperature
+  - `max_new_tokens`: Maximum length of generated tokens
+  - `chat_template_kwargs`: Model inference template parameters
+- `--dataset-args`: Settings for the evaluation dataset, passed as a JSON string where the key is the dataset name and the value is that dataset's parameters. Note that these need to correspond one-to-one with the values in the `--datasets` parameter:
   - `few_shot_num`: Number of few-shot examples
-  - `few_shot_random`: Whether to randomly sample few-shot data
+  - `few_shot_random`: Whether to randomly sample few-shot data; if not set, defaults to `true`
 
 Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)
 
@@ -391,6 +408,11 @@ A stress testing tool focused on large language models, which can be customized
 
 Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
 
+**Output example**
+
+
+
+
 **Supports wandb for recording results**
 
 
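The stress-test report above can also be produced programmatically. A minimal sketch under stated assumptions: the endpoint URL and model name are placeholders, and the dict-based `run_perf_benchmark(...)` call is an assumption to be checked against the User Guide linked above.

```python
# Hypothetical local endpoint and model name; flags mirror the documented
# stress-test parameters (url, api, model, parallel, number).
from evalscope.perf.main import run_perf_benchmark

task_cfg = {
    'url': 'http://127.0.0.1:8000/v1/chat/completions',  # endpoint under test
    'api': 'openai',       # OpenAI-compatible API format
    'model': 'qwen2.5',    # model name served by the endpoint
    'parallel': 5,         # concurrency level
    'number': 20,          # total number of requests
}
run_perf_benchmark(task_cfg)
```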
@@ -439,7 +461,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 </a>
 
 ## 🔜 Roadmap
-- [
+- [x] Support for better evaluation report visualization
 - [x] Support for mixed evaluations across multiple datasets
 - [x] RAG evaluation
 - [x] VLM evaluation
@@ -449,7 +471,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
 - [x] Multi-modal evaluation
 - [ ] Benchmarks
   - [ ] GAIA
-  - [
+  - [x] GPQA
   - [x] MBPP
 
 
{evalscope-0.15.1 → evalscope-0.16.0}/evalscope/arguments.py
RENAMED
@@ -9,6 +9,15 @@ class ParseStrArgsAction(argparse.Action):
     def __call__(self, parser, namespace, values, option_string=None):
         assert isinstance(values, str), 'args should be a string.'
 
+        # try json load first
+        try:
+            arg_dict = json.loads(values)
+            setattr(namespace, self.dest, arg_dict)
+            return
+        except (json.JSONDecodeError, ValueError):
+            pass
+
+        # If JSON load fails, fall back to parsing as key=value pairs
         arg_dict = {}
         for arg in values.strip().split(','):
             key, value = map(str.strip, arg.split('=', 1)) # Use maxsplit=1 to handle multiple '='
@@ -67,6 +76,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--work-dir', type=str, help='The root cache dir.')
 
     # Debug and runtime mode arguments
+    parser.add_argument('--ignore-errors', action='store_true', default=False, help='Ignore errors during evaluation.')
     parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.') # noqa: E501
     parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
     parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
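The new JSON-first branch means flags such as `--model-args` now accept either a JSON object or the legacy `key=value` list. A standalone sketch of that behaviour, re-implemented here purely for illustration (the real action also normalizes value types on the fallback path, which is omitted):

```python
# Standalone illustration of the parsing added above; not imported from evalscope.
import json


def parse_str_args(values: str) -> dict:
    # New in 0.16.0: a JSON string is accepted directly.
    try:
        return json.loads(values)
    except (json.JSONDecodeError, ValueError):
        pass
    # Fallback: legacy comma-separated key=value pairs (values stay as strings here).
    arg_dict = {}
    for arg in values.strip().split(','):
        key, value = map(str.strip, arg.split('=', 1))
        arg_dict[key] = value
    return arg_dict


print(parse_str_args('{"revision": "master", "precision": "torch.float16"}'))
# -> {'revision': 'master', 'precision': 'torch.float16'}
print(parse_str_args('revision=master,precision=torch.float16'))
# -> {'revision': 'master', 'precision': 'torch.float16'}
```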
{evalscope-0.15.1 → evalscope-0.16.0}/evalscope/backend/rag_eval/utils/llm.py
RENAMED
@@ -52,7 +52,7 @@ class LocalLLM(BaseLLM):
         """Run the LLM on the given input."""
         infer_cfg = {'stop': stop}
 
-        response, _ = self.model.
+        response, _ = self.model.predict([{'data': [prompt]}], infer_cfg=infer_cfg)
         return response[0][0]
 
     @property
{evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py
RENAMED
@@ -96,12 +96,6 @@ class AlpacaEvalAdapter(DataAdapter):
         return None
 
     def compute_metric(self, review_res_list: List[bool], **kwargs) -> List[dict]:
-        """
-        compute weighted mean of the bleu score of all samples
-
-        Args:
-            review_res_list: [{'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0}, ...]
-        """
         # zip dict answers
         res_list = [res for res in review_res_list if res is not None]
 
{evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py
RENAMED
@@ -148,6 +148,7 @@ class ChineseSimpleQAAdapter(DataAdapter):
             'is_correct': 1 if res == 'A' else 0,
             'is_incorrect': 1 if res == 'B' else 0,
             'is_not_attempted': 1 if res == 'C' else 0,
+            'judge_response': grading_response,
         }
 
     def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
{evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/data_adapter.py
RENAMED
@@ -258,7 +258,7 @@
             avg_res: Dict[str, List[float]]
 
         """
-        if isinstance(review_res_list[0], list):
+        if len(review_res_list) > 0 and isinstance(review_res_list[0], list):
             review_res_list = [item for sublist in review_res_list for item in sublist]
 
         items = defaultdict(list)
@@ -322,6 +322,7 @@
                         choices: Optional[List[str]] = None,
                         index: Optional[Union[int, str]] = None,
                         id: Optional[Union[int, str]] = None,
+                        messages: Optional[List[dict]] = None,
                         **kwargs) -> dict:
         data = [prompt] if not isinstance(prompt, list) else prompt
         prompt_data = PromptData(
@@ -329,7 +330,8 @@
             multi_choices=choices or self.choices,
             system_prompt=system_prompt or self.system_prompt,
             index=index or 0,
-            id=id
+            id=id,
+            messages=messages)
         return prompt_data.to_dict()
 
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
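With the new `messages` field, an adapter can hand a prepared chat history straight to the model instead of a flat prompt string, as the ToolBench adapter further below does. An illustrative, incomplete sketch (the adapter name and input fields are hypothetical, and the remaining abstract methods of `DataAdapter` are omitted):

```python
# Hypothetical adapter showing the new messages pass-through; not part of the package.
from evalscope.benchmarks import DataAdapter


class MyChatAdapter(DataAdapter):

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        messages = [
            {'role': 'system', 'content': 'You are a helpful assistant.'},
            {'role': 'user', 'content': input_d['question']},  # 'question' is an assumed field
        ]
        # prompt stays empty; the prepared chat messages are forwarded as-is
        return self.gen_prompt_data(prompt='', messages=messages)
```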
evalscope-0.16.0/evalscope/benchmarks/drop/drop_adapter.py
ADDED
@@ -0,0 +1,133 @@
import re
from typing import List

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType
from evalscope.utils.logger import get_logger

logger = get_logger()

DROP_EXAMPLES = '''Some examples of passages and Q&A are provided below.

# Examples
---
Passage: Trunajaya rebellion or Trunajaya War was the ultimately unsuccessful rebellion waged by the Madurese prince Trunajaya and fighters from Makassar against the Mataram Sultanate and its Dutch East India Company supporters in Java during the 1670s. The rebellion was initially successful: the rebels defeated the royal army at Gegodog , captured most of the Javanese north coast, and took the Mataram capital Plered . King Amangkurat I died during the retreat of the royal court. His son and successor, Amangkurat II, requested help from the VOC in exchange for financial remuneration and geopolitical concessions. The VOC\'s subsequent involvement turned the tide of the war. VOC and Mataram forces recovered lost territories and overran Trunajaya\'s new capital at Kediri . However, the rebellion continued until the capture of Trunajaya at the end of 1679, and the defeat, death, or surrender of the other rebel leaders . Trunajaya was killed by Amangkurat II personally in 1680 while a prisoner of the VOC. After his father\'s death in 1677, Amangkurat II also faced rival claims to the throne. The most serious rival was his brother Pangeran Puger, who took the capital Plered in 1677 and did not surrender until 1681.
Question: How many years was it between Trunajaya\'s capture and his death while prisoner of the VOC?
Answer: 1

---
Passage: Led by former Giant Kurt Warner, the defending NFC champions took the field at Giants Stadium against a Giants team still reeling from their bad loss in New Orleans. The Giants scored first, sending Jacobs in for a 4-yard touchdown run following a Terrell Thomas interception. Later, Arizona running back Beanie Wells scored his first career touchdown on a 13-yard rush. Manning responded by throwing a 62-yard touchdown to Nicks for his longest reception of the year. In the second half, the Cardinals\' Tim Hightower and Jason Wright scored touchdowns. But it was turnovers that decided this game; Manning\'s 3 interceptions were as many as he had thrown all season. The Giants scored only 3 points in the second half, ending the game on an interception to Antrel Rolle. The Giants notable streak of 38 consecutive starts by the same offensive line unit was ended here, as offensive tackle Kareem McKenzie missed the game with a groin injury. McKenzie returned the following week.
Question: Which player made the first score of the game?
Answer: Jacobs

---
Passage: Hoping to rebound from their road loss to the Bills, the Chargers flew to Wembley Stadium for the 2008 International Series game with the New Orleans Saints. In the first quarter, San Diego trailed early as kicker Taylor Mehlhaff got a 23-yard field goal. The \'Bolts would respond with kicker Nate Kaeding getting a 33-yard field goal. In the second quarter, New Orleans regained the lead as QB Drew Brees (a former Charger) completed a 12-yard TD pass to WR Devery Henderson (with a failed PAT) and RB Deuce McAllister getting a 1-yard TD run. San Diego answered as QB Philip Rivers completed a 12-yard TD pass to RB LaDainian Tomlinson, but the Saints replied with Brees completing a 30-yard TD pass to WR Lance Moore. The Chargers closed out the half with Rivers completing a 12-yard TD pass to TE Antonio Gates. In the third quarter, New Orleans increased its lead Brees completing a 1-yard TD pass to TE Mark Campbell, after a very controversial Pass interference call on cornerback Cletis Gordon put the Saints on the 1-yard line. The \'Bolts would answer with Kaeding getting a 24-yard field goal. In the fourth quarter, the Saints continued to build its lead as FB Mike Karney got a 1-yard TD run. San Diego tried to rally as Kaeding nailed a 31-yard field goal, Rivers completed a 14-yard TD pass to WR Vincent Jackson, and Brees giving the \'Bolts a safety via an incomplete pass thrown into the back of his own endzone. However, New Orleans\' defense stiffened for the win. With the loss, the Chargers went into their bye week at 3-5.
Question: How many total yards of touchdown passes did Drew Brees make?
Answer: 43

''' # noqa: E501


@Benchmark.register(
    name='drop',
    pretty_name='DROP',
    dataset_id='AI-ModelScope/DROP',
    metric_list=['AverageAccuracy'],
    few_shot_num=0,
    train_split=None,
    eval_split='validation',
    prompt_template=
    'You will be asked to read a passage and answer a question.{drop_examples}# Your Task\n\n---\n{query}\n\nThink step by step, then write a line of the form "Answer: $ANSWER" at the end of your response.', # noqa: E501
)
class DROPAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        few_shot_num = kwargs.get('few_shot_num', 0)
        if few_shot_num != 0:
            self.few_shot_num = 3
            logger.info(f'Few shot num is set to {self.few_shot_num} for DROP dataset by system.')
        else:
            self.few_shot_num = 0

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        """
        Generate model prompt from input data.
        """
        drop_examples = '' if self.few_shot_num == 0 else DROP_EXAMPLES
        query = f"Passage: {input_d['passage']}\nQuestion: {input_d['question']}"
        prompt = self.prompt_template.format(
            drop_examples=drop_examples,
            query=query,
        )
        return self.gen_prompt_data(prompt)

    def get_gold_answer(self, input_d: dict) -> List[str]:
        """
        Parse the raw input labels (gold).
        """

        def _flatten_validated_answers(validated_answers):
            """Flattens a dict of lists of validated answers.
            {"number": ['1', '8'], ...}
            -> [{"number": ['1'], ...}, {"number": ['8'], ...}]
            """
            valid_answers = []
            for i in range(len(validated_answers['number'])):
                valid_answers.append({
                    'number': validated_answers['number'][i],
                    'date': validated_answers['date'][i],
                    'spans': validated_answers['spans'][i],
                })
            return valid_answers

        answers = []
        answers_set = set()
        candidates = [input_d['answer']] + _flatten_validated_answers(input_d['validated_answers'])
        for candidate in candidates:
            answer = DROPAdapter.parse_answer(candidate)
            if answer in answers_set:
                continue
            answers_set.add(answer)
            answers.append(answer)
        return answers

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the predicted result and extract proper answer.
        """
        match = re.search(r'(?i)Answer\s*:\s*([^\n]+)', result)
        extracted_answer = match.group(1) if match else result
        return extracted_answer

    def match(self, gold: List[str], pred: str) -> float:
        """
        Match the gold answer and the predicted answer.
        """
        from .utils import _answer_to_bags

        max_em = 0
        for gold_answer in gold:
            # Convert the answers to bags of answers
            predicted_bags = _answer_to_bags(pred)
            gold_bags = _answer_to_bags(gold_answer)

            if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(gold_bags[0]):
                exact_match = 1.0
            else:
                exact_match = 0.0
            # Check if the answer is empty
            if gold_answer[0].strip():
                max_em = max(max_em, exact_match)

        return max_em

    @staticmethod
    def parse_answer(answer):
        # NOTE: Everything is returned as a tuple for uniformity and hashability.
        if answer['number'] != '':
            return (str(answer['number']), )
        if answer['spans'] != []:
            return tuple(answer['spans'])
        return (' '.join([answer['date']['day'], answer['date']['month'], answer['date']['year']]).strip(), )
evalscope-0.16.0/evalscope/benchmarks/drop/utils.py
ADDED
@@ -0,0 +1,59 @@
import re
import string

_ARTICLES = re.compile(r'\b(a|an|the)\b', re.UNICODE)


def _answer_to_bags(answer):
    if isinstance(answer, (list, tuple)):
        raw_spans = answer
    else:
        raw_spans = [answer]
    normalized_spans = []
    token_bags = []
    for raw_span in raw_spans:
        normalized_span = _normalize(raw_span)
        normalized_spans.append(normalized_span)
        token_bags.append(set(normalized_span.split()))
    return normalized_spans, token_bags


def _is_number(text):
    try:
        float(text)
        return True
    except ValueError:
        return False


def _remove_articles(text):
    return _ARTICLES.sub(' ', text)


def _white_space_fix(text):
    return ' '.join(text.split())


def _remove_punc(text):
    exclude = set(string.punctuation)
    if not _is_number(text):
        return ''.join(ch for ch in text if ch not in exclude)
    else:
        return text


def _fix_number(text):
    return str(float(text)) if _is_number(text) else text


def _tokenize(text):
    return re.split(' |-', text)


def _normalize(answer):
    tokens = [
        _white_space_fix(_remove_articles(_fix_number(_remove_punc(token.lower())))) for token in _tokenize(answer)
    ]
    tokens = [token for token in tokens if token.strip()]
    normalized = ' '.join(tokens).strip()
    return normalized
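A small sketch of how these helpers are used by `DROPAdapter.match` above: both answers are normalized and compared as sets of spans. It assumes the module is importable as `evalscope.benchmarks.drop.utils` once the package is installed.

```python
# Usage sketch of the DROP scoring helpers defined above.
from evalscope.benchmarks.drop.utils import _answer_to_bags

gold = ('Jacobs', )         # gold answers are tuples (see DROPAdapter.parse_answer)
pred = 'Answer was Jacobs'  # model output after the "Answer:" extraction step

pred_spans, _ = _answer_to_bags(pred)
gold_spans, _ = _answer_to_bags(gold)

# Same logic as DROPAdapter.match: exact match on the normalized span sets.
exact_match = 1.0 if set(pred_spans) == set(gold_spans) and len(pred_spans) == len(gold_spans) else 0.0
print(exact_match)  # 0.0 here; a prediction of "the Jacobs" normalizes to "jacobs" and scores 1.0
```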
{evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/general_qa/general_qa_adapter.py
RENAMED
@@ -4,7 +4,7 @@ from collections import defaultdict
 from typing import List, Optional, Union
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import
+from evalscope.metrics import mean
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
@@ -112,9 +112,13 @@ class GeneralQAAdapter(DataAdapter):
         """
         res = dict()
         if 'AverageRouge' in self.metric_list:
+            from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
+
             rouge_dict = compute_rouge_score_one_sample_zh([pred], [gold])
             res.update(rouge_dict)
         if 'AverageBLEU' in self.metric_list:
+            from evalscope.metrics import bleu_ngram_one_sample
+
             bleu_dict = bleu_ngram_one_sample(pred, gold)
             res.update(bleu_dict)
         return res
{evalscope-0.15.1 → evalscope-0.16.0}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py
RENAMED
@@ -148,6 +148,7 @@ class SimpleQAAdapter(DataAdapter):
             'is_correct': 1 if res == 'A' else 0,
             'is_incorrect': 1 if res == 'B' else 0,
             'is_not_attempted': 1 if res == 'C' else 0,
+            'judge_response': grading_response,
         }
 
     def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
evalscope-0.16.0/evalscope/benchmarks/tool_bench/tool_bench_adapter.py
ADDED
@@ -0,0 +1,67 @@
from typing import Dict, List

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType, OutputType
from evalscope.metrics import Metric, mean, metric_registry


@Benchmark.register(
    name='tool_bench',
    pretty_name='ToolBench-Static',
    dataset_id='AI-ModelScope/ToolBench-Static',
    subset_list=['in_domain', 'out_of_domain'],
    metric_list=['Act.EM', 'Plan.EM', 'F1', 'HalluRate', 'Rouge-L'],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
)
class ToolBenchAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        metric_registry.register(Metric(name='Rouge-L', object=mean))
        metric_registry.register(Metric(name='Act.EM', object=mean))
        metric_registry.register(Metric(name='Plan.EM', object=mean))
        metric_registry.register(Metric(name='F1', object=mean))
        metric_registry.register(Metric(name='HalluRate', object=mean))

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        """
        Generate model prompt from input data.
        """
        messages = input_d['messages']
        # use prepared messages
        return self.gen_prompt_data(prompt='', messages=messages)

    def get_gold_answer(self, input_d: dict) -> str:
        """
        Parse the raw input labels (gold).
        """
        return input_d

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the predicted result and extract proper answer.
        """
        return result

    def match(self, gold: dict, pred: str) -> Dict:
        """
        Match the gold answer and the predicted answer.
        """
        from .utils import calculate_metrics

        data = {
            'target': gold['target'],
            'predictions': pred,
            'tools': gold['tools'],
        }
        metrics = calculate_metrics(data)
        return metrics

    def compute_metric(self, review_res_list: List[dict], **kwargs) -> Dict:
        # aggregate review results
        res_dict = super().compute_dict_metric(review_res_list, **kwargs)

        return super().compute_metric(res_dict, **kwargs)
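For orientation, an illustrative sketch of the data shapes this adapter works with. All field values below are invented; only the keys (`messages`, `target`, `tools`) and the metric names are taken from the adapter code above, and the internals of `calculate_metrics` live in the accompanying `tool_bench/utils.py`.

```python
# Hypothetical ToolBench-Static sample as the adapter consumes it.
sample = {
    'messages': [  # passed straight to the model via gen_prompt_data(prompt='', messages=...)
        {'role': 'system', 'content': 'You can call tools to answer the user.'},
        {'role': 'user', 'content': 'What is the weather in Berlin tomorrow?'},
    ],
    'target': 'Action: get_weather ...',  # reference tool call / plan text (made up)
    'tools': [{'name': 'get_weather', 'parameters': {'city': 'string'}}],  # made up
}

# match() compares the model output against the reference roughly like this:
data = {
    'target': sample['target'],
    'predictions': '<model output>',
    'tools': sample['tools'],
}
# calculate_metrics(data) then returns per-sample scores keyed by the registered
# metrics, e.g. 'Act.EM', 'Plan.EM', 'F1', 'HalluRate', 'Rouge-L', which
# compute_metric aggregates with mean() across the whole subset.
```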