evalscope 0.9.0__tar.gz → 0.10.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {evalscope-0.9.0/evalscope.egg-info → evalscope-0.10.1}/PKG-INFO +84 -7
- {evalscope-0.9.0 → evalscope-0.10.1}/README.md +78 -6
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/arguments.py +1 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/arc/arc_adapter.py +3 -5
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/bbh_adapter.py +3 -3
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/benchmark.py +1 -1
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/ceval/ceval_adapter.py +5 -82
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/data_adapter.py +69 -70
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -63
- evalscope-0.10.1/evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
- evalscope-0.10.1/evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +4 -5
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +12 -6
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -4
- evalscope-0.10.1/evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
- evalscope-0.10.1/evalscope/benchmarks/ifeval/instructions.py +1477 -0
- evalscope-0.10.1/evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope-0.10.1/evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope-0.10.1/evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope-0.10.1/evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -84
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +2 -2
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/race/race_adapter.py +4 -73
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -6
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -57
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/cli/cli.py +2 -0
- evalscope-0.10.1/evalscope/cli/start_app.py +30 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/collections/evaluator.py +82 -62
- evalscope-0.10.1/evalscope/collections/sampler.py +138 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/collections/schema.py +14 -10
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/constants.py +4 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/evaluator/evaluator.py +22 -13
- evalscope-0.10.1/evalscope/metrics/__init__.py +4 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/metrics/metrics.py +11 -2
- evalscope-0.10.1/evalscope/metrics/named_metrics.py +17 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/models/chat_adapter.py +2 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/models/server_adapter.py +11 -4
- evalscope-0.10.1/evalscope/perf/__init__.py +1 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/main.py +0 -1
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/api/custom_api.py +1 -1
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/api/openai_api.py +1 -1
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope-0.10.1/evalscope/perf/utils/__init__.py +0 -0
- evalscope-0.10.1/evalscope/report/__init__.py +5 -0
- evalscope-0.10.1/evalscope/report/app.py +693 -0
- evalscope-0.10.1/evalscope/report/combinator.py +73 -0
- evalscope-0.10.1/evalscope/report/generator.py +80 -0
- evalscope-0.10.1/evalscope/report/utils.py +133 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/run.py +16 -11
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/summarizer.py +1 -1
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/utils/chat_service.py +1 -1
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/utils/logger.py +1 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/utils/model_utils.py +5 -2
- evalscope-0.10.1/evalscope/version.py +4 -0
- {evalscope-0.9.0 → evalscope-0.10.1/evalscope.egg-info}/PKG-INFO +84 -7
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope.egg-info/SOURCES.txt +20 -4
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope.egg-info/requires.txt +6 -0
- evalscope-0.10.1/requirements/app.txt +2 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/setup.py +2 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/tests/cli/test_collection.py +11 -7
- {evalscope-0.9.0 → evalscope-0.10.1}/tests/cli/test_run.py +13 -4
- evalscope-0.10.1/tests/rag/__init__.py +0 -0
- evalscope-0.9.0/evalscope/collections/sampler.py +0 -132
- evalscope-0.9.0/evalscope/metrics/__init__.py +0 -7
- evalscope-0.9.0/evalscope/tools/combine_reports.py +0 -133
- evalscope-0.9.0/evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- evalscope-0.9.0/evalscope/version.py +0 -4
- evalscope-0.9.0/tests/vlm/__init__.py +0 -1
- {evalscope-0.9.0 → evalscope-0.10.1}/LICENSE +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/MANIFEST.in +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/base.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/opencompass/backend_manager.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/backend_manager.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/utils/clip.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/utils/llm.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/utils/tools.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/ceval/samples.jsonl +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-0.9.0/evalscope/benchmarks/mmlu_pro → evalscope-0.10.1/evalscope/benchmarks/gpqa}/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
- {evalscope-0.9.0/evalscope/perf → evalscope-0.10.1/evalscope/benchmarks/ifeval}/__init__.py +0 -0
- {evalscope-0.9.0/evalscope/perf/utils → evalscope-0.10.1/evalscope/benchmarks/iquiz}/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
- {evalscope-0.9.0/tests/rag → evalscope-0.10.1/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/race/race.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/race/samples.jsonl +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/cli/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/cli/base.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/cli/start_eval.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/cli/start_server.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/collections/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/config.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/evaluator/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/evaluator/rating_eval.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/evaluator/reviewer/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/metrics/code_metric.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/metrics/math_accuracy.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/metrics/rouge_metric.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/models/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/models/base_adapter.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/models/choice_adapter.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/models/custom/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/models/custom/custom_model.py +0 -0
- /evalscope-0.9.0/evalscope/tools/rewrite_eval_results.py → /evalscope-0.10.1/evalscope/models/custom/dummy_model.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/models/custom_adapter.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/models/local_model.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/models/model.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/arguments.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/benchmark.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/http_client.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/api/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/api/base.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/base.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/custom.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/openqa.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/registry.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/utils/analysis_result.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/utils/benchmark_util.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/utils/db_util.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/utils/handler.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/utils/local_server.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/config/cfg_arena.yaml +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/config/cfg_single.yaml +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/data/question.jsonl +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/tasks/arc.yaml +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/tasks/bbh.yaml +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/tasks/ceval.yaml +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/tasks/cmmlu.yaml +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/tasks/general_qa.yaml +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/tasks/gsm8k.yaml +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/tasks/mmlu.yaml +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/run_arena.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/README.md +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/default_task.json +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/eval.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/infer.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/utils.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/README.md +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/config_default.json +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/utils/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/utils/arena_utils.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/utils/completion_parsers.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/utils/io_utils.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/utils/utils.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/requirements/docs.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/requirements/framework.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/requirements/inner.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/requirements/opencompass.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/requirements/perf.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/requirements/rag.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/requirements/tests.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/requirements/vlmeval.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/requirements.txt +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/setup.cfg +0 -0
- {evalscope-0.9.0/evalscope/tools → evalscope-0.10.1/tests}/__init__.py +0 -0
- {evalscope-0.9.0/tests → evalscope-0.10.1/tests/cli}/__init__.py +0 -0
- {evalscope-0.9.0/tests/cli → evalscope-0.10.1/tests/perf}/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/tests/perf/test_perf.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/tests/rag/test_clip_benchmark.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/tests/rag/test_mteb.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/tests/rag/test_ragas.py +0 -0
- {evalscope-0.9.0/tests/perf → evalscope-0.10.1/tests/swift}/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/tests/swift/test_run_swift_eval.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/tests/swift/test_run_swift_vlm_eval.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/tests/test_run_all.py +0 -0
- {evalscope-0.9.0/tests/swift → evalscope-0.10.1/tests/vlm}/__init__.py +0 -0
- {evalscope-0.9.0 → evalscope-0.10.1}/tests/vlm/test_vlmeval.py +0 -0
{evalscope-0.9.0/evalscope.egg-info → evalscope-0.10.1}/PKG-INFO +84 -7

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.9.0
+Version: 0.10.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -63,6 +63,9 @@ Requires-Dist: numpy; extra == "perf"
 Requires-Dist: sse_starlette; extra == "perf"
 Requires-Dist: transformers; extra == "perf"
 Requires-Dist: unicorn; extra == "perf"
+Provides-Extra: app
+Requires-Dist: gradio>=5.4.0; extra == "app"
+Requires-Dist: plotly>=5.23.0; extra == "app"
 Provides-Extra: inner
 Requires-Dist: absl-py; extra == "inner"
 Requires-Dist: accelerate; extra == "inner"
@@ -133,6 +136,8 @@ Requires-Dist: numpy; extra == "all"
 Requires-Dist: sse_starlette; extra == "all"
 Requires-Dist: transformers; extra == "all"
 Requires-Dist: unicorn; extra == "all"
+Requires-Dist: gradio>=5.4.0; extra == "all"
+Requires-Dist: plotly>=5.23.0; extra == "all"
 
 <p align="center">
 <br>
@@ -210,6 +215,8 @@ Please scan the QR code below to join our community groups:
 
 
 ## 🎉 News
+- 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
+- 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
@@ -374,15 +381,85 @@ run_task(task_cfg="config.json")
 - `--limit`: Maximum amount of evaluation data for each dataset. If not specified, it defaults to evaluating all data. Can be used for quick validation
 
 ### Output Results
+```text
++-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+| Model Name            | Dataset Name   | Metric Name     | Category Name   | Subset Name   |   Num |   Score |
++=======================+================+=================+=================+===============+=======+=========+
+| Qwen2.5-0.5B-Instruct | gsm8k          | AverageAccuracy | default         | main          |     5 |     0.4 |
++-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+| Qwen2.5-0.5B-Instruct | ai2_arc        | AverageAccuracy | default         | ARC-Easy      |     5 |     0.8 |
++-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+| Qwen2.5-0.5B-Instruct | ai2_arc        | AverageAccuracy | default         | ARC-Challenge |     5 |     0.4 |
++-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+```
+
+## 📈 Visualization of Evaluation Results
+
+1. Install the dependencies required for visualization, including gradio, plotly, etc.
+```bash
+pip install 'evalscope[app]'
 ```
-
-
-
-
-
+
+2. Start the Visualization Service
+
+Run the following command to start the visualization service.
+```bash
+evalscope app
+```
+You can access the visualization service in the browser if the following output appears.
+```text
+* Running on local URL: http://127.0.0.1:7861
+
+To create a public link, set `share=True` in `launch()`.
 ```
 
-
+<table>
+<tr>
+<td style="text-align: center;">
+<img src="docs/en/get_started/images/setting.png" alt="Setting" style="width: 75%;" />
+<p>Setting Interface</p>
+</td>
+<td style="text-align: center;">
+<img src="docs/en/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
+<p>Model Comparison</p>
+</td>
+</tr>
+<tr>
+<td style="text-align: center;">
+<img src="docs/en/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
+<p>Report Overview</p>
+</td>
+<td style="text-align: center;">
+<img src="docs/en/get_started/images/report_details.png" alt="Report Details" style="width: 80%;" />
+<p>Report Details</p>
+</td>
+</tr>
+</table>
+
+For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)
+
+## 🌐 Evaluation of Specified Model API
+
+Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:
+
+For example, to launch a model service using [vLLM](https://github.com/vllm-project/vllm):
+
+```shell
+export VLLM_USE_MODELSCOPE=True && python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-0.5B-Instruct --served-model-name qwen2.5 --trust_remote_code --port 8801
+```
+Then, you can use the following command to evaluate the model API service:
+```shell
+evalscope eval \
+ --model qwen2.5 \
+ --api-url http://127.0.0.1:8801/v1/chat/completions \
+ --api-key EMPTY \
+ --eval-type service \
+ --datasets gsm8k \
+ --limit 10
+```
+
+## ⚙️ Custom Parameter Evaluation
+
 For more customized evaluations, such as customizing model parameters or dataset parameters, you can use the following command. The evaluation startup method is the same as simple evaluation. Below shows how to start the evaluation using the `eval` command:
 
 ```shell
{evalscope-0.9.0 → evalscope-0.10.1}/README.md +78 -6

@@ -74,6 +74,8 @@ Please scan the QR code below to join our community groups:
 
 
 ## 🎉 News
+- 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
+- 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
@@ -238,15 +240,85 @@ run_task(task_cfg="config.json")
 - `--limit`: Maximum amount of evaluation data for each dataset. If not specified, it defaults to evaluating all data. Can be used for quick validation
 
 ### Output Results
+```text
++-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+| Model Name            | Dataset Name   | Metric Name     | Category Name   | Subset Name   |   Num |   Score |
++=======================+================+=================+=================+===============+=======+=========+
+| Qwen2.5-0.5B-Instruct | gsm8k          | AverageAccuracy | default         | main          |     5 |     0.4 |
++-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+| Qwen2.5-0.5B-Instruct | ai2_arc        | AverageAccuracy | default         | ARC-Easy      |     5 |     0.8 |
++-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+| Qwen2.5-0.5B-Instruct | ai2_arc        | AverageAccuracy | default         | ARC-Challenge |     5 |     0.4 |
++-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+```
+
+## 📈 Visualization of Evaluation Results
+
+1. Install the dependencies required for visualization, including gradio, plotly, etc.
+```bash
+pip install 'evalscope[app]'
 ```
-
-
-
-
-
+
+2. Start the Visualization Service
+
+Run the following command to start the visualization service.
+```bash
+evalscope app
+```
+You can access the visualization service in the browser if the following output appears.
+```text
+* Running on local URL: http://127.0.0.1:7861
+
+To create a public link, set `share=True` in `launch()`.
 ```
 
-
+<table>
+<tr>
+<td style="text-align: center;">
+<img src="docs/en/get_started/images/setting.png" alt="Setting" style="width: 75%;" />
+<p>Setting Interface</p>
+</td>
+<td style="text-align: center;">
+<img src="docs/en/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
+<p>Model Comparison</p>
+</td>
+</tr>
+<tr>
+<td style="text-align: center;">
+<img src="docs/en/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
+<p>Report Overview</p>
+</td>
+<td style="text-align: center;">
+<img src="docs/en/get_started/images/report_details.png" alt="Report Details" style="width: 80%;" />
+<p>Report Details</p>
+</td>
+</tr>
+</table>
+
+For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)
+
+## 🌐 Evaluation of Specified Model API
+
+Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:
+
+For example, to launch a model service using [vLLM](https://github.com/vllm-project/vllm):
+
+```shell
+export VLLM_USE_MODELSCOPE=True && python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-0.5B-Instruct --served-model-name qwen2.5 --trust_remote_code --port 8801
+```
+Then, you can use the following command to evaluate the model API service:
+```shell
+evalscope eval \
+ --model qwen2.5 \
+ --api-url http://127.0.0.1:8801/v1/chat/completions \
+ --api-key EMPTY \
+ --eval-type service \
+ --datasets gsm8k \
+ --limit 10
+```
+
+## ⚙️ Custom Parameter Evaluation
+
 For more customized evaluations, such as customizing model parameters or dataset parameters, you can use the following command. The evaluation startup method is the same as simple evaluation. Below shows how to start the evaluation using the `eval` command:
 
 ```shell
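The API-evaluation section added to the README above only shows the CLI entry point. Below is a minimal Python sketch of the same run, assuming the config dict keys mirror the CLI flags (`api_url`, `api_key`, `eval_type`, `datasets`, `limit`) and that `run_task` accepts a plain dict, as the README's `run_task(task_cfg="config.json")` example suggests.

```python
# Hedged sketch: evaluate an OpenAI-compatible endpoint with the native backend.
# Key names are assumed to mirror the CLI flags shown in the README hunk above.
from evalscope.run import run_task

task_cfg = {
    'model': 'qwen2.5',                                      # served model name
    'api_url': 'http://127.0.0.1:8801/v1/chat/completions',  # OpenAI-compatible endpoint
    'api_key': 'EMPTY',
    'eval_type': 'service',                                  # evaluate a deployed API, not local weights
    'datasets': ['gsm8k'],
    'limit': 10,                                             # quick smoke test on 10 samples
}

if __name__ == '__main__':
    run_task(task_cfg=task_cfg)
```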
{evalscope-0.9.0 → evalscope-0.10.1}/evalscope/arguments.py +1 -0

@@ -33,6 +33,7 @@ def add_argument(parser: argparse.ArgumentParser):
     # yapf: disable
     # Model-related arguments
     parser.add_argument('--model', type=str, required=False, help='The model id on modelscope, or local model dir.')
+    parser.add_argument('--model-id', type=str, required=False, help='The model id for model name in report.')
     parser.add_argument('--model-args', type=str, action=ParseStrArgsAction, help='The model args, should be a string.')
 
     # Template-related arguments
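The only change to `arguments.py` is the new `--model-id` flag, which decouples the name shown in the report from the checkpoint path. A hedged sketch of the programmatic counterpart follows; `model_id` is assumed to be the config key that backs the CLI flag.

```python
# Sketch only: 'model_id' is assumed to map to the new --model-id flag and to control
# nothing but the model name used in the generated report.
from evalscope.run import run_task

task_cfg = {
    'model': 'Qwen/Qwen2.5-0.5B-Instruct',  # ModelScope id or local checkpoint dir
    'model_id': 'qwen2.5-0.5b-baseline',    # display name in the report (assumed key)
    'datasets': ['gsm8k'],
    'limit': 5,
}

run_task(task_cfg=task_cfg)
```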
{evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/arc/arc_adapter.py +3 -5

@@ -5,7 +5,7 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import AverageAccuracy, exact_match
 from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
@@ -20,7 +20,7 @@ logger = get_logger()
     dataset_id='modelscope/ai2_arc',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=['ARC-Easy', 'ARC-Challenge'],
-    metric_list=[
+    metric_list=[AverageAccuracy],
     few_shot_num=0,
     train_split='train',
     eval_split='test',
@@ -109,12 +109,10 @@ class ARCAdapter(DataAdapter):
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
         context: str = '\n'.join(few_shot_prompts)
 
-        context = f'{self.prompt_template}\n{context}' if self.prompt_template else context
-
         # context = f'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n {context}'
         full_prompt: str = context + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices}
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
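Both changes to the ARC adapter move the prompt template out of the user prompt: the old code prepended `self.prompt_template` to the few-shot context, while the new code returns it under a separate `system_prompt` key. A rough sketch of the resulting payload, with the prompt text invented for illustration and the downstream handling (presumably the model adapter placing it in the system role) assumed:

```python
# Shape of the dict gen_prompt now returns, per the hunk above.
prompt_payload = {
    'data': ['Question: Which gas do plants absorb?\nA. Oxygen\nB. Carbon dioxide\n...\nAnswer:'],
    'multi_choices': ['A', 'B', 'C', 'D'],
    'system_prompt': None,  # ARC's registration above sets no prompt_template, so this stays None
}
```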
{evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/bbh_adapter.py +3 -3

@@ -7,7 +7,7 @@ import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import AnswerKeys
-from evalscope.metrics import
+from evalscope.metrics import AverageAccuracy, exact_match
 from evalscope.models.chat_adapter import ChatGenerationModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
@@ -63,7 +63,7 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
     dataset_id='modelscope/bbh',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[
+    metric_list=[AverageAccuracy],
     few_shot_num=3,
     train_split=None,
     eval_split='test',
@@ -122,7 +122,7 @@ class BBHAdapter(DataAdapter):
         cot_prompts: str = few_shot_list[0] if len(few_shot_list) > 0 else ''
         full_prompt: str = f"Follow the given examples and answer the question.\n{cot_prompts}\n\nQ: {input_d['input']}\nA: Let's think step by step."
 
-        return {'data': [full_prompt]}
+        return {'data': [full_prompt], 'system_prompt': self.prompt_template}
 
     def gen_prompts(self, data_dict: dict) -> dict:
         """
{evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/benchmark.py +1 -1

@@ -22,7 +22,7 @@ class BenchmarkMeta:
     few_shot_random: bool = False
     train_split: Optional[str] = None
     eval_split: Optional[str] = None
-    prompt_template: str =
+    prompt_template: Optional[str] = None
 
     def _update(self, args: dict):
         if args.get('local_path'):
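`prompt_template` on `BenchmarkMeta` is now `Optional[str]` defaulting to `None`, so only benchmarks that actually want a system prompt set it. A minimal sketch of a registration that does, using kwarg names that appear verbatim in the adapter hunks of this diff; the `@Benchmark.register` decorator form, the `name` kwarg, and the adapter body are assumptions for illustration.

```python
# Hedged sketch of registering a benchmark with an explicit prompt_template.
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import AverageAccuracy
from evalscope.models import ChatGenerationModelAdapter


@Benchmark.register(
    name='my_math_bench',                    # hypothetical benchmark name
    dataset_id='modelscope/competition_math',
    model_adapter=ChatGenerationModelAdapter,
    subset_list=['default'],
    metric_list=[AverageAccuracy],
    few_shot_num=4,
    train_split='train',
    eval_split='test',
    prompt_template='Put the final answer in \\boxed{}.',  # optional now; omit it to get None
)
class MyMathAdapter(DataAdapter):
    """Illustrative stub; real adapters implement gen_prompt, get_gold_answer, match, etc."""
    pass
```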
{evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/ceval/ceval_adapter.py +5 -82

@@ -4,7 +4,7 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import AverageAccuracy
 from evalscope.metrics.metrics import exact_match, weighted_mean
 from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser, normalize_score
@@ -130,7 +130,7 @@ SUBJECT_MAPPING = {
     dataset_id='modelscope/ceval-exam',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[
+    metric_list=[AverageAccuracy],
     few_shot_num=0,
     train_split='dev',
     eval_split='val',
@@ -145,9 +145,10 @@ class CEVALAdapter(DataAdapter):
         if few_shot_num > 5:
             logger.warning(f'few_shot_num <= 5 for C-Eval, but got {few_shot_num}. Use 5-shot by default.')
             kwargs['few_shot_num'] = 5
-
         super().__init__(**kwargs)
 
+        self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
         for subset_name in subset_list:
@@ -206,7 +207,7 @@
         subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
         full_prompt = f'以下是中国关于{subject_name}考试的单项选择题,请选出其中的正确答案。\n' + full_prompt
 
-        return {'data': [full_prompt], 'multi_choices': self.choices}
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -236,84 +237,6 @@
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate report for the evaluation.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns:
-        {
-            "name":"C-Eval",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"STEM",
-                    "score":0.2528,
-                    "subset":[
-                        {
-                            "name":"computer_network",
-                            "score":0.2632
-                        },
-                        {
-                            "name":"operating_system",
-                            "score":0.3157
-                        },
-                        {
-                            "name":"computer_architecture",
-                            "score":0.4285
-                        }
-                    ]
-                }
-            ],
-            "total_num":59
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-
-        # Get domain-subject mapping
-        subject_review_map = {}
-        for subset_name, (subset_score, num) in subset_score_map.items():
-            domain_name: str = SUBJECT_MAPPING.get(subset_name)[2] if SUBJECT_MAPPING.get(subset_name) else 'DEFAULT'
-            if domain_name in subject_review_map:
-                subject_review_map[domain_name].append((subset_name, subset_score, num))
-            else:
-                subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
-        # Get domain score
-        category_list = []
-        for domain_name, domain_res_list in subject_review_map.items():
-            domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
-                sum([num for _, _, num in domain_res_list])
-            domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
-            category_list.append({
-                'name':
-                domain_name,
-                'score':
-                domain_weighted_avg_acc,
-                'subset': [{
-                    'name': subset_name,
-                    'score': normalize_score(score=subset_score)
-                } for subset_name, subset_score, _ in domain_res_list]
-            })
-
-        category_list = sorted(category_list, key=lambda x: x['name'])
-
-        # Get final dict of report
-        res_map = dict(
-            name=report_name or 'ceval',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=category_list,
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _format_example(cls, input_d: dict, include_answer=True):
         example = '问题:' + input_d['question']
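The C-Eval adapter (and CMMLU below) drops its hand-rolled `gen_report` in favour of a `category_map` built from `SUBJECT_MAPPING`; per the file list, report assembly presumably now lives in the new `evalscope/report/` package. A small illustration of what the comprehension produces; the mapping entries here are made up, the real ones live in `ceval_adapter.py`.

```python
# Illustration only: SUBJECT_MAPPING values end with the subject's category, so v[-1]
# gives a flat subset -> category lookup that a generic report generator can consume.
SUBJECT_MAPPING = {
    'computer_network': ['Computer Network', '计算机网络', 'STEM'],              # hypothetical entry
    'college_economics': ['College Economics', '大学经济学', 'Social Science'],  # hypothetical entry
}

category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
print(category_map)
# {'computer_network': 'STEM', 'college_economics': 'Social Science'}
```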
{evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79

@@ -5,7 +5,7 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import AverageAccuracy, exact_match
 from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser, normalize_score
 from evalscope.utils.logger import get_logger
@@ -106,7 +106,7 @@ SUBJECT_MAPPING = {
     dataset_id='modelscope/cmmlu',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[
+    metric_list=[AverageAccuracy],
     few_shot_num=5,
     train_split='dev',
     eval_split='test',
@@ -116,9 +116,10 @@ class CMMLUAdapter(DataAdapter):
     choices = ['A', 'B', 'C', 'D']
 
     def __init__(self, **kwargs):
-
         super().__init__(**kwargs)
 
+        self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
         for subset_name in subset_list:
@@ -173,7 +174,7 @@ class CMMLUAdapter(DataAdapter):
 
         full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices}
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -203,81 +204,6 @@
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate report for the evaluation.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: the user-defined report name. Default: None
-
-        Returns:
-        {
-            "name":"CMMLU",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"STEM",
-                    "score":0.2528,
-                    "subset":[
-                        {
-                            "name":"computer_network",
-                            "score":0.2632
-                        },
-                        {
-                            "name":"operating_system",
-                            "score":0.3157
-                        },
-                        {
-                            "name":"computer_architecture",
-                            "score":0.4285
-                        }
-                    ]
-                }
-            ],
-            "total_num":59
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-
-        # Get domain-subject mapping
-        subject_review_map = {}
-        for subset_name, (subset_score, num) in subset_score_map.items():
-            domain_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
-            if domain_name in subject_review_map:
-                subject_review_map[domain_name].append((subset_name, subset_score, num))
-            else:
-                subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
-        # Get domain score
-        category_list = []
-        for domain_name, domain_res_list in subject_review_map.items():
-            domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
-                sum([num for _, _, num in domain_res_list])
-            domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
-            category_list.append({
-                'name':
-                domain_name,
-                'score':
-                domain_weighted_avg_acc,
-                'subset': [{
-                    'name': subset_name,
-                    'score': normalize_score(subset_score)
-                } for subset_name, subset_score, _ in domain_res_list]
-            })
-
-        # Get final dict of report
-        res_map = dict(
-            name=report_name or 'cmmlu',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=category_list,
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
 
{evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4

@@ -5,7 +5,7 @@ import json
 import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import
+from evalscope.metrics import AverageAccuracy
 from evalscope.metrics.math_accuracy import is_equiv, last_boxed_only_string, remove_boxed
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger
@@ -20,11 +20,11 @@ logger = get_logger()
     dataset_id='modelscope/competition_math',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
-    metric_list=[
+    metric_list=[AverageAccuracy],
     few_shot_num=4,
     train_split='train',
     eval_split='test',
-    prompt_template='',
+    prompt_template='Put the final answer in \\boxed{}.',
 )
 class CompetitionMathAdapter(DataAdapter):
     """ To be tested for all models. """
@@ -77,7 +77,7 @@ class CompetitionMathAdapter(DataAdapter):
         use_fewshot = self.few_shot_num > 0
         full_prompt = self._generate_prompt(input_d, use_fewshot=use_fewshot)
 
-        return {'data': [full_prompt], 'system_prompt':
+        return {'data': [full_prompt], 'system_prompt': self.prompt_template}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.