evalscope 0.10.0__tar.gz → 0.11.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {evalscope-0.10.0/evalscope.egg-info → evalscope-0.11.0}/PKG-INFO +20 -11
- {evalscope-0.10.0 → evalscope-0.11.0}/README.md +7 -6
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/arguments.py +1 -0
- evalscope-0.11.0/evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/arc/arc_adapter.py +5 -7
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/benchmark.py +2 -2
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/data_adapter.py +18 -12
- evalscope-0.11.0/evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
- evalscope-0.11.0/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
- evalscope-0.11.0/evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
- evalscope-0.11.0/evalscope/benchmarks/gpqa/gpqa_adapter.py +121 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -14
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/ifeval/instructions.py +3 -4
- evalscope-0.11.0/evalscope/benchmarks/iquiz/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
- evalscope-0.11.0/evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope-0.11.0/evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
- evalscope-0.11.0/evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/race/race_adapter.py +3 -3
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/cli/start_app.py +3 -2
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/collections/evaluator.py +103 -39
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/collections/sampler.py +2 -1
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/collections/schema.py +1 -2
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/config.py +1 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/evaluator/evaluator.py +78 -64
- evalscope-0.11.0/evalscope/metrics/math_parser.py +526 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/metrics/metrics.py +16 -1
- evalscope-0.11.0/evalscope/metrics/named_metrics.py +41 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/models/chat_adapter.py +69 -47
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/models/choice_adapter.py +52 -45
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/models/custom_adapter.py +2 -2
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/models/local_model.py +4 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/models/server_adapter.py +28 -34
- evalscope-0.11.0/evalscope/perf/utils/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/report/app.py +298 -96
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/run.py +10 -7
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/utils/chat_service.py +2 -2
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/utils/io_utils.py +1 -1
- evalscope-0.11.0/evalscope/version.py +4 -0
- {evalscope-0.10.0 → evalscope-0.11.0/evalscope.egg-info}/PKG-INFO +20 -11
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope.egg-info/SOURCES.txt +12 -2
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope.egg-info/requires.txt +12 -4
- {evalscope-0.10.0 → evalscope-0.11.0}/requirements/app.txt +1 -1
- {evalscope-0.10.0 → evalscope-0.11.0}/requirements/framework.txt +6 -2
- {evalscope-0.10.0 → evalscope-0.11.0}/tests/cli/test_run.py +93 -16
- evalscope-0.11.0/tests/rag/__init__.py +0 -0
- evalscope-0.10.0/evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope-0.10.0/evalscope/metrics/math_accuracy.py +0 -200
- evalscope-0.10.0/evalscope/metrics/named_metrics.py +0 -17
- evalscope-0.10.0/evalscope/version.py +0 -4
- {evalscope-0.10.0 → evalscope-0.11.0}/LICENSE +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/MANIFEST.in +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/base.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/opencompass/backend_manager.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/backend_manager.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/utils/clip.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/utils/llm.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/utils/tools.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/__init__.py +0 -0
- {evalscope-0.10.0/evalscope/benchmarks/ifeval → evalscope-0.11.0/evalscope/benchmarks/aime24}/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
- {evalscope-0.10.0/evalscope/benchmarks/iquiz → evalscope-0.11.0/evalscope/benchmarks/data_collection}/__init__.py +0 -0
- {evalscope-0.10.0/evalscope/benchmarks/mmlu_pro → evalscope-0.11.0/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-0.10.0/evalscope/perf/utils → evalscope-0.11.0/evalscope/benchmarks/gpqa}/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
- {evalscope-0.10.0/tests/rag → evalscope-0.11.0/evalscope/benchmarks/ifeval}/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/ifeval/utils.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/race/race.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/race/samples.jsonl +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/cli/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/cli/base.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/cli/cli.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/cli/start_eval.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/cli/start_server.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/collections/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/constants.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/evaluator/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/evaluator/rating_eval.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/evaluator/reviewer/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/metrics/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/metrics/code_metric.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/metrics/rouge_metric.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/models/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/models/base_adapter.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/models/custom/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/models/custom/custom_model.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/models/custom/dummy_model.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/models/model.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/arguments.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/benchmark.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/http_client.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/main.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/api/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/api/base.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/api/custom_api.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/api/openai_api.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/base.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/custom.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/openqa.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/registry.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/utils/analysis_result.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/utils/benchmark_util.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/utils/db_util.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/utils/handler.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/utils/local_server.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/config/cfg_arena.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/config/cfg_single.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/data/question.jsonl +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/tasks/arc.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/tasks/bbh.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/tasks/ceval.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/tasks/cmmlu.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/tasks/general_qa.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/tasks/gsm8k.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/tasks/mmlu.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/report/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/report/combinator.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/report/generator.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/report/utils.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/run_arena.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/summarizer.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/README.md +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/default_task.json +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/eval.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/infer.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/utils.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/README.md +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/config_default.json +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/utils/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/utils/arena_utils.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/utils/completion_parsers.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/utils/logger.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/utils/model_utils.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/utils/utils.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/requirements/docs.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/requirements/inner.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/requirements/opencompass.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/requirements/perf.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/requirements/rag.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/requirements/tests.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/requirements/vlmeval.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/requirements.txt +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/setup.cfg +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/setup.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/tests/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/tests/cli/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/tests/cli/test_collection.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/tests/perf/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/tests/perf/test_perf.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/tests/rag/test_clip_benchmark.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/tests/rag/test_mteb.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/tests/rag/test_ragas.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/tests/swift/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/tests/swift/test_run_swift_eval.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/tests/swift/test_run_swift_vlm_eval.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/tests/test_run_all.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/tests/vlm/__init__.py +0 -0
- {evalscope-0.10.0 → evalscope-0.11.0}/tests/vlm/test_vlmeval.py +0 -0
{evalscope-0.10.0/evalscope.egg-info → evalscope-0.11.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.10.0
+Version: 0.11.0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -19,10 +19,12 @@ License-File: LICENSE
 Requires-Dist: absl-py
 Requires-Dist: accelerate
 Requires-Dist: cachetools
-Requires-Dist: datasets<=3.0
+Requires-Dist: datasets<=3.2.0,>=3.0.0
 Requires-Dist: editdistance
 Requires-Dist: jieba
 Requires-Dist: jsonlines
+Requires-Dist: langdetect
+Requires-Dist: latex2sympy2
 Requires-Dist: matplotlib
 Requires-Dist: modelscope[framework]
 Requires-Dist: nltk>=3.9
@@ -42,12 +44,14 @@ Requires-Dist: scikit-learn
 Requires-Dist: seaborn
 Requires-Dist: sentencepiece
 Requires-Dist: simple-ddl-parser
+Requires-Dist: sympy
 Requires-Dist: tabulate
 Requires-Dist: tiktoken
 Requires-Dist: torch
 Requires-Dist: tqdm
 Requires-Dist: transformers>=4.33
 Requires-Dist: transformers_stream_generator
+Requires-Dist: word2number
 Provides-Extra: opencompass
 Requires-Dist: ms-opencompass>=0.1.4; extra == "opencompass"
 Provides-Extra: vlmeval
@@ -64,7 +68,7 @@ Requires-Dist: sse_starlette; extra == "perf"
 Requires-Dist: transformers; extra == "perf"
 Requires-Dist: unicorn; extra == "perf"
 Provides-Extra: app
-Requires-Dist: gradio
+Requires-Dist: gradio==5.4.0; extra == "app"
 Requires-Dist: plotly>=5.23.0; extra == "app"
 Provides-Extra: inner
 Requires-Dist: absl-py; extra == "inner"
@@ -96,10 +100,12 @@ Provides-Extra: all
 Requires-Dist: absl-py; extra == "all"
 Requires-Dist: accelerate; extra == "all"
 Requires-Dist: cachetools; extra == "all"
-Requires-Dist: datasets<=3.0
+Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
 Requires-Dist: editdistance; extra == "all"
 Requires-Dist: jieba; extra == "all"
 Requires-Dist: jsonlines; extra == "all"
+Requires-Dist: langdetect; extra == "all"
+Requires-Dist: latex2sympy2; extra == "all"
 Requires-Dist: matplotlib; extra == "all"
 Requires-Dist: modelscope[framework]; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
@@ -119,12 +125,14 @@ Requires-Dist: scikit-learn; extra == "all"
 Requires-Dist: seaborn; extra == "all"
 Requires-Dist: sentencepiece; extra == "all"
 Requires-Dist: simple-ddl-parser; extra == "all"
+Requires-Dist: sympy; extra == "all"
 Requires-Dist: tabulate; extra == "all"
 Requires-Dist: tiktoken; extra == "all"
 Requires-Dist: torch; extra == "all"
 Requires-Dist: tqdm; extra == "all"
 Requires-Dist: transformers>=4.33; extra == "all"
 Requires-Dist: transformers_stream_generator; extra == "all"
+Requires-Dist: word2number; extra == "all"
 Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
 Requires-Dist: mteb==1.19.4; extra == "all"
@@ -136,7 +144,7 @@ Requires-Dist: numpy; extra == "all"
 Requires-Dist: sse_starlette; extra == "all"
 Requires-Dist: transformers; extra == "all"
 Requires-Dist: unicorn; extra == "all"
-Requires-Dist: gradio
+Requires-Dist: gradio==5.4.0; extra == "all"
 Requires-Dist: plotly>=5.23.0; extra == "all"
 
 <p align="center">
@@ -215,7 +223,8 @@ Please scan the QR code below to join our community groups:
 
 
 ## 🎉 News
-- 🔥 **[2025.
+- 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
+- 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
@@ -416,27 +425,27 @@ To create a public link, set `share=True` in `launch()`.
 <table>
 <tr>
 <td style="text-align: center;">
-<img src="docs/
+<img src="docs/en/get_started/images/setting.png" alt="Setting" style="width: 75%;" />
 <p>Setting Interface</p>
 </td>
 <td style="text-align: center;">
-<img src="docs/
+<img src="docs/en/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
 <p>Model Comparison</p>
 </td>
 </tr>
 <tr>
 <td style="text-align: center;">
-<img src="docs/
+<img src="docs/en/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
 <p>Report Overview</p>
 </td>
 <td style="text-align: center;">
-<img src="docs/
+<img src="docs/en/get_started/images/report_details.png" alt="Report Details" style="width: 80%;" />
 <p>Report Details</p>
 </td>
 </tr>
 </table>
 
-For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/
+For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)
 
 ## 🌐 Evaluation of Specified Model API
 
{evalscope-0.10.0 → evalscope-0.11.0}/README.md

@@ -74,7 +74,8 @@ Please scan the QR code below to join our community groups:
 
 
 ## 🎉 News
-- 🔥 **[2025.
+- 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
+- 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
@@ -275,27 +276,27 @@ To create a public link, set `share=True` in `launch()`.
 <table>
 <tr>
 <td style="text-align: center;">
-<img src="docs/
+<img src="docs/en/get_started/images/setting.png" alt="Setting" style="width: 75%;" />
 <p>Setting Interface</p>
 </td>
 <td style="text-align: center;">
-<img src="docs/
+<img src="docs/en/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
 <p>Model Comparison</p>
 </td>
 </tr>
 <tr>
 <td style="text-align: center;">
-<img src="docs/
+<img src="docs/en/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
 <p>Report Overview</p>
 </td>
 <td style="text-align: center;">
-<img src="docs/
+<img src="docs/en/get_started/images/report_details.png" alt="Report Details" style="width: 80%;" />
 <p>Report Details</p>
 </td>
 </tr>
 </table>
 
-For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/
+For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)
 
 ## 🌐 Evaluation of Specified Model API
 
{evalscope-0.10.0 → evalscope-0.11.0}/evalscope/arguments.py

@@ -58,6 +58,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
                         choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
     parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')
+    parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')
 
     # Cache and working directory arguments
     parser.add_argument('--mem-cache', action='store_true', default=False, help='Deprecated, will be removed in v1.0.0.')  # noqa: E501
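Note: the new option above is plain argparse plumbing. A minimal, self-contained sketch of how the added flag behaves in isolation; only `--limit` and `--eval-batch-size` mirror the diff, the rest of the parser setup is illustrative and not the full evalscope CLI:

```python
# Standalone sketch; only the two options shown mirror evalscope/arguments.py.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')
parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')

args = parser.parse_args(['--eval-batch-size', '8'])
print(args.eval_batch_size)  # -> 8; argparse exposes --eval-batch-size as args.eval_batch_size
```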
evalscope-0.11.0/evalscope/benchmarks/aime24/aime24_adapter.py

@@ -0,0 +1,49 @@
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='aime24',
+    dataset_id='HuggingFaceH4/aime_2024',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=['AveragePass@1'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='train',  # Only train set is available
+    prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+)
+class AIME24Adapter(DataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate the prompt for the model input.
+        """
+        problem = input_d['problem']
+        full_prompt = self.prompt_template.format(query=problem)
+
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        # Extract the gold answer from the input dict.
+        return strip_answer_string(input_d['answer'])
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        """
+        Parse the model output to get the answer. Could be the best choice index.
+        """
+        # Note: Use same extraction method for both of checkpoint/service/custom
+        result = strip_answer_string(extract_answer(result))
+        return result
+
+    def match(self, gold: str, pred: str) -> float:
+        return math_equal(pred, gold)
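Note: scoring in this adapter is delegated entirely to `evalscope.metrics.math_parser` (`extract_answer`, `strip_answer_string`, `math_equal`). A toy approximation of the extraction step, assuming the common case of a single un-nested `\boxed{...}` answer; the real parser also handles nested braces, LaTeX fractions, and symbolic equality:

```python
import re

def toy_extract_boxed(text: str) -> str:
    """Toy stand-in for math_parser.extract_answer: return the content of the
    last \\boxed{...} in a response; falls back to the raw text."""
    matches = re.findall(r'\\boxed\{([^{}]*)\}', text)
    return matches[-1].strip() if matches else text.strip()

pred = toy_extract_boxed(r"Summing the roots gives 7, so the final answer is \boxed{7}.")
print(pred == '7')  # True; the real adapter compares with math_equal, not plain string equality
```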
{evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/arc/arc_adapter.py

@@ -5,7 +5,7 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
@@ -20,7 +20,7 @@ logger = get_logger()
     dataset_id='modelscope/ai2_arc',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=['ARC-Easy', 'ARC-Challenge'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split='train',
     eval_split='test',
@@ -112,7 +112,7 @@ class ARCAdapter(DataAdapter):
         # context = f'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n {context}'
         full_prompt: str = context + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -133,11 +133,9 @@ class ARCAdapter(DataAdapter):
         if eval_type == EvalType.CHECKPOINT:
             return result
         elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(
-                text=result, options=self.choices)  # TODO: to be checked !
+            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
         elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(
-                text=result, options=self.choices)  # TODO: to be checked !
+            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
 
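Note: for service/custom evaluation the adapter now calls `ResponseParser.parse_first_option_with_choices` on a single line. Conceptually it picks the first of the candidate option letters that appears in the free-text response; a toy illustration of that idea, not the actual `ResponseParser` implementation:

```python
import re

def toy_first_option(text: str, options):
    """Toy 'first option among the given choices' parser."""
    pattern = r'\b(' + '|'.join(re.escape(opt) for opt in options) + r')\b'
    match = re.search(pattern, text)
    return match.group(1) if match else None

print(toy_first_option('The correct answer is B, because ...', ['A', 'B', 'C', 'D']))  # -> 'B'
```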
{evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/bbh_adapter.py

@@ -7,7 +7,7 @@ import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import AnswerKeys
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models.chat_adapter import ChatGenerationModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
@@ -63,11 +63,11 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
     dataset_id='modelscope/bbh',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=3,
     train_split=None,
     eval_split='test',
-    prompt_template='
+    prompt_template="Q: {query}\nA: Let's think step by step.",
 )
 class BBHAdapter(DataAdapter):
     """
@@ -119,10 +119,13 @@ class BBHAdapter(DataAdapter):
             {'data': ['xxx']}
         """
         # few_shot_list: should be ['xxxx']
-
-
+        if len(few_shot_list) > 0:
+            cot_prompts = 'Follow the given examples and answer the question.\n' + few_shot_list[0]
+        else:
+            cot_prompts = ''
+        full_prompt = cot_prompts + self.prompt_template.format(query=input_d['input'])
 
-        return {'data': [full_prompt], 'system_prompt': self.
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
 
     def gen_prompts(self, data_dict: dict) -> dict:
         """
@@ -177,9 +180,11 @@ class BBHAdapter(DataAdapter):
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
-        gold = input_d.get('target')
+        gold = input_d.get('target', '')
+        # remove brackets
         if gold is None:
             logger.error(f'BBHAdapter: gold is None.')
+        gold = gold.replace('(', '').replace(')', '')
         return gold
 
     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
@@ -228,8 +233,11 @@ class BBHAdapter(DataAdapter):
         """
         Extract the answer from the model output for Free-form task.
         """
-
-
+        pattern = r'answer is\s+(.*?)\.'
+
+        match = re.search(pattern, ans)
+        if match:
+            res = match.group(1)
             return res
 
         ans_line = ans.split('answer is ')
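Note: two behavioural changes above are easy to miss: gold targets now get surrounding brackets stripped (so `(A)` becomes `A`), and free-form answers are first matched against an `answer is ...` pattern before the existing `split('answer is ')` fallback. A quick check of both, with a made-up model response:

```python
import re

# Gold normalization as in the new get_gold_answer
gold = '(A)'.replace('(', '').replace(')', '')   # -> 'A'

# New free-form extraction pattern (the split-based fallback remains afterwards)
match = re.search(r'answer is\s+(.*?)\.', "Let's think step by step. So the answer is 42. Done.")
print(gold, match.group(1) if match else None)   # -> A 42
```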
{evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/benchmark.py

@@ -17,12 +17,13 @@ class BenchmarkMeta:
     data_adapter: 'DataAdapter'
     model_adapter: BaseModelAdapter
     subset_list: List[str] = field(default_factory=list)
-    metric_list: List[
+    metric_list: List[str] = field(default_factory=list)
     few_shot_num: int = 0
     few_shot_random: bool = False
     train_split: Optional[str] = None
     eval_split: Optional[str] = None
     prompt_template: Optional[str] = None
+    system_prompt: Optional[str] = None
 
     def _update(self, args: dict):
         if args.get('local_path'):
@@ -40,7 +41,6 @@ class BenchmarkMeta:
         # cur_dict['metric_list'] = [metric['name'] for metric in self.metric_list]
         del cur_dict['data_adapter']
         del cur_dict['model_adapter']
-        del cur_dict['metric_list']
         return cur_dict
 
     def get_data_adapter(self, config: dict = {}) -> 'DataAdapter':
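Note: after this change `BenchmarkMeta` stores metric names as plain strings (matching the `metric_list=['AverageAccuracy']` registrations above) and gains an optional `system_prompt` that adapters read via `self.system_prompt`. A stripped-down sketch of the resulting shape, with the untouched fields omitted:

```python
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class BenchmarkMetaSketch:
    # Simplified stand-in for BenchmarkMeta; only diff-relevant fields are shown.
    subset_list: List[str] = field(default_factory=list)
    metric_list: List[str] = field(default_factory=list)  # metric names, not metric objects
    prompt_template: Optional[str] = None
    system_prompt: Optional[str] = None                    # new optional field in 0.11.0

meta = BenchmarkMetaSketch(metric_list=['AverageAccuracy'])
print(meta.metric_list, meta.system_prompt)  # ['AverageAccuracy'] None
```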
{evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/ceval/ceval_adapter.py

@@ -4,10 +4,9 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
-from evalscope.metrics.metrics import exact_match, weighted_mean
+from evalscope.metrics.metrics import exact_match
 from evalscope.models import MultiChoiceModelAdapter
-from evalscope.utils import ResponseParser
+from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -130,10 +129,11 @@ SUBJECT_MAPPING = {
     dataset_id='modelscope/ceval-exam',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split='dev',
     eval_split='val',
+    prompt_template='以下是中国关于{subset_name}考试的单项选择题,请选出其中的正确答案。\n{query}',
 )
 class CEVALAdapter(DataAdapter):
 
@@ -202,12 +202,12 @@ class CEVALAdapter(DataAdapter):
         else:
             context = ''
 
-
+        query: str = context.strip() + self._format_example(input_d=input_d, include_answer=False)
 
         subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
-        full_prompt =
+        full_prompt = self.prompt_template.format(subset_name=subject_name, query=query)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -228,9 +228,9 @@ class CEVALAdapter(DataAdapter):
         if eval_type == EvalType.CHECKPOINT:
             return result
         elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
 
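Note: the Chinese instruction that used to be built inline is now a registered `prompt_template` with `{subset_name}` and `{query}` placeholders, filled in `gen_prompt`. Formatting it directly shows the resulting prompt; the subject and question below are made-up examples, only the template string comes from the diff:

```python
prompt_template = '以下是中国关于{subset_name}考试的单项选择题,请选出其中的正确答案。\n{query}'

full_prompt = prompt_template.format(
    subset_name='计算机网络',  # hypothetical subject name
    query='下列哪个协议工作在传输层?\nA. IP\nB. TCP\nC. ARP\nD. PPP\n答案:'
)
print(full_prompt)
```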
{evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py

@@ -5,9 +5,9 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models import MultiChoiceModelAdapter
-from evalscope.utils import ResponseParser
+from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -106,10 +106,11 @@ SUBJECT_MAPPING = {
     dataset_id='modelscope/cmmlu',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=5,
     train_split='dev',
     eval_split='test',
+    prompt_template='以下是关于{subset_name}的单项选择题,请直接给出正确答案的选项。\n{query}',
 )
 class CMMLUAdapter(DataAdapter):
 
@@ -165,16 +166,13 @@ class CMMLUAdapter(DataAdapter):
             {'data': [(context, continuation), ...]}
 
         """
-        prompt = '以下是关于{}的单项选择题。\n\n'.format(self._format_subject(subset_name))
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-
-        context: str = '\n'.join(few_shot_prompts) + '\n'
+        context = '\n'.join(few_shot_prompts) + '\n'
         context += self._generate_prompt(input_d=input_d, include_answer=False)
-        context = prompt + context
 
-        full_prompt
+        full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=context.strip())
 
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt':
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -195,9 +193,9 @@ class CMMLUAdapter(DataAdapter):
         if eval_type == EvalType.CHECKPOINT:
             return result
         elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
 
{evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/competition_math/competition_math_adapter.py

@@ -3,10 +3,11 @@
 import glob
 import json
 import os
+from collections import defaultdict
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.
-from evalscope.metrics.
+from evalscope.constants import AnswerKeys
+from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger
 
@@ -19,12 +20,12 @@ logger = get_logger()
     name='competition_math',
     dataset_id='modelscope/competition_math',
     model_adapter=ChatGenerationModelAdapter,
-    subset_list=['
-    metric_list=[
+    subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    metric_list=['AveragePass@1'],
     few_shot_num=4,
    train_split='train',
     eval_split='test',
-    prompt_template='
+    prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
 )
 class CompetitionMathAdapter(DataAdapter):
     """ To be tested for all models. """
@@ -39,8 +40,13 @@ class CompetitionMathAdapter(DataAdapter):
 
         super().__init__(**kwargs)
 
+    def load(self, **kwargs):
+        # default load all levels
+        kwargs['subset_list'] = ['default']
+        return super().load(**kwargs)
+
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        data_dict
+        data_dict = defaultdict(dict)
         for subset_name in subset_list:
             for split_name in [self.train_split, self.eval_split]:
                 if os.path.exists(dataset_name_or_path):
@@ -53,13 +59,25 @@ class CompetitionMathAdapter(DataAdapter):
                     if os.path.exists(file_path):
                         with open(file_path, 'r') as f:
                             split_data.append(json.load(f))
-
-                    data_dict[subset_name].update({split_name: split_data})
-                else:
-                    data_dict[subset_name] = {split_name: split_data}
+                data_dict[subset_name][split_name] = split_data
 
         return data_dict
 
+    def gen_prompts(self, data_dict: dict) -> dict:
+        res_dict: dict = defaultdict(list)
+
+        # use level as subset
+        for sub_name, sub_data_dict in data_dict.items():
+            for sample_d in sub_data_dict[self.eval_split]:
+                level = sample_d['level']
+                if level not in self.subset_list:
+                    continue
+                prompt_d = self.gen_prompt(input_d=sample_d, few_shot_list=None)
+                prompt_d[AnswerKeys.RAW_INPUT] = sample_d
+                res_dict[level].append(prompt_d)
+
+        return res_dict
+
     def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
         """
         Generate the prompt for the model input.
@@ -75,13 +93,13 @@ class CompetitionMathAdapter(DataAdapter):
             {'data': [prompt]}
         """
         use_fewshot = self.few_shot_num > 0
-
-
-        return {'data': [full_prompt], 'system_prompt': self.
+        query = self._generate_prompt(input_d, use_fewshot=use_fewshot)
+        full_prompt = self.prompt_template.format(query=query)
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
-        return
+        return strip_answer_string(extract_answer(input_d['solution']))
 
     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
         """
@@ -96,18 +114,11 @@ class CompetitionMathAdapter(DataAdapter):
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
         # Note: Use same extraction method for both of checkpoint/service/custom
-
-            result = remove_boxed(last_boxed_only_string(result))
-        except Exception:
-            return None
+        result = strip_answer_string(extract_answer(result))
         return result
 
     def match(self, gold: str, pred: str) -> float:
-
-        if is_equiv(pred, gold):
-            res = 1
-
-        return res
+        return math_equal(pred, gold)
 
     @classmethod
     def _generate_prompt(cls, input_d: dict, use_fewshot: bool = True) -> str: