evalscope 0.10.1__tar.gz → 0.11.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- {evalscope-0.10.1/evalscope.egg-info → evalscope-0.11.0}/PKG-INFO +14 -5
- {evalscope-0.10.1 → evalscope-0.11.0}/README.md +1 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/arguments.py +1 -0
- evalscope-0.11.0/evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/arc/arc_adapter.py +5 -7
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/benchmark.py +2 -2
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/data_adapter.py +18 -12
- evalscope-0.11.0/evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
- evalscope-0.11.0/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/gpqa/gpqa_adapter.py +26 -8
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -13
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
- evalscope-0.11.0/evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope-0.11.0/evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
- evalscope-0.11.0/evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/race/race_adapter.py +3 -3
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/collections/evaluator.py +103 -39
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/collections/sampler.py +2 -1
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/collections/schema.py +1 -2
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/config.py +1 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/evaluator/evaluator.py +78 -64
- evalscope-0.11.0/evalscope/metrics/math_parser.py +526 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/metrics/metrics.py +16 -1
- evalscope-0.11.0/evalscope/metrics/named_metrics.py +41 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/models/chat_adapter.py +69 -49
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/models/choice_adapter.py +52 -45
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/models/custom_adapter.py +2 -2
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/models/local_model.py +4 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/models/server_adapter.py +28 -34
- evalscope-0.11.0/evalscope/perf/utils/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/report/app.py +30 -15
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/run.py +10 -7
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/utils/chat_service.py +2 -2
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/utils/io_utils.py +1 -1
- evalscope-0.11.0/evalscope/version.py +4 -0
- {evalscope-0.10.1 → evalscope-0.11.0/evalscope.egg-info}/PKG-INFO +14 -5
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope.egg-info/SOURCES.txt +9 -2
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope.egg-info/requires.txt +12 -4
- {evalscope-0.10.1 → evalscope-0.11.0}/requirements/app.txt +1 -1
- {evalscope-0.10.1 → evalscope-0.11.0}/requirements/framework.txt +6 -2
- {evalscope-0.10.1 → evalscope-0.11.0}/tests/cli/test_run.py +93 -16
- evalscope-0.11.0/tests/rag/__init__.py +0 -0
- evalscope-0.10.1/evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope-0.10.1/evalscope/metrics/math_accuracy.py +0 -200
- evalscope-0.10.1/evalscope/metrics/named_metrics.py +0 -17
- evalscope-0.10.1/evalscope/version.py +0 -4
- {evalscope-0.10.1 → evalscope-0.11.0}/LICENSE +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/MANIFEST.in +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/base.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/opencompass/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/opencompass/api_meta_template.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/opencompass/backend_manager.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/backend_manager.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/utils/clip.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/utils/llm.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/utils/tools.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/__init__.py +0 -0
- {evalscope-0.10.1/evalscope/benchmarks/gpqa → evalscope-0.11.0/evalscope/benchmarks/aime24}/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/arc/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/ceval/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/competition_math/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
- {evalscope-0.10.1/evalscope/benchmarks/ifeval → evalscope-0.11.0/evalscope/benchmarks/data_collection}/__init__.py +0 -0
- {evalscope-0.10.1/evalscope/benchmarks/iquiz → evalscope-0.11.0/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/general_qa/__init__.py +0 -0
- {evalscope-0.10.1/evalscope/benchmarks/mmlu_pro → evalscope-0.11.0/evalscope/benchmarks/gpqa}/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/humaneval/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
- {evalscope-0.10.1/evalscope/perf/utils → evalscope-0.11.0/evalscope/benchmarks/ifeval}/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/ifeval/instructions.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/ifeval/utils.py +0 -0
- {evalscope-0.10.1/tests/rag → evalscope-0.11.0/evalscope/benchmarks/iquiz}/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/mmlu/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/race/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/race/race.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/race/samples.jsonl +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/cli/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/cli/base.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/cli/cli.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/cli/start_app.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/cli/start_eval.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/cli/start_perf.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/cli/start_server.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/collections/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/constants.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/evaluator/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/evaluator/rating_eval.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/evaluator/reviewer/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/metrics/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/metrics/code_metric.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/metrics/rouge_metric.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/models/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/models/base_adapter.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/models/custom/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/models/custom/custom_model.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/models/custom/dummy_model.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/models/model.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/arguments.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/benchmark.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/http_client.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/main.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/api/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/api/base.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/api/custom_api.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/api/openai_api.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/base.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/custom.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/openqa.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/registry.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/utils/analysis_result.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/utils/benchmark_util.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/utils/db_util.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/utils/handler.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/utils/local_server.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/config/cfg_arena.yaml +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/config/cfg_single.yaml +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/data/question.jsonl +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/tasks/arc.yaml +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/tasks/bbh.yaml +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/tasks/ceval.yaml +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/tasks/cmmlu.yaml +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/tasks/general_qa.yaml +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/tasks/gsm8k.yaml +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/tasks/mmlu.yaml +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/report/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/report/combinator.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/report/generator.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/report/utils.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/run_arena.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/summarizer.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/README.md +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/default_task.json +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/eval.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/infer.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/utils.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/README.md +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/config_default.json +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/eval.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/infer.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/utils/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/utils/arena_utils.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/utils/completion_parsers.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/utils/logger.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/utils/model_utils.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/utils/utils.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope.egg-info/dependency_links.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope.egg-info/entry_points.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope.egg-info/not-zip-safe +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/evalscope.egg-info/top_level.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/requirements/docs.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/requirements/inner.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/requirements/opencompass.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/requirements/perf.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/requirements/rag.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/requirements/tests.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/requirements/vlmeval.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/requirements.txt +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/setup.cfg +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/setup.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/tests/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/tests/cli/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/tests/cli/test_collection.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/tests/perf/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/tests/perf/test_perf.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/tests/rag/test_clip_benchmark.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/tests/rag/test_mteb.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/tests/rag/test_ragas.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/tests/swift/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/tests/swift/test_run_swift_eval.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/tests/swift/test_run_swift_vlm_eval.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/tests/test_run_all.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/tests/vlm/__init__.py +0 -0
- {evalscope-0.10.1 → evalscope-0.11.0}/tests/vlm/test_vlmeval.py +0 -0

{evalscope-0.10.1/evalscope.egg-info → evalscope-0.11.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.10.1
+Version: 0.11.0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -19,10 +19,12 @@ License-File: LICENSE
 Requires-Dist: absl-py
 Requires-Dist: accelerate
 Requires-Dist: cachetools
-Requires-Dist: datasets<=3.0
+Requires-Dist: datasets<=3.2.0,>=3.0.0
 Requires-Dist: editdistance
 Requires-Dist: jieba
 Requires-Dist: jsonlines
+Requires-Dist: langdetect
+Requires-Dist: latex2sympy2
 Requires-Dist: matplotlib
 Requires-Dist: modelscope[framework]
 Requires-Dist: nltk>=3.9
@@ -42,12 +44,14 @@ Requires-Dist: scikit-learn
 Requires-Dist: seaborn
 Requires-Dist: sentencepiece
 Requires-Dist: simple-ddl-parser
+Requires-Dist: sympy
 Requires-Dist: tabulate
 Requires-Dist: tiktoken
 Requires-Dist: torch
 Requires-Dist: tqdm
 Requires-Dist: transformers>=4.33
 Requires-Dist: transformers_stream_generator
+Requires-Dist: word2number
 Provides-Extra: opencompass
 Requires-Dist: ms-opencompass>=0.1.4; extra == "opencompass"
 Provides-Extra: vlmeval
@@ -64,7 +68,7 @@ Requires-Dist: sse_starlette; extra == "perf"
 Requires-Dist: transformers; extra == "perf"
 Requires-Dist: unicorn; extra == "perf"
 Provides-Extra: app
-Requires-Dist: gradio
+Requires-Dist: gradio==5.4.0; extra == "app"
 Requires-Dist: plotly>=5.23.0; extra == "app"
 Provides-Extra: inner
 Requires-Dist: absl-py; extra == "inner"
@@ -96,10 +100,12 @@ Provides-Extra: all
 Requires-Dist: absl-py; extra == "all"
 Requires-Dist: accelerate; extra == "all"
 Requires-Dist: cachetools; extra == "all"
-Requires-Dist: datasets<=3.0
+Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
 Requires-Dist: editdistance; extra == "all"
 Requires-Dist: jieba; extra == "all"
 Requires-Dist: jsonlines; extra == "all"
+Requires-Dist: langdetect; extra == "all"
+Requires-Dist: latex2sympy2; extra == "all"
 Requires-Dist: matplotlib; extra == "all"
 Requires-Dist: modelscope[framework]; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
@@ -119,12 +125,14 @@ Requires-Dist: scikit-learn; extra == "all"
 Requires-Dist: seaborn; extra == "all"
 Requires-Dist: sentencepiece; extra == "all"
 Requires-Dist: simple-ddl-parser; extra == "all"
+Requires-Dist: sympy; extra == "all"
 Requires-Dist: tabulate; extra == "all"
 Requires-Dist: tiktoken; extra == "all"
 Requires-Dist: torch; extra == "all"
 Requires-Dist: tqdm; extra == "all"
 Requires-Dist: transformers>=4.33; extra == "all"
 Requires-Dist: transformers_stream_generator; extra == "all"
+Requires-Dist: word2number; extra == "all"
 Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
 Requires-Dist: mteb==1.19.4; extra == "all"
@@ -136,7 +144,7 @@ Requires-Dist: numpy; extra == "all"
 Requires-Dist: sse_starlette; extra == "all"
 Requires-Dist: transformers; extra == "all"
 Requires-Dist: unicorn; extra == "all"
-Requires-Dist: gradio
+Requires-Dist: gradio==5.4.0; extra == "all"
 Requires-Dist: plotly>=5.23.0; extra == "all"
 
 <p align="center">
@@ -215,6 +223,7 @@ Please scan the QR code below to join our community groups:
 
 
 ## 🎉 News
+- 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
{evalscope-0.10.1 → evalscope-0.11.0}/README.md

@@ -74,6 +74,7 @@ Please scan the QR code below to join our community groups:
 
 
 ## 🎉 News
+- 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
{evalscope-0.10.1 → evalscope-0.11.0}/evalscope/arguments.py

@@ -58,6 +58,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
                         choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
     parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')
+    parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')
 
     # Cache and working directory arguments
     parser.add_argument('--mem-cache', action='store_true', default=False, help='Deprecated, will be removed in v1.0.0.')  # noqa: E501
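The new `--eval-batch-size` flag (default 1) is what the 2025.02.13 news entry refers to: inference requests can now be batched during evaluation. A minimal sketch of the same option from the Python side, assuming the flag's counterpart field `eval_batch_size` is exposed on `TaskConfig` (the model id and dataset below are placeholders, not taken from this diff):

```python
# Sketch only: batched evaluation through the Python entry points shipped in
# this release (evalscope/config.py and evalscope/run.py).
from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model id
    datasets=['gsm8k'],                  # any registered benchmark name
    limit=16,                            # mirrors --limit: cap samples per subset
    eval_batch_size=8,                   # assumed TaskConfig counterpart of --eval-batch-size
)

run_task(task_cfg)
```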
evalscope-0.11.0/evalscope/benchmarks/aime24/aime24_adapter.py (new file)

@@ -0,0 +1,49 @@
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='aime24',
+    dataset_id='HuggingFaceH4/aime_2024',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=['AveragePass@1'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='train',  # Only train set is available
+    prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+)
+class AIME24Adapter(DataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate the prompt for the model input.
+        """
+        problem = input_d['problem']
+        full_prompt = self.prompt_template.format(query=problem)
+
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        # Extract the gold answer from the input dict.
+        return strip_answer_string(input_d['answer'])
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        """
+        Parse the model output to get the answer. Could be the best choice index.
+        """
+        # Note: Use same extraction method for both of checkpoint/service/custom
+        result = strip_answer_string(extract_answer(result))
+        return result
+
+    def match(self, gold: str, pred: str) -> float:
+        return math_equal(pred, gold)
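With the registration above, the benchmark becomes selectable by its `name`. A usage sketch (the model id and sample limit are illustrative, not part of this release):

```python
# Sketch only: running the newly added AIME24 benchmark end to end.
from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',  # placeholder model id
    datasets=['aime24'],  # resolved to AIME24Adapter via @Benchmark.register above
    limit=5,              # small smoke-test run
)

run_task(task_cfg)
```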
{evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/arc/arc_adapter.py

@@ -5,7 +5,7 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
@@ -20,7 +20,7 @@ logger = get_logger()
     dataset_id='modelscope/ai2_arc',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=['ARC-Easy', 'ARC-Challenge'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split='train',
     eval_split='test',
@@ -112,7 +112,7 @@ class ARCAdapter(DataAdapter):
         # context = f'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n {context}'
         full_prompt: str = context + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -133,11 +133,9 @@ class ARCAdapter(DataAdapter):
         if eval_type == EvalType.CHECKPOINT:
             return result
         elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(
-                text=result, options=self.choices)  # TODO: to be checked !
+            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(
-                text=result, options=self.choices)  # TODO: to be checked !
+            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
 
{evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/bbh_adapter.py

@@ -7,7 +7,7 @@ import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import AnswerKeys
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models.chat_adapter import ChatGenerationModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
@@ -63,11 +63,11 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
     dataset_id='modelscope/bbh',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=3,
     train_split=None,
     eval_split='test',
-    prompt_template='
+    prompt_template="Q: {query}\nA: Let's think step by step.",
 )
 class BBHAdapter(DataAdapter):
     """
@@ -119,10 +119,13 @@ class BBHAdapter(DataAdapter):
             {'data': ['xxx']}
         """
         # few_shot_list: should be ['xxxx']
-
-
+        if len(few_shot_list) > 0:
+            cot_prompts = 'Follow the given examples and answer the question.\n' + few_shot_list[0]
+        else:
+            cot_prompts = ''
+        full_prompt = cot_prompts + self.prompt_template.format(query=input_d['input'])
 
-        return {'data': [full_prompt], 'system_prompt': self.
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
 
     def gen_prompts(self, data_dict: dict) -> dict:
         """
@@ -177,9 +180,11 @@ class BBHAdapter(DataAdapter):
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
-        gold = input_d.get('target')
+        gold = input_d.get('target', '')
+        # remove brackets
         if gold is None:
             logger.error(f'BBHAdapter: gold is None.')
+        gold = gold.replace('(', '').replace(')', '')
         return gold
 
     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
@@ -228,8 +233,11 @@ class BBHAdapter(DataAdapter):
         """
         Extract the answer from the model output for Free-form task.
         """
-
-
+        pattern = r'answer is\s+(.*?)\.'
+
+        match = re.search(pattern, ans)
+        if match:
+            res = match.group(1)
             return res
 
         ans_line = ans.split('answer is ')
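The free-form extraction added above is a plain regular expression over the model's chain-of-thought text; a standalone illustration of the pattern (the sample completion is made up):

```python
import re

# Same pattern as in the hunk above: capture the text between "answer is"
# and the next period of a free-form completion.
pattern = r'answer is\s+(.*?)\.'

ans = "Let's think step by step. 3 + 4 = 7, so the answer is 7. Done."
match = re.search(pattern, ans)
print(match.group(1) if match else None)  # prints: 7
```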
{evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/benchmark.py

@@ -17,12 +17,13 @@ class BenchmarkMeta:
     data_adapter: 'DataAdapter'
     model_adapter: BaseModelAdapter
     subset_list: List[str] = field(default_factory=list)
-    metric_list: List[
+    metric_list: List[str] = field(default_factory=list)
     few_shot_num: int = 0
     few_shot_random: bool = False
     train_split: Optional[str] = None
     eval_split: Optional[str] = None
     prompt_template: Optional[str] = None
+    system_prompt: Optional[str] = None
 
     def _update(self, args: dict):
         if args.get('local_path'):
@@ -40,7 +41,6 @@ class BenchmarkMeta:
         # cur_dict['metric_list'] = [metric['name'] for metric in self.metric_list]
         del cur_dict['data_adapter']
         del cur_dict['model_adapter']
-        del cur_dict['metric_list']
         return cur_dict
 
     def get_data_adapter(self, config: dict = {}) -> 'DataAdapter':
{evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/ceval/ceval_adapter.py

@@ -4,10 +4,9 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
-from evalscope.metrics.metrics import exact_match, weighted_mean
+from evalscope.metrics.metrics import exact_match
 from evalscope.models import MultiChoiceModelAdapter
-from evalscope.utils import ResponseParser
+from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -130,10 +129,11 @@ SUBJECT_MAPPING = {
     dataset_id='modelscope/ceval-exam',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split='dev',
     eval_split='val',
+    prompt_template='以下是中国关于{subset_name}考试的单项选择题,请选出其中的正确答案。\n{query}',
 )
 class CEVALAdapter(DataAdapter):
 
@@ -202,12 +202,12 @@ class CEVALAdapter(DataAdapter):
         else:
             context = ''
 
-
+        query: str = context.strip() + self._format_example(input_d=input_d, include_answer=False)
 
         subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
-        full_prompt =
+        full_prompt = self.prompt_template.format(subset_name=subject_name, query=query)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -228,9 +228,9 @@ class CEVALAdapter(DataAdapter):
         if eval_type == EvalType.CHECKPOINT:
             return result
         elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
 
{evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py

@@ -5,9 +5,9 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models import MultiChoiceModelAdapter
-from evalscope.utils import ResponseParser
+from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -106,10 +106,11 @@ SUBJECT_MAPPING = {
     dataset_id='modelscope/cmmlu',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=5,
     train_split='dev',
     eval_split='test',
+    prompt_template='以下是关于{subset_name}的单项选择题,请直接给出正确答案的选项。\n{query}',
 )
 class CMMLUAdapter(DataAdapter):
 
@@ -165,16 +166,13 @@ class CMMLUAdapter(DataAdapter):
             {'data': [(context, continuation), ...]}
 
         """
-        prompt = '以下是关于{}的单项选择题。\n\n'.format(self._format_subject(subset_name))
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-
-        context: str = '\n'.join(few_shot_prompts) + '\n'
+        context = '\n'.join(few_shot_prompts) + '\n'
         context += self._generate_prompt(input_d=input_d, include_answer=False)
-        context = prompt + context
 
-        full_prompt
+        full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=context.strip())
 
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt':
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -195,9 +193,9 @@ class CMMLUAdapter(DataAdapter):
         if eval_type == EvalType.CHECKPOINT:
             return result
         elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
 
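Both the C-Eval and CMMLU adapters now assemble the final question from a registered `prompt_template` instead of a hard-coded prefix; a quick illustration of that formatting step (the subject and query values are made up):

```python
# Stand-in values; only the str.format step used by the adapters is shown.
prompt_template = '以下是关于{subset_name}的单项选择题,请直接给出正确答案的选项。\n{query}'

query = '1 + 1 等于几?\nA. 1\nB. 2\nC. 3\nD. 4\n答案:'
print(prompt_template.format(subset_name='小学数学', query=query))
```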
{evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/competition_math/competition_math_adapter.py

@@ -3,10 +3,11 @@
 import glob
 import json
 import os
+from collections import defaultdict
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.
-from evalscope.metrics.
+from evalscope.constants import AnswerKeys
+from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger
 
@@ -19,12 +20,12 @@ logger = get_logger()
     name='competition_math',
     dataset_id='modelscope/competition_math',
     model_adapter=ChatGenerationModelAdapter,
-    subset_list=['
-    metric_list=[
+    subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    metric_list=['AveragePass@1'],
     few_shot_num=4,
     train_split='train',
     eval_split='test',
-    prompt_template='
+    prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
 )
 class CompetitionMathAdapter(DataAdapter):
     """ To be tested for all models. """
@@ -39,8 +40,13 @@ class CompetitionMathAdapter(DataAdapter):
 
         super().__init__(**kwargs)
 
+    def load(self, **kwargs):
+        # default load all levels
+        kwargs['subset_list'] = ['default']
+        return super().load(**kwargs)
+
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        data_dict
+        data_dict = defaultdict(dict)
         for subset_name in subset_list:
             for split_name in [self.train_split, self.eval_split]:
                 if os.path.exists(dataset_name_or_path):
@@ -53,13 +59,25 @@ class CompetitionMathAdapter(DataAdapter):
                     if os.path.exists(file_path):
                         with open(file_path, 'r') as f:
                             split_data.append(json.load(f))
-
-                    data_dict[subset_name].update({split_name: split_data})
-                else:
-                    data_dict[subset_name] = {split_name: split_data}
+                data_dict[subset_name][split_name] = split_data
 
         return data_dict
 
+    def gen_prompts(self, data_dict: dict) -> dict:
+        res_dict: dict = defaultdict(list)
+
+        # use level as subset
+        for sub_name, sub_data_dict in data_dict.items():
+            for sample_d in sub_data_dict[self.eval_split]:
+                level = sample_d['level']
+                if level not in self.subset_list:
+                    continue
+                prompt_d = self.gen_prompt(input_d=sample_d, few_shot_list=None)
+                prompt_d[AnswerKeys.RAW_INPUT] = sample_d
+                res_dict[level].append(prompt_d)
+
+        return res_dict
+
     def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
         """
         Generate the prompt for the model input.
@@ -75,13 +93,13 @@ class CompetitionMathAdapter(DataAdapter):
             {'data': [prompt]}
         """
         use_fewshot = self.few_shot_num > 0
-
-
-        return {'data': [full_prompt], 'system_prompt': self.
+        query = self._generate_prompt(input_d, use_fewshot=use_fewshot)
+        full_prompt = self.prompt_template.format(query=query)
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
-        return
+        return strip_answer_string(extract_answer(input_d['solution']))
 
     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
         """
@@ -96,18 +114,11 @@ class CompetitionMathAdapter(DataAdapter):
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
         # Note: Use same extraction method for both of checkpoint/service/custom
-
-            result = remove_boxed(last_boxed_only_string(result))
-        except Exception:
-            return None
+        result = strip_answer_string(extract_answer(result))
         return result
 
     def match(self, gold: str, pred: str) -> float:
-
-        if is_equiv(pred, gold):
-            res = 1
-
-        return res
+        return math_equal(pred, gold)
 
     @classmethod
     def _generate_prompt(cls, input_d: dict, use_fewshot: bool = True) -> str:
{evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/data_adapter.py

@@ -2,10 +2,10 @@
 import os.path
 import random
 from abc import ABC, abstractmethod
-from typing import Any, List, Optional
+from typing import Any, List, Optional, Union
 
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
-from evalscope.metrics import
+from evalscope.metrics.named_metrics import metric_registry
 from evalscope.report import Report, ReportGenerator
 from evalscope.utils.logger import get_logger
 
@@ -16,12 +16,14 @@ class DataAdapter(ABC):
 
     def __init__(self,
                  name: str,
+                 dataset_id: str,
                  subset_list: list,
-                 metric_list: List[
+                 metric_list: List[str],
                  few_shot_num: Optional[int] = 0,
                  train_split: Optional[str] = None,
                  eval_split: Optional[str] = None,
                  prompt_template: Optional[str] = None,
+                 system_prompt: Optional[str] = None,
                  **kwargs):
         """
         Data Adapter for the benchmark. You need to implement the following methods:
@@ -31,6 +33,7 @@ class DataAdapter(ABC):
             - match
         Args:
             name: str, the name of the benchmark.
+            dataset_id: str, the dataset id on ModelScope or local path for the benchmark.
             subset_list: list of subset names for the dataset.
             metric_list: list, the metric list to evaluate the model on specific benchmark.
             few_shot_num: int, number of few-shot examples. Default: 0
@@ -41,17 +44,19 @@ class DataAdapter(ABC):
                 the form of A or B or C or D, do not output explanation:`
         """
         self.name = name
+        self.dataset_id = dataset_id
         self.subset_list = subset_list
         self.metric_list = metric_list
         self.few_shot_num = few_shot_num
         self.train_split = train_split
         self.eval_split = eval_split
         self.prompt_template = prompt_template
+        self.system_prompt = system_prompt
         self.config_kwargs = kwargs
         self.category_map = kwargs.get('category_map', {})
 
     def load(self,
-             dataset_name_or_path: str,
+             dataset_name_or_path: str = None,
              subset_list: list = None,
             work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
             datasets_hub: str = HubType.MODELSCOPE,
@@ -64,7 +69,7 @@ class DataAdapter(ABC):
             train_dataset, test_dataset: Iterable dataset, object each item of which is a dict.
 
         """
-        dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
+        dataset_name_or_path = os.path.expanduser(dataset_name_or_path or self.dataset_id)
         subset_list = subset_list or self.subset_list
 
         # Try to load dataset from local disk
@@ -156,7 +161,7 @@ class DataAdapter(ABC):
         else:
             return data_list[:k]
 
-    def compute_metric(self, review_res_list: list) -> List[dict]:
+    def compute_metric(self, review_res_list: Union[dict, list]) -> List[dict]:
         """
         Compute evaluation result by specific metrics.
 
@@ -170,14 +175,15 @@ class DataAdapter(ABC):
             raise ValueError('No metric list found for the benchmark.')
 
         res_list = []
-        for
+        for metric_str in self.metric_list:
+            metric = metric_registry.get(metric_str)
             metric_name = metric.name
             metric_func = metric.object
-
-
-
-
-            })
+            if isinstance(review_res_list, dict):
+                review_res = review_res_list.get(metric_name, [])
+            else:
+                review_res = review_res_list
+            res_list.append({'metric_name': metric_name, 'score': metric_func(review_res), 'num': len(review_res)})
        return res_list
 
     def gen_report(self, subset_score_map: dict, report_name: str = None, **kwargs) -> Report:
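`compute_metric` now resolves metric names through the string-keyed registry introduced in the new `evalscope/metrics/named_metrics.py`. A self-contained sketch of that lookup pattern, using stand-in registry contents rather than the real module:

```python
# Illustration only: mirrors the metric_str -> (name, object) lookup and the
# dict-or-list handling of review_res_list shown in the hunk above.
from collections import namedtuple

Metric = namedtuple('Metric', ['name', 'object'])

metric_registry = {
    'AverageAccuracy': Metric('AverageAccuracy', lambda xs: sum(xs) / len(xs)),
    'AveragePass@1': Metric('AveragePass@1', lambda xs: sum(xs) / len(xs)),
}

def compute_metric(metric_list, review_res_list):
    res_list = []
    for metric_str in metric_list:
        metric = metric_registry[metric_str]
        review_res = (review_res_list.get(metric.name, [])
                      if isinstance(review_res_list, dict) else review_res_list)
        res_list.append({'metric_name': metric.name,
                         'score': metric.object(review_res),
                         'num': len(review_res)})
    return res_list

print(compute_metric(['AveragePass@1'], [1.0, 0.0, 1.0]))
```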