evalscope 0.10.1__tar.gz → 0.12.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (335)
  1. {evalscope-0.10.1/evalscope.egg-info → evalscope-0.12.0}/PKG-INFO +22 -8
  2. {evalscope-0.10.1 → evalscope-0.12.0}/README.md +5 -1
  3. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/arguments.py +3 -0
  4. evalscope-0.12.0/evalscope/benchmarks/aime/aime24_adapter.py +49 -0
  5. evalscope-0.12.0/evalscope/benchmarks/aime/aime25_adapter.py +49 -0
  6. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/arc/arc_adapter.py +5 -7
  7. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/bbh_adapter.py +17 -14
  8. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/benchmark.py +5 -3
  9. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
  10. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
  11. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/competition_math/competition_math_adapter.py +21 -24
  12. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/data_adapter.py +88 -29
  13. evalscope-0.12.0/evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
  14. evalscope-0.12.0/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +125 -0
  15. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -11
  16. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/gpqa/gpqa_adapter.py +27 -9
  17. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +9 -14
  18. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
  19. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
  20. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/ifeval/ifeval_adapter.py +15 -14
  21. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
  22. evalscope-0.12.0/evalscope/benchmarks/math_500/__init__.py +0 -0
  23. evalscope-0.12.0/evalscope/benchmarks/math_500/math_500_adapter.py +58 -0
  24. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
  25. evalscope-0.12.0/evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  26. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +32 -36
  27. evalscope-0.12.0/evalscope/benchmarks/musr/__init__.py +0 -0
  28. evalscope-0.12.0/evalscope/benchmarks/musr/musr_adapter.py +68 -0
  29. evalscope-0.12.0/evalscope/benchmarks/process_bench/__init__.py +0 -0
  30. evalscope-0.12.0/evalscope/benchmarks/process_bench/critique_template.txt +13 -0
  31. evalscope-0.12.0/evalscope/benchmarks/process_bench/process_bench_adapter.py +96 -0
  32. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/race/race_adapter.py +3 -3
  33. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
  34. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +9 -9
  35. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/cli/start_app.py +4 -1
  36. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/cli/start_eval.py +4 -3
  37. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/cli/start_perf.py +4 -2
  38. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/collections/evaluator.py +109 -39
  39. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/collections/sampler.py +2 -1
  40. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/collections/schema.py +1 -2
  41. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/config.py +4 -1
  42. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/evaluator/evaluator.py +81 -65
  43. evalscope-0.12.0/evalscope/metrics/__init__.py +5 -0
  44. evalscope-0.12.0/evalscope/metrics/math_parser.py +526 -0
  45. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/metrics/metrics.py +39 -3
  46. evalscope-0.12.0/evalscope/metrics/named_metrics.py +41 -0
  47. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/models/base_adapter.py +7 -1
  48. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/models/chat_adapter.py +69 -49
  49. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/models/choice_adapter.py +52 -45
  50. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/models/custom_adapter.py +2 -2
  51. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/models/local_model.py +7 -2
  52. evalscope-0.12.0/evalscope/models/server_adapter.py +156 -0
  53. evalscope-0.12.0/evalscope/perf/__init__.py +0 -0
  54. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/arguments.py +5 -1
  55. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/http_client.py +2 -2
  56. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/plugin/api/openai_api.py +11 -1
  57. evalscope-0.12.0/evalscope/perf/utils/__init__.py +0 -0
  58. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/utils/benchmark_util.py +6 -2
  59. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/report/app.py +42 -23
  60. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/run.py +11 -8
  61. evalscope-0.12.0/evalscope/third_party/thinkbench/__init__.py +3 -0
  62. evalscope-0.12.0/evalscope/third_party/thinkbench/eval.py +264 -0
  63. evalscope-0.12.0/evalscope/third_party/thinkbench/infer.py +100 -0
  64. evalscope-0.12.0/evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  65. evalscope-0.12.0/evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  66. evalscope-0.12.0/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  67. evalscope-0.12.0/evalscope/third_party/thinkbench/tools/llm.py +47 -0
  68. evalscope-0.12.0/evalscope/third_party/thinkbench/tools/utils.py +13 -0
  69. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/utils/chat_service.py +2 -2
  70. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/utils/io_utils.py +1 -1
  71. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/utils/model_utils.py +17 -1
  72. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/utils/utils.py +45 -45
  73. evalscope-0.12.0/evalscope/version.py +4 -0
  74. {evalscope-0.10.1 → evalscope-0.12.0/evalscope.egg-info}/PKG-INFO +22 -8
  75. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope.egg-info/SOURCES.txt +23 -2
  76. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope.egg-info/requires.txt +16 -6
  77. evalscope-0.12.0/requirements/app.txt +2 -0
  78. {evalscope-0.10.1 → evalscope-0.12.0}/requirements/framework.txt +7 -2
  79. {evalscope-0.10.1 → evalscope-0.12.0}/tests/cli/test_run.py +108 -19
  80. evalscope-0.12.0/tests/rag/__init__.py +0 -0
  81. evalscope-0.10.1/evalscope/benchmarks/ceval/samples.jsonl +0 -1
  82. evalscope-0.10.1/evalscope/metrics/__init__.py +0 -4
  83. evalscope-0.10.1/evalscope/metrics/math_accuracy.py +0 -200
  84. evalscope-0.10.1/evalscope/metrics/named_metrics.py +0 -17
  85. evalscope-0.10.1/evalscope/models/server_adapter.py +0 -111
  86. evalscope-0.10.1/evalscope/perf/__init__.py +0 -1
  87. evalscope-0.10.1/evalscope/version.py +0 -4
  88. evalscope-0.10.1/requirements/app.txt +0 -2
  89. {evalscope-0.10.1 → evalscope-0.12.0}/LICENSE +0 -0
  90. {evalscope-0.10.1 → evalscope-0.12.0}/MANIFEST.in +0 -0
  91. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/__init__.py +0 -0
  92. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/__init__.py +0 -0
  93. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/base.py +0 -0
  94. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/opencompass/__init__.py +0 -0
  95. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  96. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/opencompass/backend_manager.py +0 -0
  97. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  98. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  99. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
  100. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/__init__.py +0 -0
  101. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/backend_manager.py +0 -0
  102. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
  103. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
  104. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
  105. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
  106. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  107. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
  108. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
  109. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
  110. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
  111. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
  112. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  113. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
  114. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
  115. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
  116. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  117. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
  118. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
  119. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  120. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
  121. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  122. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  123. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
  124. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
  125. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
  126. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
  127. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
  128. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
  129. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
  130. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
  131. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
  132. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
  133. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
  134. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/utils/clip.py +0 -0
  135. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
  136. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/utils/llm.py +0 -0
  137. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/rag_eval/utils/tools.py +0 -0
  138. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  139. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
  140. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
  141. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/__init__.py +0 -0
  142. {evalscope-0.10.1/evalscope/benchmarks/gpqa → evalscope-0.12.0/evalscope/benchmarks/aime}/__init__.py +0 -0
  143. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/arc/__init__.py +0 -0
  144. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  145. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/__init__.py +0 -0
  146. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  147. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  148. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  149. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  150. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  151. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  152. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  153. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  154. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  155. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  156. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  157. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  158. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  159. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  160. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  161. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  162. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  163. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  164. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  165. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  166. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  167. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  168. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  169. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  170. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  171. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  172. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  173. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/ceval/__init__.py +0 -0
  174. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  175. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  176. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  177. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
  178. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  179. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  180. {evalscope-0.10.1/evalscope/benchmarks/ifeval → evalscope-0.12.0/evalscope/benchmarks/data_collection}/__init__.py +0 -0
  181. {evalscope-0.10.1/evalscope/benchmarks/iquiz → evalscope-0.12.0/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
  182. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  183. {evalscope-0.10.1/evalscope/benchmarks/mmlu_pro → evalscope-0.12.0/evalscope/benchmarks/gpqa}/__init__.py +0 -0
  184. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
  185. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  186. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  187. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  188. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  189. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  190. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  191. {evalscope-0.10.1/evalscope/perf/utils → evalscope-0.12.0/evalscope/benchmarks/ifeval}/__init__.py +0 -0
  192. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/ifeval/instructions.py +0 -0
  193. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
  194. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
  195. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/ifeval/utils.py +0 -0
  196. {evalscope-0.10.1/tests/rag → evalscope-0.12.0/evalscope/benchmarks/iquiz}/__init__.py +0 -0
  197. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  198. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  199. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
  200. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/race/__init__.py +0 -0
  201. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/race/race.py +0 -0
  202. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/race/samples.jsonl +0 -0
  203. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  204. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
  205. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  206. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  207. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  208. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/cli/__init__.py +0 -0
  209. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/cli/base.py +0 -0
  210. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/cli/cli.py +0 -0
  211. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/cli/start_server.py +0 -0
  212. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/collections/__init__.py +0 -0
  213. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/constants.py +0 -0
  214. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/evaluator/__init__.py +0 -0
  215. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/evaluator/rating_eval.py +0 -0
  216. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/evaluator/reviewer/__init__.py +0 -0
  217. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
  218. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  219. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
  220. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/metrics/code_metric.py +0 -0
  221. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -0
  222. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -0
  223. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/metrics/rouge_metric.py +0 -0
  224. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/models/__init__.py +0 -0
  225. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/models/custom/__init__.py +0 -0
  226. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/models/custom/custom_model.py +0 -0
  227. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/models/custom/dummy_model.py +0 -0
  228. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/models/model.py +0 -0
  229. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/benchmark.py +0 -0
  230. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/main.py +0 -0
  231. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/plugin/__init__.py +0 -0
  232. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/plugin/api/__init__.py +0 -0
  233. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/plugin/api/base.py +0 -0
  234. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/plugin/api/custom_api.py +0 -0
  235. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
  236. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/plugin/datasets/__init__.py +0 -0
  237. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/plugin/datasets/base.py +0 -0
  238. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/plugin/datasets/custom.py +0 -0
  239. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
  240. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
  241. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
  242. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/plugin/datasets/openqa.py +0 -0
  243. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
  244. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/plugin/registry.py +0 -0
  245. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/utils/analysis_result.py +0 -0
  246. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/utils/db_util.py +0 -0
  247. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/utils/handler.py +0 -0
  248. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/perf/utils/local_server.py +0 -0
  249. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/registry/__init__.py +0 -0
  250. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/registry/config/cfg_arena.yaml +0 -0
  251. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
  252. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
  253. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/registry/config/cfg_single.yaml +0 -0
  254. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
  255. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
  256. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
  257. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
  258. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/registry/data/question.jsonl +0 -0
  259. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/registry/tasks/arc.yaml +0 -0
  260. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/registry/tasks/bbh.yaml +0 -0
  261. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
  262. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/registry/tasks/ceval.yaml +0 -0
  263. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
  264. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/registry/tasks/cmmlu.yaml +0 -0
  265. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
  266. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/registry/tasks/general_qa.yaml +0 -0
  267. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/registry/tasks/gsm8k.yaml +0 -0
  268. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/registry/tasks/mmlu.yaml +0 -0
  269. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
  270. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/report/__init__.py +0 -0
  271. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/report/combinator.py +0 -0
  272. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/report/generator.py +0 -0
  273. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/report/utils.py +0 -0
  274. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/run_arena.py +0 -0
  275. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/summarizer.py +0 -0
  276. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/__init__.py +0 -0
  277. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/longbench_write/README.md +0 -0
  278. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/longbench_write/__init__.py +0 -0
  279. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/longbench_write/default_task.json +0 -0
  280. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
  281. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/longbench_write/eval.py +0 -0
  282. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/longbench_write/infer.py +0 -0
  283. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  284. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
  285. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  286. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  287. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  288. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  289. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
  290. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  291. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
  292. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/longbench_write/utils.py +0 -0
  293. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/toolbench_static/README.md +0 -0
  294. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  295. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/toolbench_static/config_default.json +0 -0
  296. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
  297. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/toolbench_static/eval.py +0 -0
  298. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/toolbench_static/infer.py +0 -0
  299. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
  300. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  301. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
  302. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  303. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/utils/__init__.py +0 -0
  304. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/utils/arena_utils.py +0 -0
  305. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/utils/completion_parsers.py +0 -0
  306. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope/utils/logger.py +0 -0
  307. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope.egg-info/dependency_links.txt +0 -0
  308. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope.egg-info/entry_points.txt +0 -0
  309. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope.egg-info/not-zip-safe +0 -0
  310. {evalscope-0.10.1 → evalscope-0.12.0}/evalscope.egg-info/top_level.txt +0 -0
  311. {evalscope-0.10.1 → evalscope-0.12.0}/requirements/docs.txt +0 -0
  312. {evalscope-0.10.1 → evalscope-0.12.0}/requirements/inner.txt +0 -0
  313. {evalscope-0.10.1 → evalscope-0.12.0}/requirements/opencompass.txt +0 -0
  314. {evalscope-0.10.1 → evalscope-0.12.0}/requirements/perf.txt +0 -0
  315. {evalscope-0.10.1 → evalscope-0.12.0}/requirements/rag.txt +0 -0
  316. {evalscope-0.10.1 → evalscope-0.12.0}/requirements/tests.txt +0 -0
  317. {evalscope-0.10.1 → evalscope-0.12.0}/requirements/vlmeval.txt +0 -0
  318. {evalscope-0.10.1 → evalscope-0.12.0}/requirements.txt +0 -0
  319. {evalscope-0.10.1 → evalscope-0.12.0}/setup.cfg +0 -0
  320. {evalscope-0.10.1 → evalscope-0.12.0}/setup.py +0 -0
  321. {evalscope-0.10.1 → evalscope-0.12.0}/tests/__init__.py +0 -0
  322. {evalscope-0.10.1 → evalscope-0.12.0}/tests/cli/__init__.py +0 -0
  323. {evalscope-0.10.1 → evalscope-0.12.0}/tests/cli/test_collection.py +0 -0
  324. {evalscope-0.10.1 → evalscope-0.12.0}/tests/perf/__init__.py +0 -0
  325. {evalscope-0.10.1 → evalscope-0.12.0}/tests/perf/test_perf.py +0 -0
  326. {evalscope-0.10.1 → evalscope-0.12.0}/tests/rag/test_clip_benchmark.py +0 -0
  327. {evalscope-0.10.1 → evalscope-0.12.0}/tests/rag/test_mteb.py +0 -0
  328. {evalscope-0.10.1 → evalscope-0.12.0}/tests/rag/test_ragas.py +0 -0
  329. {evalscope-0.10.1 → evalscope-0.12.0}/tests/swift/__init__.py +0 -0
  330. {evalscope-0.10.1 → evalscope-0.12.0}/tests/swift/test_run_swift_eval.py +0 -0
  331. {evalscope-0.10.1 → evalscope-0.12.0}/tests/swift/test_run_swift_vlm_eval.py +0 -0
  332. {evalscope-0.10.1 → evalscope-0.12.0}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
  333. {evalscope-0.10.1 → evalscope-0.12.0}/tests/test_run_all.py +0 -0
  334. {evalscope-0.10.1 → evalscope-0.12.0}/tests/vlm/__init__.py +0 -0
  335. {evalscope-0.10.1 → evalscope-0.12.0}/tests/vlm/test_vlmeval.py +0 -0
{evalscope-0.10.1/evalscope.egg-info → evalscope-0.12.0}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.10.1
+ Version: 0.12.0
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -19,10 +19,13 @@ License-File: LICENSE
  Requires-Dist: absl-py
  Requires-Dist: accelerate
  Requires-Dist: cachetools
- Requires-Dist: datasets<=3.0.1,>=3.0.0
+ Requires-Dist: datasets<=3.2.0,>=3.0.0
  Requires-Dist: editdistance
+ Requires-Dist: immutabledict
  Requires-Dist: jieba
  Requires-Dist: jsonlines
+ Requires-Dist: langdetect
+ Requires-Dist: latex2sympy2
  Requires-Dist: matplotlib
  Requires-Dist: modelscope[framework]
  Requires-Dist: nltk>=3.9
@@ -42,12 +45,14 @@ Requires-Dist: scikit-learn
  Requires-Dist: seaborn
  Requires-Dist: sentencepiece
  Requires-Dist: simple-ddl-parser
+ Requires-Dist: sympy
  Requires-Dist: tabulate
  Requires-Dist: tiktoken
  Requires-Dist: torch
  Requires-Dist: tqdm
  Requires-Dist: transformers>=4.33
  Requires-Dist: transformers_stream_generator
+ Requires-Dist: word2number
  Provides-Extra: opencompass
  Requires-Dist: ms-opencompass>=0.1.4; extra == "opencompass"
  Provides-Extra: vlmeval
@@ -64,8 +69,8 @@ Requires-Dist: sse_starlette; extra == "perf"
  Requires-Dist: transformers; extra == "perf"
  Requires-Dist: unicorn; extra == "perf"
  Provides-Extra: app
- Requires-Dist: gradio>=5.4.0; extra == "app"
- Requires-Dist: plotly>=5.23.0; extra == "app"
+ Requires-Dist: gradio==5.4.0; extra == "app"
+ Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
  Provides-Extra: inner
  Requires-Dist: absl-py; extra == "inner"
  Requires-Dist: accelerate; extra == "inner"
@@ -96,10 +101,13 @@ Provides-Extra: all
  Requires-Dist: absl-py; extra == "all"
  Requires-Dist: accelerate; extra == "all"
  Requires-Dist: cachetools; extra == "all"
- Requires-Dist: datasets<=3.0.1,>=3.0.0; extra == "all"
+ Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
  Requires-Dist: editdistance; extra == "all"
+ Requires-Dist: immutabledict; extra == "all"
  Requires-Dist: jieba; extra == "all"
  Requires-Dist: jsonlines; extra == "all"
+ Requires-Dist: langdetect; extra == "all"
+ Requires-Dist: latex2sympy2; extra == "all"
  Requires-Dist: matplotlib; extra == "all"
  Requires-Dist: modelscope[framework]; extra == "all"
  Requires-Dist: nltk>=3.9; extra == "all"
@@ -119,12 +127,14 @@ Requires-Dist: scikit-learn; extra == "all"
  Requires-Dist: seaborn; extra == "all"
  Requires-Dist: sentencepiece; extra == "all"
  Requires-Dist: simple-ddl-parser; extra == "all"
+ Requires-Dist: sympy; extra == "all"
  Requires-Dist: tabulate; extra == "all"
  Requires-Dist: tiktoken; extra == "all"
  Requires-Dist: torch; extra == "all"
  Requires-Dist: tqdm; extra == "all"
  Requires-Dist: transformers>=4.33; extra == "all"
  Requires-Dist: transformers_stream_generator; extra == "all"
+ Requires-Dist: word2number; extra == "all"
  Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
  Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
  Requires-Dist: mteb==1.19.4; extra == "all"
@@ -136,8 +146,8 @@ Requires-Dist: numpy; extra == "all"
  Requires-Dist: sse_starlette; extra == "all"
  Requires-Dist: transformers; extra == "all"
  Requires-Dist: unicorn; extra == "all"
- Requires-Dist: gradio>=5.4.0; extra == "all"
- Requires-Dist: plotly>=5.23.0; extra == "all"
+ Requires-Dist: gradio==5.4.0; extra == "all"
+ Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"

  <p align="center">
  <br>
@@ -215,6 +225,10 @@ Please scan the QR code below to join our community groups:


  ## 🎉 News
+ - 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
+ - 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
+ - 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
+ - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
  - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
  - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
  - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
@@ -451,7 +465,7 @@ Then, you can use the following command to evaluate the model API service:
  ```shell
  evalscope eval \
  --model qwen2.5 \
- --api-url http://127.0.0.1:8801/v1/chat/completions \
+ --api-url http://127.0.0.1:8801/v1 \
  --api-key EMPTY \
  --eval-type service \
  --datasets gsm8k \
{evalscope-0.10.1 → evalscope-0.12.0}/README.md

@@ -74,6 +74,10 @@ Please scan the QR code below to join our community groups:


  ## 🎉 News
+ - 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
+ - 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
+ - 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
+ - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
  - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
  - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
  - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
@@ -310,7 +314,7 @@ Then, you can use the following command to evaluate the model API service:
  ```shell
  evalscope eval \
  --model qwen2.5 \
- --api-url http://127.0.0.1:8801/v1/chat/completions \
+ --api-url http://127.0.0.1:8801/v1 \
  --api-key EMPTY \
  --eval-type service \
  --datasets gsm8k \
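
The News entries above state that the new MuSR and ProcessBench benchmarks are selected by name through the datasets parameter, and the hunk above changes the documented `--api-url` to the `/v1` base path. A minimal sketch combining the two; the command skeleton is copied from the README example, `musr` is the name given in the News entry, and running it through the service backend shown here is an assumption rather than something the diff demonstrates:

```shell
# Hedged sketch: evaluate one of the newly added reasoning benchmarks against a
# served model, reusing the flags from the updated README example above.
evalscope eval \
  --model qwen2.5 \
  --api-url http://127.0.0.1:8801/v1 \
  --api-key EMPTY \
  --eval-type service \
  --datasets musr
```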
{evalscope-0.10.1 → evalscope-0.12.0}/evalscope/arguments.py

@@ -58,6 +58,7 @@ def add_argument(parser: argparse.ArgumentParser):
  parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
  choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
  parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')
+ parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')

  # Cache and working directory arguments
  parser.add_argument('--mem-cache', action='store_true', default=False, help='Deprecated, will be removed in v1.0.0.') # noqa: E501
@@ -70,6 +71,8 @@ def add_argument(parser: argparse.ArgumentParser):
  parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
  parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
  parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
+ parser.add_argument('--timeout', type=float, default=None, help='The timeout for the remote API model.')
+ parser.add_argument('--stream', action='store_true', default=False, help='Stream mode.') # noqa: E501
  # yapf: enable

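The three arguments added in these hunks surface as `--eval-batch-size`, `--timeout`, and `--stream` on the `evalscope eval` command line. A hedged sketch of how they might be combined with the service-evaluation flags from the README diff; the values 8 and 600 are illustrative only (per the argparse definitions above, the defaults are batch size 1, no timeout, and streaming off):

```shell
# Hedged sketch: batch requests, bound each API call, and stream responses,
# using the arguments introduced in evalscope/arguments.py above.
evalscope eval \
  --model qwen2.5 \
  --api-url http://127.0.0.1:8801/v1 \
  --api-key EMPTY \
  --eval-type service \
  --datasets gsm8k \
  --eval-batch-size 8 \
  --timeout 600 \
  --stream
```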
evalscope-0.12.0/evalscope/benchmarks/aime/aime24_adapter.py

@@ -0,0 +1,49 @@
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+ from evalscope.models import ChatGenerationModelAdapter
+ from evalscope.utils.logger import get_logger
+
+ # flake8: noqa
+
+ logger = get_logger()
+
+
+ @Benchmark.register(
+ name='aime24',
+ dataset_id='HuggingFaceH4/aime_2024',
+ model_adapter=ChatGenerationModelAdapter,
+ subset_list=['default'],
+ metric_list=['AveragePass@1'],
+ few_shot_num=0,
+ train_split=None,
+ eval_split='train', # Only train set is available
+ prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+ )
+ class AIME24Adapter(DataAdapter):
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+ """
+ Generate the prompt for the model input.
+ """
+ problem = input_d['problem']
+ full_prompt = self.prompt_template.format(query=problem)
+
+ return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+ def get_gold_answer(self, input_d: dict) -> str:
+ # Extract the gold answer from the input dict.
+ return strip_answer_string(input_d['answer'])
+
+ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+ """
+ Parse the model output to get the answer. Could be the best choice index.
+ """
+ # Note: Use same extraction method for both of checkpoint/service/custom
+ result = strip_answer_string(extract_answer(result))
+ return result
+
+ def match(self, gold: str, pred: str) -> float:
+ return math_equal(pred, gold)

evalscope-0.12.0/evalscope/benchmarks/aime/aime25_adapter.py

@@ -0,0 +1,49 @@
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+ from evalscope.models import ChatGenerationModelAdapter
+ from evalscope.utils.logger import get_logger
+
+ # flake8: noqa
+
+ logger = get_logger()
+
+
+ @Benchmark.register(
+ name='aime25',
+ dataset_id='TIGER-Lab/AIME25',
+ model_adapter=ChatGenerationModelAdapter,
+ subset_list=['default'],
+ metric_list=['AveragePass@1'],
+ few_shot_num=0,
+ train_split=None,
+ eval_split='train', # Only train set is available
+ prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+ )
+ class AIME25Adapter(DataAdapter):
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+ """
+ Generate the prompt for the model input.
+ """
+ problem = input_d['question']
+ full_prompt = self.prompt_template.format(query=problem)
+
+ return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+ def get_gold_answer(self, input_d: dict) -> str:
+ # Extract the gold answer from the input dict.
+ return strip_answer_string(input_d['answer'])
+
+ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+ """
+ Parse the model output to get the answer. Could be the best choice index.
+ """
+ # Note: Use same extraction method for both of checkpoint/service/custom
+ result = strip_answer_string(extract_answer(result))
+ return result
+
+ def match(self, gold: str, pred: str) -> float:
+ return math_equal(pred, gold)
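
Both adapters register under the names passed to `@Benchmark.register` (`aime24`, `aime25`), use a zero-shot boxed-answer prompt, and score with `AveragePass@1`, so they are selected like any other dataset. A hedged sketch, again reusing the service command from the README example; swapping in `aime25` selects the 2025 set:

```shell
# Hedged sketch: run the newly registered AIME 2024 benchmark.
evalscope eval \
  --model qwen2.5 \
  --api-url http://127.0.0.1:8801/v1 \
  --api-key EMPTY \
  --eval-type service \
  --datasets aime24
```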
{evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/arc/arc_adapter.py

@@ -5,7 +5,7 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.metrics import exact_match
  from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
@@ -20,7 +20,7 @@ logger = get_logger()
  dataset_id='modelscope/ai2_arc',
  model_adapter=MultiChoiceModelAdapter,
  subset_list=['ARC-Easy', 'ARC-Challenge'],
- metric_list=[AverageAccuracy],
+ metric_list=['AverageAccuracy'],
  few_shot_num=0,
  train_split='train',
  eval_split='test',
@@ -112,7 +112,7 @@ class ARCAdapter(DataAdapter):
  # context = f'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n {context}'
  full_prompt: str = context + self._generate_prompt(input_d=input_d, include_answer=False)

- return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+ return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
@@ -133,11 +133,9 @@ class ARCAdapter(DataAdapter):
  if eval_type == EvalType.CHECKPOINT:
  return result
  elif eval_type == EvalType.SERVICE:
- return ResponseParser.parse_first_option_with_choices(
- text=result, options=self.choices) # TODO: to be checked !
+ return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
  elif eval_type == EvalType.CUSTOM:
- return ResponseParser.parse_first_option_with_choices(
- text=result, options=self.choices) # TODO: to be checked !
+ return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
  else:
  raise ValueError(f'Invalid eval_type: {eval_type}')

{evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/bbh/bbh_adapter.py

@@ -7,7 +7,7 @@ import re

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import AnswerKeys
- from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.metrics import exact_match
  from evalscope.models.chat_adapter import ChatGenerationModelAdapter
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
@@ -63,11 +63,11 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
  dataset_id='modelscope/bbh',
  model_adapter=ChatGenerationModelAdapter,
  subset_list=SUBSET_LIST,
- metric_list=[AverageAccuracy],
+ metric_list=['AverageAccuracy'],
  few_shot_num=3,
  train_split=None,
  eval_split='test',
- prompt_template='',
+ prompt_template="Q: {query}\nA: Let's think step by step.",
  )
  class BBHAdapter(DataAdapter):
  """
@@ -119,10 +119,13 @@ class BBHAdapter(DataAdapter):
  {'data': ['xxx']}
  """
  # few_shot_list: should be ['xxxx']
- cot_prompts: str = few_shot_list[0] if len(few_shot_list) > 0 else ''
- full_prompt: str = f"Follow the given examples and answer the question.\n{cot_prompts}\n\nQ: {input_d['input']}\nA: Let's think step by step."
+ if len(few_shot_list) > 0:
+ cot_prompts = 'Follow the given examples and answer the question.\n' + few_shot_list[0]
+ else:
+ cot_prompts = ''
+ full_prompt = cot_prompts + self.prompt_template.format(query=input_d['input'])

- return {'data': [full_prompt], 'system_prompt': self.prompt_template}
+ return {'data': [full_prompt], 'system_prompt': self.system_prompt}

  def gen_prompts(self, data_dict: dict) -> dict:
  """
@@ -168,18 +171,15 @@ class BBHAdapter(DataAdapter):
  prompt_d[AnswerKeys.RAW_INPUT] = sample_d_new
  res_dict[sub_name].append(prompt_d)

- rnd = random.Random()
- rnd.seed(42)
- for k, v in res_dict.items():
- rnd.shuffle(v)
-
  return res_dict

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
- gold = input_d.get('target')
+ gold = input_d.get('target', '')
+ # remove brackets
  if gold is None:
  logger.error(f'BBHAdapter: gold is None.')
+ gold = gold.replace('(', '').replace(')', '')
  return gold

  def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
@@ -228,8 +228,11 @@ class BBHAdapter(DataAdapter):
  """
  Extract the answer from the model output for Free-form task.
  """
- res = ResponseParser.parse_first_option(ans)
- if res:
+ pattern = r'answer is\s+(.*?)\.'
+
+ match = re.search(pattern, ans)
+ if match:
+ res = match.group(1)
  return res

  ans_line = ans.split('answer is ')
{evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/benchmark.py

@@ -17,12 +17,15 @@ class BenchmarkMeta:
  data_adapter: 'DataAdapter'
  model_adapter: BaseModelAdapter
  subset_list: List[str] = field(default_factory=list)
- metric_list: List[dict] = field(default_factory=list)
+ metric_list: List[str] = field(default_factory=list)
  few_shot_num: int = 0
  few_shot_random: bool = False
  train_split: Optional[str] = None
  eval_split: Optional[str] = None
  prompt_template: Optional[str] = None
+ system_prompt: Optional[str] = None
+ query_template: Optional[str] = None
+ pretty_name: Optional[str] = None

  def _update(self, args: dict):
  if args.get('local_path'):
@@ -40,7 +43,6 @@ class BenchmarkMeta:
  # cur_dict['metric_list'] = [metric['name'] for metric in self.metric_list]
  del cur_dict['data_adapter']
  del cur_dict['model_adapter']
- del cur_dict['metric_list']
  return cur_dict

  def get_data_adapter(self, config: dict = {}) -> 'DataAdapter':
@@ -59,7 +61,7 @@ class Benchmark:
  @classmethod
  def get(cls, name: str) -> 'BenchmarkMeta':
  if name not in BENCHMARK_MAPPINGS:
- raise Exception(f'Unknown benchmark: {name}. Available tasks: {BENCHMARK_MAPPINGS.keys()}')
+ raise Exception(f'Unknown benchmark: {name}. Available tasks: {list(BENCHMARK_MAPPINGS.keys())}')
  benchmark = BENCHMARK_MAPPINGS[name]
  return benchmark

{evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/ceval/ceval_adapter.py

@@ -4,10 +4,9 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import AverageAccuracy
- from evalscope.metrics.metrics import exact_match, weighted_mean
+ from evalscope.metrics.metrics import exact_match
  from evalscope.models import MultiChoiceModelAdapter
- from evalscope.utils import ResponseParser, normalize_score
+ from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger

  # flake8: noqa
@@ -130,10 +129,11 @@ SUBJECT_MAPPING = {
  dataset_id='modelscope/ceval-exam',
  model_adapter=MultiChoiceModelAdapter,
  subset_list=SUBSET_LIST,
- metric_list=[AverageAccuracy],
+ metric_list=['AverageAccuracy'],
  few_shot_num=0,
  train_split='dev',
  eval_split='val',
+ prompt_template='以下是中国关于{subset_name}考试的单项选择题,请选出其中的正确答案。\n{query}',
  )
  class CEVALAdapter(DataAdapter):

@@ -202,12 +202,12 @@ class CEVALAdapter(DataAdapter):
  else:
  context = ''

- full_prompt: str = context.strip() + self._format_example(input_d=input_d, include_answer=False)
+ query: str = context.strip() + self._format_example(input_d=input_d, include_answer=False)

  subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
- full_prompt = f'以下是中国关于{subject_name}考试的单项选择题,请选出其中的正确答案。\n' + full_prompt
+ full_prompt = self.prompt_template.format(subset_name=subject_name, query=query)

- return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+ return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
@@ -228,9 +228,9 @@ class CEVALAdapter(DataAdapter):
  if eval_type == EvalType.CHECKPOINT:
  return result
  elif eval_type == EvalType.SERVICE:
- return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
+ return ResponseParser.parse_first_option_with_choices(result, self.choices)
  elif eval_type == EvalType.CUSTOM:
- return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
+ return ResponseParser.parse_first_option_with_choices(result, self.choices)
  else:
  raise ValueError(f'Invalid eval_type: {eval_type}')

{evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py

@@ -5,9 +5,9 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.metrics import exact_match
  from evalscope.models import MultiChoiceModelAdapter
- from evalscope.utils import ResponseParser, normalize_score
+ from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger

  # flake8: noqa
@@ -106,10 +106,11 @@ SUBJECT_MAPPING = {
  dataset_id='modelscope/cmmlu',
  model_adapter=MultiChoiceModelAdapter,
  subset_list=SUBSET_LIST,
- metric_list=[AverageAccuracy],
+ metric_list=['AverageAccuracy'],
  few_shot_num=5,
  train_split='dev',
  eval_split='test',
+ prompt_template='以下是关于{subset_name}的单项选择题,请直接给出正确答案的选项。\n{query}',
  )
  class CMMLUAdapter(DataAdapter):

@@ -165,16 +166,13 @@ class CMMLUAdapter(DataAdapter):
  {'data': [(context, continuation), ...]}

  """
- prompt = '以下是关于{}的单项选择题。\n\n'.format(self._format_subject(subset_name))
  few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-
- context: str = '\n'.join(few_shot_prompts) + '\n'
+ context = '\n'.join(few_shot_prompts) + '\n'
  context += self._generate_prompt(input_d=input_d, include_answer=False)
- context = prompt + context

- full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
+ full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=context.strip())

- return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': prompt}
+ return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
@@ -195,9 +193,9 @@ class CMMLUAdapter(DataAdapter):
  if eval_type == EvalType.CHECKPOINT:
  return result
  elif eval_type == EvalType.SERVICE:
- return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
+ return ResponseParser.parse_first_option_with_choices(result, self.choices)
  elif eval_type == EvalType.CUSTOM:
- return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
+ return ResponseParser.parse_first_option_with_choices(result, self.choices)
  else:
  raise ValueError(f'Invalid eval_type: {eval_type}')

{evalscope-0.10.1 → evalscope-0.12.0}/evalscope/benchmarks/competition_math/competition_math_adapter.py

@@ -3,10 +3,11 @@
  import glob
  import json
  import os
+ from collections import defaultdict

  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import AverageAccuracy
- from evalscope.metrics.math_accuracy import is_equiv, last_boxed_only_string, remove_boxed
+ from evalscope.constants import AnswerKeys
+ from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
  from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.logger import get_logger

@@ -19,12 +20,12 @@ logger = get_logger()
  name='competition_math',
  dataset_id='modelscope/competition_math',
  model_adapter=ChatGenerationModelAdapter,
- subset_list=['default'],
- metric_list=[AverageAccuracy],
+ subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+ metric_list=['AveragePass@1'],
  few_shot_num=4,
- train_split='train',
+ train_split=None,
  eval_split='test',
- prompt_template='Put the final answer in \\boxed{}.',
+ prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
  )
  class CompetitionMathAdapter(DataAdapter):
  """ To be tested for all models. """
@@ -39,8 +40,14 @@ class CompetitionMathAdapter(DataAdapter):

  super().__init__(**kwargs)

+ def load(self, **kwargs):
+ # default load all levels
+ kwargs['subset_list'] = ['default']
+ data_dict = super().load(**kwargs)
+ return self.reformat_subset(data_dict, subset_key='level')
+
  def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
- data_dict: dict = {}
+ data_dict = defaultdict(dict)
  for subset_name in subset_list:
  for split_name in [self.train_split, self.eval_split]:
  if os.path.exists(dataset_name_or_path):
@@ -53,10 +60,7 @@ class CompetitionMathAdapter(DataAdapter):
  if os.path.exists(file_path):
  with open(file_path, 'r') as f:
  split_data.append(json.load(f))
- if subset_name in data_dict:
- data_dict[subset_name].update({split_name: split_data})
- else:
- data_dict[subset_name] = {split_name: split_data}
+ data_dict[subset_name][split_name] = split_data
  return data_dict


@@ -75,13 +79,13 @@ class CompetitionMathAdapter(DataAdapter):
  {'data': [prompt]}
  """
  use_fewshot = self.few_shot_num > 0
- full_prompt = self._generate_prompt(input_d, use_fewshot=use_fewshot)
-
- return {'data': [full_prompt], 'system_prompt': self.prompt_template}
+ query = self._generate_prompt(input_d, use_fewshot=use_fewshot)
+ full_prompt = self.prompt_template.format(query=query)
+ return {'data': [full_prompt], 'system_prompt': self.system_prompt}

  def get_gold_answer(self, input_d: dict) -> str:
  # Extract the gold answer from the input dict.
- return remove_boxed(last_boxed_only_string(input_d['solution']))
+ return strip_answer_string(extract_answer(input_d['solution']))

  def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
  """
@@ -96,18 +100,11 @@ class CompetitionMathAdapter(DataAdapter):
  The parsed answer. Depending on the dataset. Usually a string for chat.
  """
  # Note: Use same extraction method for both of checkpoint/service/custom
- try:
- result = remove_boxed(last_boxed_only_string(result))
- except Exception:
- return None
+ result = strip_answer_string(extract_answer(result))
  return result

  def match(self, gold: str, pred: str) -> float:
- res = 0
- if is_equiv(pred, gold):
- res = 1
-
- return res
+ return math_equal(pred, gold)

  @classmethod
  def _generate_prompt(cls, input_d: dict, use_fewshot: bool = True) -> str: