evalscope 0.10.0__tar.gz → 0.11.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of evalscope has been flagged as potentially problematic.

Files changed (317)
  1. {evalscope-0.10.0/evalscope.egg-info → evalscope-0.11.0}/PKG-INFO +20 -11
  2. {evalscope-0.10.0 → evalscope-0.11.0}/README.md +7 -6
  3. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/arguments.py +1 -0
  4. evalscope-0.11.0/evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
  5. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/arc/arc_adapter.py +5 -7
  6. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
  7. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/benchmark.py +2 -2
  8. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
  9. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
  10. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
  11. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/data_adapter.py +18 -12
  12. evalscope-0.11.0/evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
  13. evalscope-0.11.0/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
  14. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
  15. evalscope-0.11.0/evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
  16. evalscope-0.11.0/evalscope/benchmarks/gpqa/gpqa_adapter.py +121 -0
  17. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
  18. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
  19. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
  20. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -14
  21. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/ifeval/instructions.py +3 -4
  22. evalscope-0.11.0/evalscope/benchmarks/iquiz/__init__.py +0 -0
  23. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
  24. evalscope-0.11.0/evalscope/benchmarks/math_500/__init__.py +0 -0
  25. evalscope-0.11.0/evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
  26. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
  27. evalscope-0.11.0/evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  28. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
  29. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/race/race_adapter.py +3 -3
  30. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
  31. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
  32. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/cli/start_app.py +3 -2
  33. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/collections/evaluator.py +103 -39
  34. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/collections/sampler.py +2 -1
  35. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/collections/schema.py +1 -2
  36. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/config.py +1 -0
  37. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/evaluator/evaluator.py +78 -64
  38. evalscope-0.11.0/evalscope/metrics/math_parser.py +526 -0
  39. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/metrics/metrics.py +16 -1
  40. evalscope-0.11.0/evalscope/metrics/named_metrics.py +41 -0
  41. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/models/chat_adapter.py +69 -47
  42. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/models/choice_adapter.py +52 -45
  43. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/models/custom_adapter.py +2 -2
  44. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/models/local_model.py +4 -0
  45. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/models/server_adapter.py +28 -34
  46. evalscope-0.11.0/evalscope/perf/utils/__init__.py +0 -0
  47. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/report/app.py +298 -96
  48. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/run.py +10 -7
  49. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/utils/chat_service.py +2 -2
  50. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/utils/io_utils.py +1 -1
  51. evalscope-0.11.0/evalscope/version.py +4 -0
  52. {evalscope-0.10.0 → evalscope-0.11.0/evalscope.egg-info}/PKG-INFO +20 -11
  53. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope.egg-info/SOURCES.txt +12 -2
  54. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope.egg-info/requires.txt +12 -4
  55. {evalscope-0.10.0 → evalscope-0.11.0}/requirements/app.txt +1 -1
  56. {evalscope-0.10.0 → evalscope-0.11.0}/requirements/framework.txt +6 -2
  57. {evalscope-0.10.0 → evalscope-0.11.0}/tests/cli/test_run.py +93 -16
  58. evalscope-0.11.0/tests/rag/__init__.py +0 -0
  59. evalscope-0.10.0/evalscope/benchmarks/ceval/samples.jsonl +0 -1
  60. evalscope-0.10.0/evalscope/metrics/math_accuracy.py +0 -200
  61. evalscope-0.10.0/evalscope/metrics/named_metrics.py +0 -17
  62. evalscope-0.10.0/evalscope/version.py +0 -4
  63. {evalscope-0.10.0 → evalscope-0.11.0}/LICENSE +0 -0
  64. {evalscope-0.10.0 → evalscope-0.11.0}/MANIFEST.in +0 -0
  65. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/__init__.py +0 -0
  66. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/__init__.py +0 -0
  67. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/base.py +0 -0
  68. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/opencompass/__init__.py +0 -0
  69. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  70. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/opencompass/backend_manager.py +0 -0
  71. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  72. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  73. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
  74. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/__init__.py +0 -0
  75. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/backend_manager.py +0 -0
  76. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
  77. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
  78. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
  79. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
  80. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  81. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
  82. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
  83. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
  84. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
  85. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
  86. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  87. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
  88. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
  89. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
  90. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  91. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
  92. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
  93. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  94. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
  95. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  96. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  97. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
  98. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
  99. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
  100. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
  101. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
  102. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
  103. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
  104. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
  105. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
  106. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
  107. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
  108. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/utils/clip.py +0 -0
  109. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
  110. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/utils/llm.py +0 -0
  111. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/rag_eval/utils/tools.py +0 -0
  112. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  113. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
  114. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
  115. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/__init__.py +0 -0
  116. {evalscope-0.10.0/evalscope/benchmarks/ifeval → evalscope-0.11.0/evalscope/benchmarks/aime24}/__init__.py +0 -0
  117. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/arc/__init__.py +0 -0
  118. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  119. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/__init__.py +0 -0
  120. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  121. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  122. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  123. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  124. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  125. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  126. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  127. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  128. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  129. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  130. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  131. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  132. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  133. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  134. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  135. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  136. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  137. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  138. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  139. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  140. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  141. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  142. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  143. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  144. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  145. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  146. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  147. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/ceval/__init__.py +0 -0
  148. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  149. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  150. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  151. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
  152. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  153. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  154. {evalscope-0.10.0/evalscope/benchmarks/iquiz → evalscope-0.11.0/evalscope/benchmarks/data_collection}/__init__.py +0 -0
  155. {evalscope-0.10.0/evalscope/benchmarks/mmlu_pro → evalscope-0.11.0/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
  156. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  157. {evalscope-0.10.0/evalscope/perf/utils → evalscope-0.11.0/evalscope/benchmarks/gpqa}/__init__.py +0 -0
  158. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  159. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  160. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  161. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  162. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  163. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  164. {evalscope-0.10.0/tests/rag → evalscope-0.11.0/evalscope/benchmarks/ifeval}/__init__.py +0 -0
  165. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
  166. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
  167. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/ifeval/utils.py +0 -0
  168. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  169. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  170. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
  171. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/race/__init__.py +0 -0
  172. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/race/race.py +0 -0
  173. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/race/samples.jsonl +0 -0
  174. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  175. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
  176. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  177. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  178. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  179. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/cli/__init__.py +0 -0
  180. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/cli/base.py +0 -0
  181. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/cli/cli.py +0 -0
  182. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/cli/start_eval.py +0 -0
  183. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/cli/start_perf.py +0 -0
  184. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/cli/start_server.py +0 -0
  185. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/collections/__init__.py +0 -0
  186. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/constants.py +0 -0
  187. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/evaluator/__init__.py +0 -0
  188. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/evaluator/rating_eval.py +0 -0
  189. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/evaluator/reviewer/__init__.py +0 -0
  190. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
  191. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/metrics/__init__.py +0 -0
  192. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  193. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
  194. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/metrics/code_metric.py +0 -0
  195. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -0
  196. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -0
  197. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/metrics/rouge_metric.py +0 -0
  198. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/models/__init__.py +0 -0
  199. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/models/base_adapter.py +0 -0
  200. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/models/custom/__init__.py +0 -0
  201. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/models/custom/custom_model.py +0 -0
  202. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/models/custom/dummy_model.py +0 -0
  203. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/models/model.py +0 -0
  204. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/__init__.py +0 -0
  205. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/arguments.py +0 -0
  206. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/benchmark.py +0 -0
  207. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/http_client.py +0 -0
  208. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/main.py +0 -0
  209. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/__init__.py +0 -0
  210. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/api/__init__.py +0 -0
  211. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/api/base.py +0 -0
  212. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/api/custom_api.py +0 -0
  213. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
  214. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/api/openai_api.py +0 -0
  215. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/__init__.py +0 -0
  216. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/base.py +0 -0
  217. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/custom.py +0 -0
  218. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
  219. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
  220. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
  221. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/openqa.py +0 -0
  222. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
  223. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/plugin/registry.py +0 -0
  224. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/utils/analysis_result.py +0 -0
  225. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/utils/benchmark_util.py +0 -0
  226. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/utils/db_util.py +0 -0
  227. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/utils/handler.py +0 -0
  228. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/perf/utils/local_server.py +0 -0
  229. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/__init__.py +0 -0
  230. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/config/cfg_arena.yaml +0 -0
  231. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
  232. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
  233. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/config/cfg_single.yaml +0 -0
  234. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
  235. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
  236. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
  237. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
  238. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/data/question.jsonl +0 -0
  239. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/tasks/arc.yaml +0 -0
  240. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/tasks/bbh.yaml +0 -0
  241. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
  242. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/tasks/ceval.yaml +0 -0
  243. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
  244. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/tasks/cmmlu.yaml +0 -0
  245. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
  246. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/tasks/general_qa.yaml +0 -0
  247. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/tasks/gsm8k.yaml +0 -0
  248. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/tasks/mmlu.yaml +0 -0
  249. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
  250. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/report/__init__.py +0 -0
  251. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/report/combinator.py +0 -0
  252. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/report/generator.py +0 -0
  253. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/report/utils.py +0 -0
  254. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/run_arena.py +0 -0
  255. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/summarizer.py +0 -0
  256. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/__init__.py +0 -0
  257. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/README.md +0 -0
  258. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/__init__.py +0 -0
  259. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/default_task.json +0 -0
  260. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
  261. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/eval.py +0 -0
  262. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/infer.py +0 -0
  263. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  264. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
  265. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  266. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  267. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  268. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  269. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
  270. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  271. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
  272. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/longbench_write/utils.py +0 -0
  273. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/README.md +0 -0
  274. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  275. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/config_default.json +0 -0
  276. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
  277. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/eval.py +0 -0
  278. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/infer.py +0 -0
  279. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
  280. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  281. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
  282. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  283. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/utils/__init__.py +0 -0
  284. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/utils/arena_utils.py +0 -0
  285. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/utils/completion_parsers.py +0 -0
  286. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/utils/logger.py +0 -0
  287. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/utils/model_utils.py +0 -0
  288. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope/utils/utils.py +0 -0
  289. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope.egg-info/dependency_links.txt +0 -0
  290. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope.egg-info/entry_points.txt +0 -0
  291. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope.egg-info/not-zip-safe +0 -0
  292. {evalscope-0.10.0 → evalscope-0.11.0}/evalscope.egg-info/top_level.txt +0 -0
  293. {evalscope-0.10.0 → evalscope-0.11.0}/requirements/docs.txt +0 -0
  294. {evalscope-0.10.0 → evalscope-0.11.0}/requirements/inner.txt +0 -0
  295. {evalscope-0.10.0 → evalscope-0.11.0}/requirements/opencompass.txt +0 -0
  296. {evalscope-0.10.0 → evalscope-0.11.0}/requirements/perf.txt +0 -0
  297. {evalscope-0.10.0 → evalscope-0.11.0}/requirements/rag.txt +0 -0
  298. {evalscope-0.10.0 → evalscope-0.11.0}/requirements/tests.txt +0 -0
  299. {evalscope-0.10.0 → evalscope-0.11.0}/requirements/vlmeval.txt +0 -0
  300. {evalscope-0.10.0 → evalscope-0.11.0}/requirements.txt +0 -0
  301. {evalscope-0.10.0 → evalscope-0.11.0}/setup.cfg +0 -0
  302. {evalscope-0.10.0 → evalscope-0.11.0}/setup.py +0 -0
  303. {evalscope-0.10.0 → evalscope-0.11.0}/tests/__init__.py +0 -0
  304. {evalscope-0.10.0 → evalscope-0.11.0}/tests/cli/__init__.py +0 -0
  305. {evalscope-0.10.0 → evalscope-0.11.0}/tests/cli/test_collection.py +0 -0
  306. {evalscope-0.10.0 → evalscope-0.11.0}/tests/perf/__init__.py +0 -0
  307. {evalscope-0.10.0 → evalscope-0.11.0}/tests/perf/test_perf.py +0 -0
  308. {evalscope-0.10.0 → evalscope-0.11.0}/tests/rag/test_clip_benchmark.py +0 -0
  309. {evalscope-0.10.0 → evalscope-0.11.0}/tests/rag/test_mteb.py +0 -0
  310. {evalscope-0.10.0 → evalscope-0.11.0}/tests/rag/test_ragas.py +0 -0
  311. {evalscope-0.10.0 → evalscope-0.11.0}/tests/swift/__init__.py +0 -0
  312. {evalscope-0.10.0 → evalscope-0.11.0}/tests/swift/test_run_swift_eval.py +0 -0
  313. {evalscope-0.10.0 → evalscope-0.11.0}/tests/swift/test_run_swift_vlm_eval.py +0 -0
  314. {evalscope-0.10.0 → evalscope-0.11.0}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
  315. {evalscope-0.10.0 → evalscope-0.11.0}/tests/test_run_all.py +0 -0
  316. {evalscope-0.10.0 → evalscope-0.11.0}/tests/vlm/__init__.py +0 -0
  317. {evalscope-0.10.0 → evalscope-0.11.0}/tests/vlm/test_vlmeval.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.10.0
+ Version: 0.11.0
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -19,10 +19,12 @@ License-File: LICENSE
  Requires-Dist: absl-py
  Requires-Dist: accelerate
  Requires-Dist: cachetools
- Requires-Dist: datasets<=3.0.1,>=3.0.0
+ Requires-Dist: datasets<=3.2.0,>=3.0.0
  Requires-Dist: editdistance
  Requires-Dist: jieba
  Requires-Dist: jsonlines
+ Requires-Dist: langdetect
+ Requires-Dist: latex2sympy2
  Requires-Dist: matplotlib
  Requires-Dist: modelscope[framework]
  Requires-Dist: nltk>=3.9
@@ -42,12 +44,14 @@ Requires-Dist: scikit-learn
  Requires-Dist: seaborn
  Requires-Dist: sentencepiece
  Requires-Dist: simple-ddl-parser
+ Requires-Dist: sympy
  Requires-Dist: tabulate
  Requires-Dist: tiktoken
  Requires-Dist: torch
  Requires-Dist: tqdm
  Requires-Dist: transformers>=4.33
  Requires-Dist: transformers_stream_generator
+ Requires-Dist: word2number
  Provides-Extra: opencompass
  Requires-Dist: ms-opencompass>=0.1.4; extra == "opencompass"
  Provides-Extra: vlmeval
@@ -64,7 +68,7 @@ Requires-Dist: sse_starlette; extra == "perf"
  Requires-Dist: transformers; extra == "perf"
  Requires-Dist: unicorn; extra == "perf"
  Provides-Extra: app
- Requires-Dist: gradio>=5.4.0; extra == "app"
+ Requires-Dist: gradio==5.4.0; extra == "app"
  Requires-Dist: plotly>=5.23.0; extra == "app"
  Provides-Extra: inner
  Requires-Dist: absl-py; extra == "inner"
@@ -96,10 +100,12 @@ Provides-Extra: all
  Requires-Dist: absl-py; extra == "all"
  Requires-Dist: accelerate; extra == "all"
  Requires-Dist: cachetools; extra == "all"
- Requires-Dist: datasets<=3.0.1,>=3.0.0; extra == "all"
+ Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
  Requires-Dist: editdistance; extra == "all"
  Requires-Dist: jieba; extra == "all"
  Requires-Dist: jsonlines; extra == "all"
+ Requires-Dist: langdetect; extra == "all"
+ Requires-Dist: latex2sympy2; extra == "all"
  Requires-Dist: matplotlib; extra == "all"
  Requires-Dist: modelscope[framework]; extra == "all"
  Requires-Dist: nltk>=3.9; extra == "all"
@@ -119,12 +125,14 @@ Requires-Dist: scikit-learn; extra == "all"
  Requires-Dist: seaborn; extra == "all"
  Requires-Dist: sentencepiece; extra == "all"
  Requires-Dist: simple-ddl-parser; extra == "all"
+ Requires-Dist: sympy; extra == "all"
  Requires-Dist: tabulate; extra == "all"
  Requires-Dist: tiktoken; extra == "all"
  Requires-Dist: torch; extra == "all"
  Requires-Dist: tqdm; extra == "all"
  Requires-Dist: transformers>=4.33; extra == "all"
  Requires-Dist: transformers_stream_generator; extra == "all"
+ Requires-Dist: word2number; extra == "all"
  Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
  Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
  Requires-Dist: mteb==1.19.4; extra == "all"
@@ -136,7 +144,7 @@ Requires-Dist: numpy; extra == "all"
  Requires-Dist: sse_starlette; extra == "all"
  Requires-Dist: transformers; extra == "all"
  Requires-Dist: unicorn; extra == "all"
- Requires-Dist: gradio>=5.4.0; extra == "all"
+ Requires-Dist: gradio==5.4.0; extra == "all"
  Requires-Dist: plotly>=5.23.0; extra == "all"

  <p align="center">
@@ -215,7 +223,8 @@ Please scan the QR code below to join our community groups:


  ## 🎉 News
- - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visulization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
+ - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
+ - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
  - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
  - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
  - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
@@ -416,27 +425,27 @@ To create a public link, set `share=True` in `launch()`.
  <table>
  <tr>
  <td style="text-align: center;">
- <img src="docs/zh/get_started/images/setting.png" alt="Setting" style="width: 100%;" />
+ <img src="docs/en/get_started/images/setting.png" alt="Setting" style="width: 75%;" />
  <p>Setting Interface</p>
  </td>
  <td style="text-align: center;">
- <img src="docs/zh/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
+ <img src="docs/en/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
  <p>Model Comparison</p>
  </td>
  </tr>
  <tr>
  <td style="text-align: center;">
- <img src="docs/zh/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
+ <img src="docs/en/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
  <p>Report Overview</p>
  </td>
  <td style="text-align: center;">
- <img src="docs/zh/get_started/images/report_details.png" alt="Report Details" style="width: 100%;" />
+ <img src="docs/en/get_started/images/report_details.png" alt="Report Details" style="width: 80%;" />
  <p>Report Details</p>
  </td>
  </tr>
  </table>

- For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visulization.html)
+ For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)

  ## 🌐 Evaluation of Specified Model API

@@ -74,7 +74,8 @@ Please scan the QR code below to join our community groups:


  ## 🎉 News
- - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visulization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
+ - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
+ - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
  - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
  - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
  - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
@@ -275,27 +276,27 @@ To create a public link, set `share=True` in `launch()`.
  <table>
  <tr>
  <td style="text-align: center;">
- <img src="docs/zh/get_started/images/setting.png" alt="Setting" style="width: 100%;" />
+ <img src="docs/en/get_started/images/setting.png" alt="Setting" style="width: 75%;" />
  <p>Setting Interface</p>
  </td>
  <td style="text-align: center;">
- <img src="docs/zh/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
+ <img src="docs/en/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
  <p>Model Comparison</p>
  </td>
  </tr>
  <tr>
  <td style="text-align: center;">
- <img src="docs/zh/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
+ <img src="docs/en/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
  <p>Report Overview</p>
  </td>
  <td style="text-align: center;">
- <img src="docs/zh/get_started/images/report_details.png" alt="Report Details" style="width: 100%;" />
+ <img src="docs/en/get_started/images/report_details.png" alt="Report Details" style="width: 80%;" />
  <p>Report Details</p>
  </td>
  </tr>
  </table>

- For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visulization.html)
+ For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)

  ## 🌐 Evaluation of Specified Model API

@@ -58,6 +58,7 @@ def add_argument(parser: argparse.ArgumentParser):
  parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
  choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
  parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')
+ parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')

  # Cache and working directory arguments
  parser.add_argument('--mem-cache', action='store_true', default=False, help='Deprecated, will be removed in v1.0.0.') # noqa: E501
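
The hunk above adds a new `--eval-batch-size` CLI flag (default 1). A minimal sketch of how the option might be used from the Python API follows; it assumes the flag maps onto a `TaskConfig` field of the same name and that evalscope exposes the `TaskConfig`/`run_task` entry points, and the model id and dataset name are placeholders not taken from this diff.

```python
# Sketch only, assuming `TaskConfig`/`run_task` are importable from evalscope
# and that `--eval-batch-size` maps to an `eval_batch_size` config field.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model id
    datasets=['gsm8k'],                  # placeholder dataset
    eval_batch_size=8,                   # new in 0.11.0: batch size for evaluation
)
run_task(task_cfg)
```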
@@ -0,0 +1,49 @@
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+ from evalscope.models import ChatGenerationModelAdapter
+ from evalscope.utils.logger import get_logger
+
+ # flake8: noqa
+
+ logger = get_logger()
+
+
+ @Benchmark.register(
+ name='aime24',
+ dataset_id='HuggingFaceH4/aime_2024',
+ model_adapter=ChatGenerationModelAdapter,
+ subset_list=['default'],
+ metric_list=['AveragePass@1'],
+ few_shot_num=0,
+ train_split=None,
+ eval_split='train', # Only train set is available
+ prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+ )
+ class AIME24Adapter(DataAdapter):
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+ """
+ Generate the prompt for the model input.
+ """
+ problem = input_d['problem']
+ full_prompt = self.prompt_template.format(query=problem)
+
+ return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+ def get_gold_answer(self, input_d: dict) -> str:
+ # Extract the gold answer from the input dict.
+ return strip_answer_string(input_d['answer'])
+
+ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+ """
+ Parse the model output to get the answer. Could be the best choice index.
+ """
+ # Note: Use same extraction method for both of checkpoint/service/custom
+ result = strip_answer_string(extract_answer(result))
+ return result
+
+ def match(self, gold: str, pred: str) -> float:
+ return math_equal(pred, gold)
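
For context, the new adapter's scoring path reduces to the three helpers it imports from the new `evalscope/metrics/math_parser.py`. The following is a minimal sketch of that flow; the sample model output and the expected comparison result are illustrative assumptions, since the parser internals are not shown in this diff.

```python
# Sketch of the AIME24Adapter scoring flow using the math_parser helpers
# added in this release. Values in comments are assumptions for illustration.
from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string

model_output = 'Step 1: ... Therefore the final answer is \\boxed{204}.'

gold = strip_answer_string('204')                          # as in get_gold_answer()
pred = strip_answer_string(extract_answer(model_output))   # as in parse_pred_result()
is_correct = math_equal(pred, gold)                        # as in match(); True if equivalent
```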
@@ -5,7 +5,7 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.metrics import exact_match
  from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
@@ -20,7 +20,7 @@ logger = get_logger()
  dataset_id='modelscope/ai2_arc',
  model_adapter=MultiChoiceModelAdapter,
  subset_list=['ARC-Easy', 'ARC-Challenge'],
- metric_list=[AverageAccuracy],
+ metric_list=['AverageAccuracy'],
  few_shot_num=0,
  train_split='train',
  eval_split='test',
@@ -112,7 +112,7 @@ class ARCAdapter(DataAdapter):
  # context = f'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n {context}'
  full_prompt: str = context + self._generate_prompt(input_d=input_d, include_answer=False)

- return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+ return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
@@ -133,11 +133,9 @@ class ARCAdapter(DataAdapter):
  if eval_type == EvalType.CHECKPOINT:
  return result
  elif eval_type == EvalType.SERVICE:
- return ResponseParser.parse_first_option_with_choices(
- text=result, options=self.choices) # TODO: to be checked !
+ return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
  elif eval_type == EvalType.CUSTOM:
- return ResponseParser.parse_first_option_with_choices(
- text=result, options=self.choices) # TODO: to be checked !
+ return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
  else:
  raise ValueError(f'Invalid eval_type: {eval_type}')

@@ -7,7 +7,7 @@ import re

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import AnswerKeys
- from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.metrics import exact_match
  from evalscope.models.chat_adapter import ChatGenerationModelAdapter
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
@@ -63,11 +63,11 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
  dataset_id='modelscope/bbh',
  model_adapter=ChatGenerationModelAdapter,
  subset_list=SUBSET_LIST,
- metric_list=[AverageAccuracy],
+ metric_list=['AverageAccuracy'],
  few_shot_num=3,
  train_split=None,
  eval_split='test',
- prompt_template='',
+ prompt_template="Q: {query}\nA: Let's think step by step.",
  )
  class BBHAdapter(DataAdapter):
  """
@@ -119,10 +119,13 @@ class BBHAdapter(DataAdapter):
  {'data': ['xxx']}
  """
  # few_shot_list: should be ['xxxx']
- cot_prompts: str = few_shot_list[0] if len(few_shot_list) > 0 else ''
- full_prompt: str = f"Follow the given examples and answer the question.\n{cot_prompts}\n\nQ: {input_d['input']}\nA: Let's think step by step."
+ if len(few_shot_list) > 0:
+ cot_prompts = 'Follow the given examples and answer the question.\n' + few_shot_list[0]
+ else:
+ cot_prompts = ''
+ full_prompt = cot_prompts + self.prompt_template.format(query=input_d['input'])

- return {'data': [full_prompt], 'system_prompt': self.prompt_template}
+ return {'data': [full_prompt], 'system_prompt': self.system_prompt}

  def gen_prompts(self, data_dict: dict) -> dict:
  """
@@ -177,9 +180,11 @@ class BBHAdapter(DataAdapter):

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
- gold = input_d.get('target')
+ gold = input_d.get('target', '')
+ # remove brackets
  if gold is None:
  logger.error(f'BBHAdapter: gold is None.')
+ gold = gold.replace('(', '').replace(')', '')
  return gold

  def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
@@ -228,8 +233,11 @@ class BBHAdapter(DataAdapter):
  """
  Extract the answer from the model output for Free-form task.
  """
- res = ResponseParser.parse_first_option(ans)
- if res:
+ pattern = r'answer is\s+(.*?)\.'
+
+ match = re.search(pattern, ans)
+ if match:
+ res = match.group(1)
  return res

  ans_line = ans.split('answer is ')
@@ -17,12 +17,13 @@ class BenchmarkMeta:
  data_adapter: 'DataAdapter'
  model_adapter: BaseModelAdapter
  subset_list: List[str] = field(default_factory=list)
- metric_list: List[dict] = field(default_factory=list)
+ metric_list: List[str] = field(default_factory=list)
  few_shot_num: int = 0
  few_shot_random: bool = False
  train_split: Optional[str] = None
  eval_split: Optional[str] = None
  prompt_template: Optional[str] = None
+ system_prompt: Optional[str] = None

  def _update(self, args: dict):
  if args.get('local_path'):
@@ -40,7 +41,6 @@
  # cur_dict['metric_list'] = [metric['name'] for metric in self.metric_list]
  del cur_dict['data_adapter']
  del cur_dict['model_adapter']
- del cur_dict['metric_list']
  return cur_dict

  def get_data_adapter(self, config: dict = {}) -> 'DataAdapter':
@@ -4,10 +4,9 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import AverageAccuracy
- from evalscope.metrics.metrics import exact_match, weighted_mean
+ from evalscope.metrics.metrics import exact_match
  from evalscope.models import MultiChoiceModelAdapter
- from evalscope.utils import ResponseParser, normalize_score
+ from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger

  # flake8: noqa
@@ -130,10 +129,11 @@ SUBJECT_MAPPING = {
  dataset_id='modelscope/ceval-exam',
  model_adapter=MultiChoiceModelAdapter,
  subset_list=SUBSET_LIST,
- metric_list=[AverageAccuracy],
+ metric_list=['AverageAccuracy'],
  few_shot_num=0,
  train_split='dev',
  eval_split='val',
+ prompt_template='以下是中国关于{subset_name}考试的单项选择题,请选出其中的正确答案。\n{query}',
  )
  class CEVALAdapter(DataAdapter):

@@ -202,12 +202,12 @@ class CEVALAdapter(DataAdapter):
  else:
  context = ''

- full_prompt: str = context.strip() + self._format_example(input_d=input_d, include_answer=False)
+ query: str = context.strip() + self._format_example(input_d=input_d, include_answer=False)

  subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
- full_prompt = f'以下是中国关于{subject_name}考试的单项选择题,请选出其中的正确答案。\n' + full_prompt
+ full_prompt = self.prompt_template.format(subset_name=subject_name, query=query)

- return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+ return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
@@ -228,9 +228,9 @@ class CEVALAdapter(DataAdapter):
  if eval_type == EvalType.CHECKPOINT:
  return result
  elif eval_type == EvalType.SERVICE:
- return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
+ return ResponseParser.parse_first_option_with_choices(result, self.choices)
  elif eval_type == EvalType.CUSTOM:
- return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
+ return ResponseParser.parse_first_option_with_choices(result, self.choices)
  else:
  raise ValueError(f'Invalid eval_type: {eval_type}')

@@ -5,9 +5,9 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.metrics import exact_match
  from evalscope.models import MultiChoiceModelAdapter
- from evalscope.utils import ResponseParser, normalize_score
+ from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger

  # flake8: noqa
@@ -106,10 +106,11 @@ SUBJECT_MAPPING = {
  dataset_id='modelscope/cmmlu',
  model_adapter=MultiChoiceModelAdapter,
  subset_list=SUBSET_LIST,
- metric_list=[AverageAccuracy],
+ metric_list=['AverageAccuracy'],
  few_shot_num=5,
  train_split='dev',
  eval_split='test',
+ prompt_template='以下是关于{subset_name}的单项选择题,请直接给出正确答案的选项。\n{query}',
  )
  class CMMLUAdapter(DataAdapter):

@@ -165,16 +166,13 @@ class CMMLUAdapter(DataAdapter):
  {'data': [(context, continuation), ...]}

  """
- prompt = '以下是关于{}的单项选择题。\n\n'.format(self._format_subject(subset_name))
  few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-
- context: str = '\n'.join(few_shot_prompts) + '\n'
+ context = '\n'.join(few_shot_prompts) + '\n'
  context += self._generate_prompt(input_d=input_d, include_answer=False)
- context = prompt + context

- full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
+ full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=context.strip())

- return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': prompt}
+ return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
@@ -195,9 +193,9 @@ class CMMLUAdapter(DataAdapter):
  if eval_type == EvalType.CHECKPOINT:
  return result
  elif eval_type == EvalType.SERVICE:
- return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
+ return ResponseParser.parse_first_option_with_choices(result, self.choices)
  elif eval_type == EvalType.CUSTOM:
- return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
+ return ResponseParser.parse_first_option_with_choices(result, self.choices)
  else:
  raise ValueError(f'Invalid eval_type: {eval_type}')

@@ -3,10 +3,11 @@
  import glob
  import json
  import os
+ from collections import defaultdict

  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import AverageAccuracy
- from evalscope.metrics.math_accuracy import is_equiv, last_boxed_only_string, remove_boxed
+ from evalscope.constants import AnswerKeys
+ from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
  from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.logger import get_logger

@@ -19,12 +20,12 @@ logger = get_logger()
  name='competition_math',
  dataset_id='modelscope/competition_math',
  model_adapter=ChatGenerationModelAdapter,
- subset_list=['default'],
- metric_list=[AverageAccuracy],
+ subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+ metric_list=['AveragePass@1'],
  few_shot_num=4,
  train_split='train',
  eval_split='test',
- prompt_template='Put the final answer in \\boxed{}.',
+ prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
  )
  class CompetitionMathAdapter(DataAdapter):
  """ To be tested for all models. """
@@ -39,8 +40,13 @@ class CompetitionMathAdapter(DataAdapter):

  super().__init__(**kwargs)

+ def load(self, **kwargs):
+ # default load all levels
+ kwargs['subset_list'] = ['default']
+ return super().load(**kwargs)
+
  def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
- data_dict: dict = {}
+ data_dict = defaultdict(dict)
  for subset_name in subset_list:
  for split_name in [self.train_split, self.eval_split]:
  if os.path.exists(dataset_name_or_path):
@@ -53,13 +59,25 @@ class CompetitionMathAdapter(DataAdapter):
  if os.path.exists(file_path):
  with open(file_path, 'r') as f:
  split_data.append(json.load(f))
- if subset_name in data_dict:
- data_dict[subset_name].update({split_name: split_data})
- else:
- data_dict[subset_name] = {split_name: split_data}
+ data_dict[subset_name][split_name] = split_data

  return data_dict

+ def gen_prompts(self, data_dict: dict) -> dict:
+ res_dict: dict = defaultdict(list)
+
+ # use level as subset
+ for sub_name, sub_data_dict in data_dict.items():
+ for sample_d in sub_data_dict[self.eval_split]:
+ level = sample_d['level']
+ if level not in self.subset_list:
+ continue
+ prompt_d = self.gen_prompt(input_d=sample_d, few_shot_list=None)
+ prompt_d[AnswerKeys.RAW_INPUT] = sample_d
+ res_dict[level].append(prompt_d)
+
+ return res_dict
+
  def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
  """
  Generate the prompt for the model input.
@@ -75,13 +93,13 @@ class CompetitionMathAdapter(DataAdapter):
  {'data': [prompt]}
  """
  use_fewshot = self.few_shot_num > 0
- full_prompt = self._generate_prompt(input_d, use_fewshot=use_fewshot)
-
- return {'data': [full_prompt], 'system_prompt': self.prompt_template}
+ query = self._generate_prompt(input_d, use_fewshot=use_fewshot)
+ full_prompt = self.prompt_template.format(query=query)
+ return {'data': [full_prompt], 'system_prompt': self.system_prompt}

  def get_gold_answer(self, input_d: dict) -> str:
  # Extract the gold answer from the input dict.
- return remove_boxed(last_boxed_only_string(input_d['solution']))
+ return strip_answer_string(extract_answer(input_d['solution']))

  def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
  """
@@ -96,18 +114,11 @@ class CompetitionMathAdapter(DataAdapter):
  The parsed answer. Depending on the dataset. Usually a string for chat.
  """
  # Note: Use same extraction method for both of checkpoint/service/custom
- try:
- result = remove_boxed(last_boxed_only_string(result))
- except Exception:
- return None
+ result = strip_answer_string(extract_answer(result))
  return result

  def match(self, gold: str, pred: str) -> float:
- res = 0
- if is_equiv(pred, gold):
- res = 1
-
- return res
+ return math_equal(pred, gold)

  @classmethod
  def _generate_prompt(cls, input_d: dict, use_fewshot: bool = True) -> str: