evalscope 0.12.0__tar.gz → 0.13.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (353)
  1. {evalscope-0.12.0/evalscope.egg-info → evalscope-0.13.0}/PKG-INFO +31 -12
  2. {evalscope-0.12.0 → evalscope-0.13.0}/README.md +30 -11
  3. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/arguments.py +6 -1
  4. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/aime/aime24_adapter.py +3 -3
  5. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/aime/aime25_adapter.py +3 -3
  6. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/arc/arc_adapter.py +15 -18
  7. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/bbh_adapter.py +6 -6
  8. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/benchmark.py +12 -11
  9. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/ceval/ceval_adapter.py +12 -16
  10. evalscope-0.13.0/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
  11. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +13 -17
  12. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -3
  13. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/data_adapter.py +59 -21
  14. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  15. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +9 -12
  16. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/general_qa/general_qa_adapter.py +30 -15
  17. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/gpqa/gpqa_adapter.py +12 -7
  18. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -3
  19. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -31
  20. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/humaneval/humaneval_adapter.py +10 -7
  21. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -3
  22. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
  23. evalscope-0.13.0/evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
  24. evalscope-0.13.0/evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
  25. evalscope-0.13.0/evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  26. evalscope-0.13.0/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
  27. evalscope-0.13.0/evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
  28. evalscope-0.13.0/evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  29. evalscope-0.13.0/evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  30. evalscope-0.13.0/evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
  31. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/math_500/math_500_adapter.py +2 -6
  32. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/mmlu/mmlu_adapter.py +13 -17
  33. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +9 -5
  34. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/musr/musr_adapter.py +8 -5
  35. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/process_bench/process_bench_adapter.py +8 -5
  36. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/race/race_adapter.py +12 -16
  37. evalscope-0.13.0/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +167 -0
  38. evalscope-0.13.0/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
  39. evalscope-0.13.0/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
  40. evalscope-0.13.0/evalscope/benchmarks/super_gpqa/utils.py +85 -0
  41. evalscope-0.13.0/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
  42. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
  43. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +6 -13
  44. evalscope-0.13.0/evalscope/benchmarks/utils.py +43 -0
  45. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/collections/evaluator.py +14 -5
  46. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/config.py +15 -2
  47. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/constants.py +14 -0
  48. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/evaluator/evaluator.py +51 -13
  49. evalscope-0.13.0/evalscope/metrics/llm_judge.py +104 -0
  50. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/metrics/named_metrics.py +1 -0
  51. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/models/__init__.py +2 -1
  52. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/models/base_adapter.py +25 -5
  53. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/models/chat_adapter.py +3 -0
  54. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/models/choice_adapter.py +4 -0
  55. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/models/custom_adapter.py +2 -0
  56. evalscope-0.13.0/evalscope/models/register.py +28 -0
  57. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/models/server_adapter.py +35 -8
  58. evalscope-0.13.0/evalscope/perf/__init__.py +0 -0
  59. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/arguments.py +13 -7
  60. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/benchmark.py +5 -0
  61. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/http_client.py +15 -5
  62. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/main.py +1 -0
  63. evalscope-0.13.0/evalscope/perf/utils/__init__.py +0 -0
  64. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/utils/analysis_result.py +1 -1
  65. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/report/app.py +3 -0
  66. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/report/combinator.py +2 -2
  67. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/run.py +6 -5
  68. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/longbench_write/infer.py +1 -1
  69. evalscope-0.13.0/evalscope/third_party/thinkbench/eval.py +429 -0
  70. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/thinkbench/infer.py +37 -7
  71. evalscope-0.13.0/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  72. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/thinkbench/tools/llm.py +1 -0
  73. evalscope-0.13.0/evalscope/third_party/toolbench_static/llm/swift_infer.py +67 -0
  74. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/utils/chat_service.py +1 -0
  75. evalscope-0.13.0/evalscope/utils/filters.py +59 -0
  76. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/utils/logger.py +3 -3
  77. evalscope-0.13.0/evalscope/version.py +4 -0
  78. {evalscope-0.12.0 → evalscope-0.13.0/evalscope.egg-info}/PKG-INFO +31 -12
  79. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope.egg-info/SOURCES.txt +23 -0
  80. evalscope-0.13.0/tests/cli/test_all.py +144 -0
  81. {evalscope-0.12.0 → evalscope-0.13.0}/tests/cli/test_collection.py +28 -2
  82. {evalscope-0.12.0 → evalscope-0.13.0}/tests/cli/test_run.py +201 -32
  83. evalscope-0.13.0/tests/rag/__init__.py +0 -0
  84. evalscope-0.12.0/evalscope/third_party/thinkbench/eval.py +0 -264
  85. evalscope-0.12.0/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -37
  86. evalscope-0.12.0/evalscope/version.py +0 -4
  87. {evalscope-0.12.0 → evalscope-0.13.0}/LICENSE +0 -0
  88. {evalscope-0.12.0 → evalscope-0.13.0}/MANIFEST.in +0 -0
  89. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/__init__.py +0 -0
  90. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/__init__.py +0 -0
  91. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/base.py +0 -0
  92. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/opencompass/__init__.py +0 -0
  93. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  94. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/opencompass/backend_manager.py +0 -0
  95. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  96. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  97. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
  98. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/__init__.py +0 -0
  99. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/backend_manager.py +0 -0
  100. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
  101. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
  102. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
  103. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
  104. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  105. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
  106. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
  107. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
  108. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
  109. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
  110. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  111. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
  112. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
  113. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
  114. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  115. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
  116. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
  117. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  118. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
  119. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  120. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  121. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
  122. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
  123. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
  124. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
  125. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
  126. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
  127. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
  128. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
  129. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
  130. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
  131. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
  132. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/utils/clip.py +0 -0
  133. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
  134. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/utils/llm.py +0 -0
  135. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/rag_eval/utils/tools.py +0 -0
  136. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  137. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
  138. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
  139. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/__init__.py +0 -0
  140. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/aime/__init__.py +0 -0
  141. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/arc/__init__.py +0 -0
  142. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  143. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/__init__.py +0 -0
  144. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  145. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  146. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  147. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  148. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  149. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  150. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  151. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  152. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  153. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  154. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  155. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  156. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  157. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  158. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  159. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  160. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  161. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  162. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  163. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  164. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  165. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  166. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  167. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  168. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  169. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  170. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  171. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/ceval/__init__.py +0 -0
  172. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  173. {evalscope-0.12.0/evalscope/benchmarks/data_collection → evalscope-0.13.0/evalscope/benchmarks/chinese_simple_qa}/__init__.py +0 -0
  174. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  175. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  176. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
  177. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  178. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  179. {evalscope-0.12.0/evalscope/benchmarks/general_mcq → evalscope-0.13.0/evalscope/benchmarks/data_collection}/__init__.py +0 -0
  180. {evalscope-0.12.0/evalscope/benchmarks/gpqa → evalscope-0.13.0/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
  181. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  182. {evalscope-0.12.0/evalscope/benchmarks/ifeval → evalscope-0.13.0/evalscope/benchmarks/gpqa}/__init__.py +0 -0
  183. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
  184. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  185. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  186. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  187. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  188. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  189. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  190. {evalscope-0.12.0/evalscope/benchmarks/iquiz → evalscope-0.13.0/evalscope/benchmarks/ifeval}/__init__.py +0 -0
  191. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/ifeval/instructions.py +0 -0
  192. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
  193. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
  194. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/ifeval/utils.py +0 -0
  195. {evalscope-0.12.0/evalscope/benchmarks/math_500 → evalscope-0.13.0/evalscope/benchmarks/iquiz}/__init__.py +0 -0
  196. {evalscope-0.12.0/evalscope/benchmarks/mmlu_pro → evalscope-0.13.0/evalscope/benchmarks/live_code_bench}/__init__.py +0 -0
  197. {evalscope-0.12.0/evalscope/benchmarks/musr → evalscope-0.13.0/evalscope/benchmarks/math_500}/__init__.py +0 -0
  198. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  199. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  200. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
  201. {evalscope-0.12.0/evalscope/benchmarks/process_bench → evalscope-0.13.0/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
  202. {evalscope-0.12.0/evalscope/perf → evalscope-0.13.0/evalscope/benchmarks/musr}/__init__.py +0 -0
  203. {evalscope-0.12.0/evalscope/perf/utils → evalscope-0.13.0/evalscope/benchmarks/process_bench}/__init__.py +0 -0
  204. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/process_bench/critique_template.txt +0 -0
  205. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/race/__init__.py +0 -0
  206. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/race/race.py +0 -0
  207. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/race/samples.jsonl +0 -0
  208. {evalscope-0.12.0/evalscope/third_party/thinkbench/tools → evalscope-0.13.0/evalscope/benchmarks/simple_qa}/__init__.py +0 -0
  209. {evalscope-0.12.0/tests/rag → evalscope-0.13.0/evalscope/benchmarks/super_gpqa}/__init__.py +0 -0
  210. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  211. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
  212. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  213. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  214. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  215. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/cli/__init__.py +0 -0
  216. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/cli/base.py +0 -0
  217. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/cli/cli.py +0 -0
  218. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/cli/start_app.py +0 -0
  219. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/cli/start_eval.py +0 -0
  220. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/cli/start_perf.py +0 -0
  221. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/cli/start_server.py +0 -0
  222. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/collections/__init__.py +0 -0
  223. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/collections/sampler.py +0 -0
  224. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/collections/schema.py +0 -0
  225. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/evaluator/__init__.py +0 -0
  226. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/evaluator/rating_eval.py +0 -0
  227. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/evaluator/reviewer/__init__.py +0 -0
  228. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
  229. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/metrics/__init__.py +0 -0
  230. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  231. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
  232. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/metrics/code_metric.py +0 -0
  233. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/metrics/math_parser.py +0 -0
  234. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/metrics/metrics.py +0 -0
  235. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -0
  236. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -0
  237. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/metrics/rouge_metric.py +0 -0
  238. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/models/custom/__init__.py +0 -0
  239. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/models/custom/custom_model.py +0 -0
  240. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/models/custom/dummy_model.py +0 -0
  241. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/models/local_model.py +0 -0
  242. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/models/model.py +0 -0
  243. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/plugin/__init__.py +0 -0
  244. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/plugin/api/__init__.py +0 -0
  245. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/plugin/api/base.py +0 -0
  246. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/plugin/api/custom_api.py +0 -0
  247. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
  248. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/plugin/api/openai_api.py +0 -0
  249. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/plugin/datasets/__init__.py +0 -0
  250. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/plugin/datasets/base.py +0 -0
  251. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/plugin/datasets/custom.py +0 -0
  252. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
  253. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
  254. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
  255. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/plugin/datasets/openqa.py +0 -0
  256. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
  257. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/plugin/registry.py +0 -0
  258. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/utils/benchmark_util.py +0 -0
  259. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/utils/db_util.py +0 -0
  260. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/utils/handler.py +0 -0
  261. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/perf/utils/local_server.py +0 -0
  262. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/registry/__init__.py +0 -0
  263. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/registry/config/cfg_arena.yaml +0 -0
  264. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
  265. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
  266. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/registry/config/cfg_single.yaml +0 -0
  267. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
  268. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
  269. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
  270. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
  271. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/registry/data/question.jsonl +0 -0
  272. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/registry/tasks/arc.yaml +0 -0
  273. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/registry/tasks/bbh.yaml +0 -0
  274. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
  275. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/registry/tasks/ceval.yaml +0 -0
  276. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
  277. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/registry/tasks/cmmlu.yaml +0 -0
  278. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
  279. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/registry/tasks/general_qa.yaml +0 -0
  280. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/registry/tasks/gsm8k.yaml +0 -0
  281. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/registry/tasks/mmlu.yaml +0 -0
  282. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
  283. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/report/__init__.py +0 -0
  284. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/report/generator.py +0 -0
  285. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/report/utils.py +0 -0
  286. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/run_arena.py +0 -0
  287. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/summarizer.py +0 -0
  288. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/__init__.py +0 -0
  289. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/longbench_write/README.md +0 -0
  290. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/longbench_write/__init__.py +0 -0
  291. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/longbench_write/default_task.json +0 -0
  292. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
  293. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/longbench_write/eval.py +0 -0
  294. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  295. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
  296. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  297. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  298. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  299. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  300. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
  301. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  302. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
  303. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/longbench_write/utils.py +0 -0
  304. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/thinkbench/__init__.py +0 -0
  305. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
  306. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
  307. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
  308. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/toolbench_static/README.md +0 -0
  309. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  310. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/toolbench_static/config_default.json +0 -0
  311. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
  312. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/toolbench_static/eval.py +0 -0
  313. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/toolbench_static/infer.py +0 -0
  314. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
  315. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
  316. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  317. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/utils/__init__.py +0 -0
  318. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/utils/arena_utils.py +0 -0
  319. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/utils/completion_parsers.py +0 -0
  320. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/utils/io_utils.py +0 -0
  321. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/utils/model_utils.py +0 -0
  322. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope/utils/utils.py +0 -0
  323. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope.egg-info/dependency_links.txt +0 -0
  324. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope.egg-info/entry_points.txt +0 -0
  325. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope.egg-info/not-zip-safe +0 -0
  326. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope.egg-info/requires.txt +0 -0
  327. {evalscope-0.12.0 → evalscope-0.13.0}/evalscope.egg-info/top_level.txt +0 -0
  328. {evalscope-0.12.0 → evalscope-0.13.0}/requirements/app.txt +0 -0
  329. {evalscope-0.12.0 → evalscope-0.13.0}/requirements/docs.txt +0 -0
  330. {evalscope-0.12.0 → evalscope-0.13.0}/requirements/framework.txt +0 -0
  331. {evalscope-0.12.0 → evalscope-0.13.0}/requirements/inner.txt +0 -0
  332. {evalscope-0.12.0 → evalscope-0.13.0}/requirements/opencompass.txt +0 -0
  333. {evalscope-0.12.0 → evalscope-0.13.0}/requirements/perf.txt +0 -0
  334. {evalscope-0.12.0 → evalscope-0.13.0}/requirements/rag.txt +0 -0
  335. {evalscope-0.12.0 → evalscope-0.13.0}/requirements/tests.txt +0 -0
  336. {evalscope-0.12.0 → evalscope-0.13.0}/requirements/vlmeval.txt +0 -0
  337. {evalscope-0.12.0 → evalscope-0.13.0}/requirements.txt +0 -0
  338. {evalscope-0.12.0 → evalscope-0.13.0}/setup.cfg +0 -0
  339. {evalscope-0.12.0 → evalscope-0.13.0}/setup.py +0 -0
  340. {evalscope-0.12.0 → evalscope-0.13.0}/tests/__init__.py +0 -0
  341. {evalscope-0.12.0 → evalscope-0.13.0}/tests/cli/__init__.py +0 -0
  342. {evalscope-0.12.0 → evalscope-0.13.0}/tests/perf/__init__.py +0 -0
  343. {evalscope-0.12.0 → evalscope-0.13.0}/tests/perf/test_perf.py +0 -0
  344. {evalscope-0.12.0 → evalscope-0.13.0}/tests/rag/test_clip_benchmark.py +0 -0
  345. {evalscope-0.12.0 → evalscope-0.13.0}/tests/rag/test_mteb.py +0 -0
  346. {evalscope-0.12.0 → evalscope-0.13.0}/tests/rag/test_ragas.py +0 -0
  347. {evalscope-0.12.0 → evalscope-0.13.0}/tests/swift/__init__.py +0 -0
  348. {evalscope-0.12.0 → evalscope-0.13.0}/tests/swift/test_run_swift_eval.py +0 -0
  349. {evalscope-0.12.0 → evalscope-0.13.0}/tests/swift/test_run_swift_vlm_eval.py +0 -0
  350. {evalscope-0.12.0 → evalscope-0.13.0}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
  351. {evalscope-0.12.0 → evalscope-0.13.0}/tests/test_run_all.py +0 -0
  352. {evalscope-0.12.0 → evalscope-0.13.0}/tests/vlm/__init__.py +0 -0
  353. {evalscope-0.12.0 → evalscope-0.13.0}/tests/vlm/test_vlmeval.py +0 -0
{evalscope-0.12.0/evalscope.egg-info → evalscope-0.13.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.12.0
+ Version: 0.13.0
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -175,16 +175,29 @@ Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
  > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!

  ## 📋 Contents
- - [Introduction](#-introduction)
- - [News](#-news)
- - [Installation](#️-installation)
- - [Quick Start](#-quick-start)
+ - [📋 Contents](#-contents)
+ - [📝 Introduction](#-introduction)
+ - [☎ User Groups](#-user-groups)
+ - [🎉 News](#-news)
+ - [🛠️ Installation](#️-installation)
+ - [Method 1: Install Using pip](#method-1-install-using-pip)
+ - [Method 2: Install from Source](#method-2-install-from-source)
+ - [🚀 Quick Start](#-quick-start)
+ - [Method 1. Using Command Line](#method-1-using-command-line)
+ - [Method 2. Using Python Code](#method-2-using-python-code)
+ - [Basic Parameter](#basic-parameter)
+ - [Output Results](#output-results)
+ - [📈 Visualization of Evaluation Results](#-visualization-of-evaluation-results)
+ - [🌐 Evaluation of Specified Model API](#-evaluation-of-specified-model-api)
+ - [⚙️ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
+ - [Parameter](#parameter)
  - [Evaluation Backend](#evaluation-backend)
- - [Custom Dataset Evaluation](#️-custom-dataset-evaluation)
- - [Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
- - [Arena Mode](#-arena-mode)
- - [Contribution](#️-contribution)
- - [Roadmap](#-roadmap)
+ - [📈 Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
+ - [🖊️ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
+ - [🏟️ Arena Mode](#️-arena-mode)
+ - [👷‍♂️ Contribution](#️-contribution)
+ - [🔜 Roadmap](#-roadmap)
+ - [Star History](#star-history)


  ## 📝 Introduction
@@ -225,10 +238,16 @@ Please scan the QR code below to join our community groups:


  ## 🎉 News
- - 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
+
+ - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark. You can use it by specifying `live_code_bench`.
+ - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
+ - 🔥 **[2025.03.07]** Added support for the [QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B/summary) model, evaluate the model's reasoning ability and reasoning efficiency, refer to [📖 Best Practices for QwQ-32B Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html) for more details.
+ - 🔥 **[2025.03.04]** Added support for the [SuperGPQA](https://modelscope.cn/datasets/m-a-p/SuperGPQA/summary) dataset, which covers 13 categories, 72 first-level disciplines, and 285 second-level disciplines, totaling 26,529 questions. You can use it by specifying `super_gpqa`.
+ - 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
+ - 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/en/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
  - 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
  - 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
- - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
+ - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
  - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
  - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
  - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
{evalscope-0.12.0 → evalscope-0.13.0}/README.md
@@ -24,16 +24,29 @@
  > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!

  ## 📋 Contents
- - [Introduction](#-introduction)
- - [News](#-news)
- - [Installation](#️-installation)
- - [Quick Start](#-quick-start)
+ - [📋 Contents](#-contents)
+ - [📝 Introduction](#-introduction)
+ - [☎ User Groups](#-user-groups)
+ - [🎉 News](#-news)
+ - [🛠️ Installation](#️-installation)
+ - [Method 1: Install Using pip](#method-1-install-using-pip)
+ - [Method 2: Install from Source](#method-2-install-from-source)
+ - [🚀 Quick Start](#-quick-start)
+ - [Method 1. Using Command Line](#method-1-using-command-line)
+ - [Method 2. Using Python Code](#method-2-using-python-code)
+ - [Basic Parameter](#basic-parameter)
+ - [Output Results](#output-results)
+ - [📈 Visualization of Evaluation Results](#-visualization-of-evaluation-results)
+ - [🌐 Evaluation of Specified Model API](#-evaluation-of-specified-model-api)
+ - [⚙️ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
+ - [Parameter](#parameter)
  - [Evaluation Backend](#evaluation-backend)
- - [Custom Dataset Evaluation](#️-custom-dataset-evaluation)
- - [Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
- - [Arena Mode](#-arena-mode)
- - [Contribution](#️-contribution)
- - [Roadmap](#-roadmap)
+ - [📈 Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
+ - [🖊️ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
+ - [🏟️ Arena Mode](#️-arena-mode)
+ - [👷‍♂️ Contribution](#️-contribution)
+ - [🔜 Roadmap](#-roadmap)
+ - [Star History](#star-history)


  ## 📝 Introduction
@@ -74,10 +87,16 @@ Please scan the QR code below to join our community groups:


  ## 🎉 News
- - 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
+
+ - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark. You can use it by specifying `live_code_bench`.
+ - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
+ - 🔥 **[2025.03.07]** Added support for the [QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B/summary) model, evaluate the model's reasoning ability and reasoning efficiency, refer to [📖 Best Practices for QwQ-32B Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html) for more details.
+ - 🔥 **[2025.03.04]** Added support for the [SuperGPQA](https://modelscope.cn/datasets/m-a-p/SuperGPQA/summary) dataset, which covers 13 categories, 72 first-level disciplines, and 285 second-level disciplines, totaling 26,529 questions. You can use it by specifying `super_gpqa`.
+ - 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
+ - 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/en/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
  - 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
  - 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
- - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
+ - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
  - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
  - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
  - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
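
The news entries above select each new benchmark purely by name. As a rough illustration only (not part of this diff), a run over the 0.13.0 additions following the quick-start pattern this README already documents might look like the sketch below; the model ID and `limit` value are placeholders.

```python
from evalscope import TaskConfig, run_task

# Sketch only: dataset names come from the news entries above;
# the model ID and limit are placeholders, not values from this release.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',  # placeholder model ID
    datasets=[
        'live_code_bench',   # LiveCodeBench (2025.03.13)
        'super_gpqa',        # SuperGPQA (2025.03.04)
        'simple_qa',         # SimpleQA (2025.03.11)
        'chinese_simpleqa',  # Chinese SimpleQA (2025.03.11)
    ],
    limit=5,  # evaluate only a few samples per dataset while trying things out
)
run_task(task_cfg=task_cfg)
```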
{evalscope-0.12.0 → evalscope-0.13.0}/evalscope/arguments.py
@@ -1,7 +1,7 @@
  import argparse
  import json

- from evalscope.constants import EvalBackend, EvalStage, EvalType
+ from evalscope.constants import EvalBackend, EvalStage, EvalType, JudgeStrategy, OutputType


  class ParseStrArgsAction(argparse.Action):
@@ -73,6 +73,11 @@ def add_argument(parser: argparse.ArgumentParser):
  parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
  parser.add_argument('--timeout', type=float, default=None, help='The timeout for the remote API model.')
  parser.add_argument('--stream', action='store_true', default=False, help='Stream mode.') # noqa: E501
+
+ # LLMJudge arguments
+ parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
+ parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.') # noqa: E501
+ parser.add_argument('--judge-worker-num', type=int, default=8, help='The number of workers for the judge model.')
  # yapf: enable

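
A minimal sketch of exercising the three judge flags added above (not part of this diff): it builds a parser with the module's `add_argument` helper named in the hunk header and parses only the new options, assuming every other option keeps its default. The literal `auto` value and the JSON keys passed to `--judge-model-args` are assumptions; the accepted values live in `JudgeStrategy` (evalscope/constants.py) and the parameter documentation.

```python
import argparse

from evalscope.arguments import add_argument  # helper shown in the hunk header above

parser = argparse.ArgumentParser()
add_argument(parser)

# Parse only the new judge-related flags; everything else keeps its default.
# The JSON keys below are hypothetical placeholders for the judge model settings.
args = parser.parse_args([
    '--judge-strategy', 'auto',                        # assumed string value of JudgeStrategy.AUTO
    '--judge-model-args', '{"model_id": "my-judge"}',  # parsed with json.loads per the new flag
    '--judge-worker-num', '4',
])
print(args.judge_strategy, args.judge_model_args, args.judge_worker_num)
```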
{evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/aime/aime24_adapter.py
@@ -1,6 +1,6 @@
  from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import OutputType
  from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
- from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.logger import get_logger

  # flake8: noqa
@@ -10,8 +10,8 @@ logger = get_logger()

  @Benchmark.register(
  name='aime24',
+ pretty_name='AIME-2024',
  dataset_id='HuggingFaceH4/aime_2024',
- model_adapter=ChatGenerationModelAdapter,
  subset_list=['default'],
  metric_list=['AveragePass@1'],
  few_shot_num=0,
@@ -31,7 +31,7 @@ class AIME24Adapter(DataAdapter):
  problem = input_d['problem']
  full_prompt = self.prompt_template.format(query=problem)

- return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+ return self.gen_prompt_data(full_prompt)

  def get_gold_answer(self, input_d: dict) -> str:
  # Extract the gold answer from the input dict.
{evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/aime/aime25_adapter.py
@@ -1,6 +1,6 @@
  from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import OutputType
  from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
- from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.logger import get_logger

  # flake8: noqa
@@ -10,8 +10,8 @@ logger = get_logger()

  @Benchmark.register(
  name='aime25',
+ pretty_name='AIME-2025',
  dataset_id='TIGER-Lab/AIME25',
- model_adapter=ChatGenerationModelAdapter,
  subset_list=['default'],
  metric_list=['AveragePass@1'],
  few_shot_num=0,
@@ -31,7 +31,7 @@ class AIME25Adapter(DataAdapter):
  problem = input_d['question']
  full_prompt = self.prompt_template.format(query=problem)

- return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+ return self.gen_prompt_data(full_prompt)

  def get_gold_answer(self, input_d: dict) -> str:
  # Extract the gold answer from the input dict.
{evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/arc/arc_adapter.py
@@ -4,9 +4,8 @@ import json
  import os

  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType
+ from evalscope.constants import EvalType, OutputType
  from evalscope.metrics import exact_match
- from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger

@@ -17,19 +16,20 @@ logger = get_logger()

  @Benchmark.register(
  name='arc',
+ pretty_name='ARC',
  dataset_id='modelscope/ai2_arc',
- model_adapter=MultiChoiceModelAdapter,
+ model_adapter=OutputType.MULTIPLE_CHOICE,
+ output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
  subset_list=['ARC-Easy', 'ARC-Challenge'],
  metric_list=['AverageAccuracy'],
  few_shot_num=0,
  train_split='train',
  eval_split='test',
- prompt_template='',
+ prompt_template=
+ 'Given the following question and four candidate answers (A, B, C and D), choose the best answer.\n{query}\nYour response should end with "The best answer is [the_answer_letter]" where the [the_answer_letter] is one of A, B, C or D.', # noqa
  )
  class ARCAdapter(DataAdapter):

- choices = ['A', 'B', 'C', 'D']
-
  def __init__(self, **kwargs):
  few_shot_num = kwargs.get('few_shot_num', None)
  if few_shot_num is None:
@@ -42,6 +42,8 @@ class ARCAdapter(DataAdapter):

  super().__init__(**kwargs)

+ self.choices = ['A', 'B', 'C', 'D']
+
  def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
  """
  Load the dataset from local disk.
@@ -60,7 +62,7 @@
  for split_name in ['Train', 'Test']:
  split_path = os.path.join(subset_path, f'{subset_name}-{split_name}.jsonl')
  if os.path.exists(split_path):
- with open(split_path, 'r', errors='ignore') as in_f:
+ with open(split_path, 'r', errors='ignore', encoding='utf-8') as in_f:
  rows = []
  for line in in_f:
  item = json.loads(line.strip())
@@ -107,12 +109,11 @@
  {'data': ['xxx'], 'multi_choices': ['A', 'B', 'C', 'D']}
  """
  few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
- context: str = '\n'.join(few_shot_prompts)
+ context = '\n'.join(few_shot_prompts) + self._generate_prompt(input_d=input_d, include_answer=False)

- # context = f'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n {context}'
- full_prompt: str = context + self._generate_prompt(input_d=input_d, include_answer=False)
+ full_prompt = self.prompt_template.format(query=context)

- return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+ return self.gen_prompt_data(full_prompt)

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
@@ -130,14 +131,10 @@
  Returns:
  The parsed answer. Depending on the dataset. Usually a string for chat.
  """
- if eval_type == EvalType.CHECKPOINT:
+ if self.model_adapter == OutputType.MULTIPLE_CHOICE:
  return result
- elif eval_type == EvalType.SERVICE:
- return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
- elif eval_type == EvalType.CUSTOM:
- return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
  else:
- raise ValueError(f'Invalid eval_type: {eval_type}')
+ return ResponseParser.parse_first_option(text=result)

  def match(self, gold: str, pred: str) -> float:
  return exact_match(gold=gold, pred=pred)
@@ -152,8 +149,8 @@
  choices_prompts: str = '\n'.join([label + '. ' + text for text, label in zip(choices_texts, choices_labels)])
  example += '\n' + choices_prompts

- example += '\nAnswer:'
  if include_answer:
+ example += '\nAnswer:'
  example += ' {}\n\n'.format(input_d['answerKey'])

  return example
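
The rewritten answer-parsing branch above is the pattern this release applies across the multiple-choice adapters: in `OutputType.MULTIPLE_CHOICE` mode the raw result is already an option letter, while in generation mode the free-form completion, steered by the new prompt template ending in "The best answer is ...", is reduced to a letter with `ResponseParser`. A small sketch of that second path with a made-up completion (not from the diff); the printed value is expected, not guaranteed, to be the option letter.

```python
from evalscope.utils import ResponseParser  # same import path used by the adapter above

# Hypothetical completion produced under the new generation-mode prompt template.
completion = 'Let me reason about the options step by step... The best answer is B'

# parse_first_option replaces parse_first_option_with_choices in the generation branch;
# it is expected to pull the option letter (here presumably 'B') out of free-form text.
print(ResponseParser.parse_first_option(text=completion))
```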
{evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/bbh/bbh_adapter.py
@@ -8,8 +8,6 @@ import re
  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import AnswerKeys
  from evalscope.metrics import exact_match
- from evalscope.models.chat_adapter import ChatGenerationModelAdapter
- from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger

  # flake8: noqa
@@ -60,8 +58,8 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST

  @Benchmark.register(
  name='bbh',
+ pretty_name='BBH',
  dataset_id='modelscope/bbh',
- model_adapter=ChatGenerationModelAdapter,
  subset_list=SUBSET_LIST,
  metric_list=['AverageAccuracy'],
  few_shot_num=3,
@@ -94,7 +92,7 @@ class BBHAdapter(DataAdapter):
  else:
  file_path: str = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}.json')
  if os.path.exists(file_path):
- with open(file_path, 'r') as f:
+ with open(file_path, 'r', encoding='utf-8') as f:
  examples = json.load(f)['examples']
  if subset_name in data_dict:
  data_dict[subset_name].update({split_name: examples})
@@ -125,7 +123,7 @@
  cot_prompts = ''
  full_prompt = cot_prompts + self.prompt_template.format(query=input_d['input'])

- return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+ return self.gen_prompt_data(full_prompt)

  def gen_prompts(self, data_dict: dict) -> dict:
  """
@@ -153,7 +151,9 @@
  for sub_name, sub_data_dict in data_dict.items():
  few_shot_data = []
  if self.few_shot_num > 0:
- with open(os.path.join(os.path.dirname(__file__), 'cot_prompts', f'{sub_name}.txt'), 'r') as f:
+ with open(
+ os.path.join(os.path.dirname(__file__), 'cot_prompts', f'{sub_name}.txt'), 'r',
+ encoding='utf-8') as f:
  cot_prompt_str = f.read()
  few_shot_data = [cot_prompt_str]

{evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/benchmark.py
@@ -1,12 +1,13 @@
  import copy
- from dataclasses import dataclass, field
+ from collections import OrderedDict
+ from dataclasses import dataclass, field, fields
  from typing import TYPE_CHECKING, Dict, List, Optional

+ from evalscope.constants import OutputType
+
  if TYPE_CHECKING:
  from evalscope.benchmarks import DataAdapter

- from evalscope.models import BaseModelAdapter
-
  BENCHMARK_MAPPINGS = {}


@@ -15,8 +16,9 @@ class BenchmarkMeta:
  name: str
  dataset_id: str
  data_adapter: 'DataAdapter'
- model_adapter: BaseModelAdapter
- subset_list: List[str] = field(default_factory=list)
+ model_adapter: Optional[str] = OutputType.GENERATION
+ output_types: Optional[List[str]] = field(default_factory=lambda: [OutputType.GENERATION])
+ subset_list: List[str] = field(default_factory=lambda: ['default'])
  metric_list: List[str] = field(default_factory=list)
  few_shot_num: int = 0
  few_shot_random: bool = False
@@ -26,6 +28,8 @@
  system_prompt: Optional[str] = None
  query_template: Optional[str] = None
  pretty_name: Optional[str] = None
+ filters: Optional[OrderedDict] = None
+ extra_params: Optional[Dict] = field(default_factory=dict)

  def _update(self, args: dict):
  if args.get('local_path'):
@@ -37,12 +41,9 @@
  return self.__dict__

  def to_string_dict(self) -> dict:
- cur_dict = copy.deepcopy(self.__dict__)
+ cur_dict = copy.deepcopy(self.to_dict())
  # cur_dict['data_adapter'] = self.data_adapter.__name__
- # cur_dict['model_adapter'] = self.model_adapter.__name__
- # cur_dict['metric_list'] = [metric['name'] for metric in self.metric_list]
  del cur_dict['data_adapter']
- del cur_dict['model_adapter']
  return cur_dict

  def get_data_adapter(self, config: dict = {}) -> 'DataAdapter':
@@ -66,13 +67,13 @@ class Benchmark:
  return benchmark

  @classmethod
- def register(cls, name: str, dataset_id: str, model_adapter: BaseModelAdapter, **kwargs):
+ def register(cls, name: str, dataset_id: str, **kwargs):

  def register_wrapper(data_adapter):
  if name in BENCHMARK_MAPPINGS:
  raise Exception(f'Benchmark {name} already registered')
  BENCHMARK_MAPPINGS[name] = BenchmarkMeta(
- name=name, data_adapter=data_adapter, model_adapter=model_adapter, dataset_id=dataset_id, **kwargs)
+ name=name, data_adapter=data_adapter, dataset_id=dataset_id, **kwargs)
  return data_adapter

  return register_wrapper
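
With the `BenchmarkMeta` changes above, `Benchmark.register` no longer takes a model-adapter class; a registration declares an output-type string plus the list of supported output types, as the adapter diffs in this release do. Below is a hedged sketch of a registration in the new style, using placeholder names and only fields visible in this diff; the `gen_prompt` signature and the `question` field are assumptions, and a working adapter would also implement the gold-answer, parsing, and matching hooks.

```python
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import OutputType


@Benchmark.register(
    name='my_mcq_bench',                       # placeholder benchmark name
    pretty_name='My-MCQ-Bench',
    dataset_id='my-org/my_mcq_dataset',        # placeholder dataset ID
    model_adapter=OutputType.MULTIPLE_CHOICE,  # default adapter is now an OutputType string
    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
    subset_list=['default'],
    metric_list=['AverageAccuracy'],
    few_shot_num=0,
    train_split='train',
    eval_split='test',
    prompt_template='Answer the following question.\n{query}',
)
class MyMCQAdapter(DataAdapter):

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        # Signature and the 'question' key are assumed for illustration; wrap the formatted
        # prompt with gen_prompt_data, as the updated adapters in this diff now do.
        return self.gen_prompt_data(self.prompt_template.format(query=input_d['question']))
```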
{evalscope-0.12.0 → evalscope-0.13.0}/evalscope/benchmarks/ceval/ceval_adapter.py
@@ -3,9 +3,8 @@ import csv
  import os

  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType
+ from evalscope.constants import EvalType, OutputType
  from evalscope.metrics.metrics import exact_match
- from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger

@@ -126,19 +125,20 @@ SUBJECT_MAPPING = {

  @Benchmark.register(
  name='ceval',
+ pretty_name='C-Eval',
  dataset_id='modelscope/ceval-exam',
- model_adapter=MultiChoiceModelAdapter,
+ model_adapter=OutputType.MULTIPLE_CHOICE,
+ output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
  subset_list=SUBSET_LIST,
  metric_list=['AverageAccuracy'],
  few_shot_num=0,
  train_split='dev',
  eval_split='val',
- prompt_template='以下是中国关于{subset_name}考试的单项选择题,请选出其中的正确答案。\n{query}',
+ prompt_template=
+ '以下是中国关于{subset_name}考试的单项选择题,请选出其中的正确答案。你的回答的最后一行应该是这样的格式:“答案是:LETTER”(不带引号),其中 LETTER 是 A、B、C、D 中的一个。\n{query}',
  )
  class CEVALAdapter(DataAdapter):

- choices = ['A', 'B', 'C', 'D']
-
  def __init__(self, **kwargs):

  few_shot_num = kwargs.get('few_shot_num', 0)
@@ -148,6 +148,7 @@ class CEVALAdapter(DataAdapter):
  super().__init__(**kwargs)

  self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+ self.choices = ['A', 'B', 'C', 'D']

  def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
  data_dict = {}
@@ -207,7 +208,7 @@
  subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
  full_prompt = self.prompt_template.format(subset_name=subject_name, query=query)

- return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+ return self.gen_prompt_data(full_prompt)

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
@@ -225,22 +226,17 @@
  Returns:
  The parsed answer. Depending on the dataset. Usually a string for chat.
  """
- if eval_type == EvalType.CHECKPOINT:
+ if self.model_adapter == OutputType.MULTIPLE_CHOICE:
  return result
- elif eval_type == EvalType.SERVICE:
- return ResponseParser.parse_first_option_with_choices(result, self.choices)
- elif eval_type == EvalType.CUSTOM:
- return ResponseParser.parse_first_option_with_choices(result, self.choices)
  else:
- raise ValueError(f'Invalid eval_type: {eval_type}')
+ return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)

  def match(self, gold: str, pred: str) -> float:
  return exact_match(gold=gold, pred=pred)

- @classmethod
- def _format_example(cls, input_d: dict, include_answer=True):
+ def _format_example(self, input_d: dict, include_answer=True):
  example = '问题:' + input_d['question']
- for choice in cls.choices:
+ for choice in self.choices:
  example += f'\n{choice}. {input_d[f"{choice}"]}'

  if include_answer: