evalscope 0.10.1__tar.gz → 0.11.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (317)
  1. {evalscope-0.10.1/evalscope.egg-info → evalscope-0.11.0}/PKG-INFO +14 -5
  2. {evalscope-0.10.1 → evalscope-0.11.0}/README.md +1 -0
  3. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/arguments.py +1 -0
  4. evalscope-0.11.0/evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
  5. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/arc/arc_adapter.py +5 -7
  6. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
  7. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/benchmark.py +2 -2
  8. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
  9. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
  10. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
  11. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/data_adapter.py +18 -12
  12. evalscope-0.11.0/evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
  13. evalscope-0.11.0/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
  14. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
  15. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/gpqa/gpqa_adapter.py +26 -8
  16. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
  17. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
  18. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
  19. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -13
  20. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
  21. evalscope-0.11.0/evalscope/benchmarks/math_500/__init__.py +0 -0
  22. evalscope-0.11.0/evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
  23. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
  24. evalscope-0.11.0/evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  25. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
  26. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/race/race_adapter.py +3 -3
  27. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
  28. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
  29. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/collections/evaluator.py +103 -39
  30. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/collections/sampler.py +2 -1
  31. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/collections/schema.py +1 -2
  32. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/config.py +1 -0
  33. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/evaluator/evaluator.py +78 -64
  34. evalscope-0.11.0/evalscope/metrics/math_parser.py +526 -0
  35. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/metrics/metrics.py +16 -1
  36. evalscope-0.11.0/evalscope/metrics/named_metrics.py +41 -0
  37. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/models/chat_adapter.py +69 -49
  38. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/models/choice_adapter.py +52 -45
  39. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/models/custom_adapter.py +2 -2
  40. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/models/local_model.py +4 -0
  41. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/models/server_adapter.py +28 -34
  42. evalscope-0.11.0/evalscope/perf/utils/__init__.py +0 -0
  43. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/report/app.py +30 -15
  44. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/run.py +10 -7
  45. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/utils/chat_service.py +2 -2
  46. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/utils/io_utils.py +1 -1
  47. evalscope-0.11.0/evalscope/version.py +4 -0
  48. {evalscope-0.10.1 → evalscope-0.11.0/evalscope.egg-info}/PKG-INFO +14 -5
  49. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope.egg-info/SOURCES.txt +9 -2
  50. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope.egg-info/requires.txt +12 -4
  51. {evalscope-0.10.1 → evalscope-0.11.0}/requirements/app.txt +1 -1
  52. {evalscope-0.10.1 → evalscope-0.11.0}/requirements/framework.txt +6 -2
  53. {evalscope-0.10.1 → evalscope-0.11.0}/tests/cli/test_run.py +93 -16
  54. evalscope-0.11.0/tests/rag/__init__.py +0 -0
  55. evalscope-0.10.1/evalscope/benchmarks/ceval/samples.jsonl +0 -1
  56. evalscope-0.10.1/evalscope/metrics/math_accuracy.py +0 -200
  57. evalscope-0.10.1/evalscope/metrics/named_metrics.py +0 -17
  58. evalscope-0.10.1/evalscope/version.py +0 -4
  59. {evalscope-0.10.1 → evalscope-0.11.0}/LICENSE +0 -0
  60. {evalscope-0.10.1 → evalscope-0.11.0}/MANIFEST.in +0 -0
  61. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/__init__.py +0 -0
  62. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/__init__.py +0 -0
  63. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/base.py +0 -0
  64. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/opencompass/__init__.py +0 -0
  65. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  66. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/opencompass/backend_manager.py +0 -0
  67. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  68. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  69. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
  70. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/__init__.py +0 -0
  71. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/backend_manager.py +0 -0
  72. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
  73. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
  74. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
  75. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
  76. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  77. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
  78. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
  79. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
  80. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
  81. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
  82. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  83. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
  84. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
  85. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
  86. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  87. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
  88. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
  89. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  90. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
  91. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  92. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  93. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
  94. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
  95. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
  96. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
  97. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
  98. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
  99. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
  100. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
  101. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
  102. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
  103. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
  104. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/utils/clip.py +0 -0
  105. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
  106. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/utils/llm.py +0 -0
  107. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/rag_eval/utils/tools.py +0 -0
  108. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  109. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
  110. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
  111. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/__init__.py +0 -0
  112. {evalscope-0.10.1/evalscope/benchmarks/gpqa → evalscope-0.11.0/evalscope/benchmarks/aime24}/__init__.py +0 -0
  113. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/arc/__init__.py +0 -0
  114. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  115. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/__init__.py +0 -0
  116. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  117. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  118. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  119. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  120. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  121. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  122. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  123. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  124. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  125. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  126. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  127. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  128. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  129. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  130. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  131. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  132. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  133. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  134. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  135. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  136. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  137. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  138. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  139. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  140. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  141. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  142. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  143. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/ceval/__init__.py +0 -0
  144. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  145. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  146. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  147. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
  148. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  149. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  150. {evalscope-0.10.1/evalscope/benchmarks/ifeval → evalscope-0.11.0/evalscope/benchmarks/data_collection}/__init__.py +0 -0
  151. {evalscope-0.10.1/evalscope/benchmarks/iquiz → evalscope-0.11.0/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
  152. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  153. {evalscope-0.10.1/evalscope/benchmarks/mmlu_pro → evalscope-0.11.0/evalscope/benchmarks/gpqa}/__init__.py +0 -0
  154. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
  155. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  156. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  157. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  158. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  159. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  160. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  161. {evalscope-0.10.1/evalscope/perf/utils → evalscope-0.11.0/evalscope/benchmarks/ifeval}/__init__.py +0 -0
  162. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/ifeval/instructions.py +0 -0
  163. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
  164. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
  165. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/ifeval/utils.py +0 -0
  166. {evalscope-0.10.1/tests/rag → evalscope-0.11.0/evalscope/benchmarks/iquiz}/__init__.py +0 -0
  167. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  168. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  169. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
  170. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/race/__init__.py +0 -0
  171. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/race/race.py +0 -0
  172. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/race/samples.jsonl +0 -0
  173. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  174. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
  175. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  176. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  177. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  178. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/cli/__init__.py +0 -0
  179. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/cli/base.py +0 -0
  180. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/cli/cli.py +0 -0
  181. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/cli/start_app.py +0 -0
  182. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/cli/start_eval.py +0 -0
  183. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/cli/start_perf.py +0 -0
  184. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/cli/start_server.py +0 -0
  185. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/collections/__init__.py +0 -0
  186. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/constants.py +0 -0
  187. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/evaluator/__init__.py +0 -0
  188. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/evaluator/rating_eval.py +0 -0
  189. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/evaluator/reviewer/__init__.py +0 -0
  190. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
  191. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/metrics/__init__.py +0 -0
  192. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  193. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
  194. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/metrics/code_metric.py +0 -0
  195. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -0
  196. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -0
  197. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/metrics/rouge_metric.py +0 -0
  198. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/models/__init__.py +0 -0
  199. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/models/base_adapter.py +0 -0
  200. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/models/custom/__init__.py +0 -0
  201. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/models/custom/custom_model.py +0 -0
  202. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/models/custom/dummy_model.py +0 -0
  203. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/models/model.py +0 -0
  204. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/__init__.py +0 -0
  205. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/arguments.py +0 -0
  206. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/benchmark.py +0 -0
  207. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/http_client.py +0 -0
  208. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/main.py +0 -0
  209. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/__init__.py +0 -0
  210. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/api/__init__.py +0 -0
  211. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/api/base.py +0 -0
  212. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/api/custom_api.py +0 -0
  213. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
  214. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/api/openai_api.py +0 -0
  215. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/__init__.py +0 -0
  216. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/base.py +0 -0
  217. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/custom.py +0 -0
  218. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
  219. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
  220. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
  221. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/openqa.py +0 -0
  222. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
  223. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/plugin/registry.py +0 -0
  224. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/utils/analysis_result.py +0 -0
  225. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/utils/benchmark_util.py +0 -0
  226. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/utils/db_util.py +0 -0
  227. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/utils/handler.py +0 -0
  228. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/perf/utils/local_server.py +0 -0
  229. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/__init__.py +0 -0
  230. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/config/cfg_arena.yaml +0 -0
  231. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
  232. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
  233. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/config/cfg_single.yaml +0 -0
  234. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
  235. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
  236. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
  237. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
  238. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/data/question.jsonl +0 -0
  239. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/tasks/arc.yaml +0 -0
  240. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/tasks/bbh.yaml +0 -0
  241. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
  242. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/tasks/ceval.yaml +0 -0
  243. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
  244. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/tasks/cmmlu.yaml +0 -0
  245. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
  246. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/tasks/general_qa.yaml +0 -0
  247. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/tasks/gsm8k.yaml +0 -0
  248. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/tasks/mmlu.yaml +0 -0
  249. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
  250. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/report/__init__.py +0 -0
  251. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/report/combinator.py +0 -0
  252. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/report/generator.py +0 -0
  253. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/report/utils.py +0 -0
  254. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/run_arena.py +0 -0
  255. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/summarizer.py +0 -0
  256. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/__init__.py +0 -0
  257. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/README.md +0 -0
  258. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/__init__.py +0 -0
  259. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/default_task.json +0 -0
  260. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
  261. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/eval.py +0 -0
  262. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/infer.py +0 -0
  263. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  264. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
  265. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  266. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  267. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  268. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  269. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
  270. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  271. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
  272. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/longbench_write/utils.py +0 -0
  273. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/README.md +0 -0
  274. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  275. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/config_default.json +0 -0
  276. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
  277. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/eval.py +0 -0
  278. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/infer.py +0 -0
  279. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
  280. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  281. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
  282. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  283. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/utils/__init__.py +0 -0
  284. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/utils/arena_utils.py +0 -0
  285. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/utils/completion_parsers.py +0 -0
  286. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/utils/logger.py +0 -0
  287. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/utils/model_utils.py +0 -0
  288. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope/utils/utils.py +0 -0
  289. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope.egg-info/dependency_links.txt +0 -0
  290. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope.egg-info/entry_points.txt +0 -0
  291. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope.egg-info/not-zip-safe +0 -0
  292. {evalscope-0.10.1 → evalscope-0.11.0}/evalscope.egg-info/top_level.txt +0 -0
  293. {evalscope-0.10.1 → evalscope-0.11.0}/requirements/docs.txt +0 -0
  294. {evalscope-0.10.1 → evalscope-0.11.0}/requirements/inner.txt +0 -0
  295. {evalscope-0.10.1 → evalscope-0.11.0}/requirements/opencompass.txt +0 -0
  296. {evalscope-0.10.1 → evalscope-0.11.0}/requirements/perf.txt +0 -0
  297. {evalscope-0.10.1 → evalscope-0.11.0}/requirements/rag.txt +0 -0
  298. {evalscope-0.10.1 → evalscope-0.11.0}/requirements/tests.txt +0 -0
  299. {evalscope-0.10.1 → evalscope-0.11.0}/requirements/vlmeval.txt +0 -0
  300. {evalscope-0.10.1 → evalscope-0.11.0}/requirements.txt +0 -0
  301. {evalscope-0.10.1 → evalscope-0.11.0}/setup.cfg +0 -0
  302. {evalscope-0.10.1 → evalscope-0.11.0}/setup.py +0 -0
  303. {evalscope-0.10.1 → evalscope-0.11.0}/tests/__init__.py +0 -0
  304. {evalscope-0.10.1 → evalscope-0.11.0}/tests/cli/__init__.py +0 -0
  305. {evalscope-0.10.1 → evalscope-0.11.0}/tests/cli/test_collection.py +0 -0
  306. {evalscope-0.10.1 → evalscope-0.11.0}/tests/perf/__init__.py +0 -0
  307. {evalscope-0.10.1 → evalscope-0.11.0}/tests/perf/test_perf.py +0 -0
  308. {evalscope-0.10.1 → evalscope-0.11.0}/tests/rag/test_clip_benchmark.py +0 -0
  309. {evalscope-0.10.1 → evalscope-0.11.0}/tests/rag/test_mteb.py +0 -0
  310. {evalscope-0.10.1 → evalscope-0.11.0}/tests/rag/test_ragas.py +0 -0
  311. {evalscope-0.10.1 → evalscope-0.11.0}/tests/swift/__init__.py +0 -0
  312. {evalscope-0.10.1 → evalscope-0.11.0}/tests/swift/test_run_swift_eval.py +0 -0
  313. {evalscope-0.10.1 → evalscope-0.11.0}/tests/swift/test_run_swift_vlm_eval.py +0 -0
  314. {evalscope-0.10.1 → evalscope-0.11.0}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
  315. {evalscope-0.10.1 → evalscope-0.11.0}/tests/test_run_all.py +0 -0
  316. {evalscope-0.10.1 → evalscope-0.11.0}/tests/vlm/__init__.py +0 -0
  317. {evalscope-0.10.1 → evalscope-0.11.0}/tests/vlm/test_vlmeval.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.10.1
+ Version: 0.11.0
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -19,10 +19,12 @@ License-File: LICENSE
  Requires-Dist: absl-py
  Requires-Dist: accelerate
  Requires-Dist: cachetools
- Requires-Dist: datasets<=3.0.1,>=3.0.0
+ Requires-Dist: datasets<=3.2.0,>=3.0.0
  Requires-Dist: editdistance
  Requires-Dist: jieba
  Requires-Dist: jsonlines
+ Requires-Dist: langdetect
+ Requires-Dist: latex2sympy2
  Requires-Dist: matplotlib
  Requires-Dist: modelscope[framework]
  Requires-Dist: nltk>=3.9
@@ -42,12 +44,14 @@ Requires-Dist: scikit-learn
  Requires-Dist: seaborn
  Requires-Dist: sentencepiece
  Requires-Dist: simple-ddl-parser
+ Requires-Dist: sympy
  Requires-Dist: tabulate
  Requires-Dist: tiktoken
  Requires-Dist: torch
  Requires-Dist: tqdm
  Requires-Dist: transformers>=4.33
  Requires-Dist: transformers_stream_generator
+ Requires-Dist: word2number
  Provides-Extra: opencompass
  Requires-Dist: ms-opencompass>=0.1.4; extra == "opencompass"
  Provides-Extra: vlmeval
@@ -64,7 +68,7 @@ Requires-Dist: sse_starlette; extra == "perf"
  Requires-Dist: transformers; extra == "perf"
  Requires-Dist: unicorn; extra == "perf"
  Provides-Extra: app
- Requires-Dist: gradio>=5.4.0; extra == "app"
+ Requires-Dist: gradio==5.4.0; extra == "app"
  Requires-Dist: plotly>=5.23.0; extra == "app"
  Provides-Extra: inner
  Requires-Dist: absl-py; extra == "inner"
@@ -96,10 +100,12 @@ Provides-Extra: all
  Requires-Dist: absl-py; extra == "all"
  Requires-Dist: accelerate; extra == "all"
  Requires-Dist: cachetools; extra == "all"
- Requires-Dist: datasets<=3.0.1,>=3.0.0; extra == "all"
+ Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
  Requires-Dist: editdistance; extra == "all"
  Requires-Dist: jieba; extra == "all"
  Requires-Dist: jsonlines; extra == "all"
+ Requires-Dist: langdetect; extra == "all"
+ Requires-Dist: latex2sympy2; extra == "all"
  Requires-Dist: matplotlib; extra == "all"
  Requires-Dist: modelscope[framework]; extra == "all"
  Requires-Dist: nltk>=3.9; extra == "all"
@@ -119,12 +125,14 @@ Requires-Dist: scikit-learn; extra == "all"
  Requires-Dist: seaborn; extra == "all"
  Requires-Dist: sentencepiece; extra == "all"
  Requires-Dist: simple-ddl-parser; extra == "all"
+ Requires-Dist: sympy; extra == "all"
  Requires-Dist: tabulate; extra == "all"
  Requires-Dist: tiktoken; extra == "all"
  Requires-Dist: torch; extra == "all"
  Requires-Dist: tqdm; extra == "all"
  Requires-Dist: transformers>=4.33; extra == "all"
  Requires-Dist: transformers_stream_generator; extra == "all"
+ Requires-Dist: word2number; extra == "all"
  Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
  Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
  Requires-Dist: mteb==1.19.4; extra == "all"
@@ -136,7 +144,7 @@ Requires-Dist: numpy; extra == "all"
  Requires-Dist: sse_starlette; extra == "all"
  Requires-Dist: transformers; extra == "all"
  Requires-Dist: unicorn; extra == "all"
- Requires-Dist: gradio>=5.4.0; extra == "all"
+ Requires-Dist: gradio==5.4.0; extra == "all"
  Requires-Dist: plotly>=5.23.0; extra == "all"

  <p align="center">
@@ -215,6 +223,7 @@ Please scan the QR code below to join our community groups:


  ## 🎉 News
+ - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
  - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
  - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
  - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
README.md
@@ -74,6 +74,7 @@ Please scan the QR code below to join our community groups:


  ## 🎉 News
+ - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
  - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
  - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
  - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
evalscope/arguments.py
@@ -58,6 +58,7 @@ def add_argument(parser: argparse.ArgumentParser):
  parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
  choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
  parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')
+ parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')

  # Cache and working directory arguments
  parser.add_argument('--mem-cache', action='store_true', default=False, help='Deprecated, will be removed in v1.0.0.') # noqa: E501
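Note: the new `--eval-batch-size` flag is the CLI counterpart of the `eval_batch_size` parameter called out in the 0.11.0 news entry above. A minimal sketch of driving it from Python, assuming `run_task` in evalscope/run.py and `TaskConfig` in evalscope/config.py keep their existing signatures and that `TaskConfig` exposes the flag as `eval_batch_size` (the model id below is a placeholder):

```python
from evalscope.run import run_task
from evalscope.config import TaskConfig

# Sketch only: field names other than datasets/limit/eval_batch_size are assumptions, not taken from this diff.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model id
    datasets=['gsm8k'],
    limit=5,                # CLI: --limit 5
    eval_batch_size=8,      # CLI: --eval-batch-size 8 (new in 0.11.0)
)
run_task(task_cfg)
```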
evalscope/benchmarks/aime24/aime24_adapter.py (new file)
@@ -0,0 +1,49 @@
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+ from evalscope.models import ChatGenerationModelAdapter
+ from evalscope.utils.logger import get_logger
+
+ # flake8: noqa
+
+ logger = get_logger()
+
+
+ @Benchmark.register(
+ name='aime24',
+ dataset_id='HuggingFaceH4/aime_2024',
+ model_adapter=ChatGenerationModelAdapter,
+ subset_list=['default'],
+ metric_list=['AveragePass@1'],
+ few_shot_num=0,
+ train_split=None,
+ eval_split='train', # Only train set is available
+ prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+ )
+ class AIME24Adapter(DataAdapter):
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+ """
+ Generate the prompt for the model input.
+ """
+ problem = input_d['problem']
+ full_prompt = self.prompt_template.format(query=problem)
+
+ return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+ def get_gold_answer(self, input_d: dict) -> str:
+ # Extract the gold answer from the input dict.
+ return strip_answer_string(input_d['answer'])
+
+ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+ """
+ Parse the model output to get the answer. Could be the best choice index.
+ """
+ # Note: Use same extraction method for both of checkpoint/service/custom
+ result = strip_answer_string(extract_answer(result))
+ return result
+
+ def match(self, gold: str, pred: str) -> float:
+ return math_equal(pred, gold)
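A hedged illustration (not part of the package) of the scoring path this new adapter wires up with the evalscope.metrics.math_parser helpers it imports; the sample response and the expected outcome are assumptions for illustration only:

```python
from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string

# Made-up model response in the format the prompt template asks for.
model_output = r"Reasoning step by step ... Therefore the final answer is \boxed{204}."

pred = strip_answer_string(extract_answer(model_output))  # pull out and normalize the boxed answer
gold = strip_answer_string('204')                         # AIME gold answers are short integer strings
print(math_equal(pred, gold))                             # expected to print True when the values agree
```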
evalscope/benchmarks/arc/arc_adapter.py
@@ -5,7 +5,7 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.metrics import exact_match
  from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
@@ -20,7 +20,7 @@ logger = get_logger()
  dataset_id='modelscope/ai2_arc',
  model_adapter=MultiChoiceModelAdapter,
  subset_list=['ARC-Easy', 'ARC-Challenge'],
- metric_list=[AverageAccuracy],
+ metric_list=['AverageAccuracy'],
  few_shot_num=0,
  train_split='train',
  eval_split='test',
@@ -112,7 +112,7 @@ class ARCAdapter(DataAdapter):
  # context = f'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n {context}'
  full_prompt: str = context + self._generate_prompt(input_d=input_d, include_answer=False)

- return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+ return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
@@ -133,11 +133,9 @@ class ARCAdapter(DataAdapter):
  if eval_type == EvalType.CHECKPOINT:
  return result
  elif eval_type == EvalType.SERVICE:
- return ResponseParser.parse_first_option_with_choices(
- text=result, options=self.choices) # TODO: to be checked !
+ return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
  elif eval_type == EvalType.CUSTOM:
- return ResponseParser.parse_first_option_with_choices(
- text=result, options=self.choices) # TODO: to be checked !
+ return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
  else:
  raise ValueError(f'Invalid eval_type: {eval_type}')

evalscope/benchmarks/bbh/bbh_adapter.py
@@ -7,7 +7,7 @@ import re

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import AnswerKeys
- from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.metrics import exact_match
  from evalscope.models.chat_adapter import ChatGenerationModelAdapter
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
@@ -63,11 +63,11 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
  dataset_id='modelscope/bbh',
  model_adapter=ChatGenerationModelAdapter,
  subset_list=SUBSET_LIST,
- metric_list=[AverageAccuracy],
+ metric_list=['AverageAccuracy'],
  few_shot_num=3,
  train_split=None,
  eval_split='test',
- prompt_template='',
+ prompt_template="Q: {query}\nA: Let's think step by step.",
  )
  class BBHAdapter(DataAdapter):
  """
@@ -119,10 +119,13 @@
  {'data': ['xxx']}
  """
  # few_shot_list: should be ['xxxx']
- cot_prompts: str = few_shot_list[0] if len(few_shot_list) > 0 else ''
- full_prompt: str = f"Follow the given examples and answer the question.\n{cot_prompts}\n\nQ: {input_d['input']}\nA: Let's think step by step."
+ if len(few_shot_list) > 0:
+ cot_prompts = 'Follow the given examples and answer the question.\n' + few_shot_list[0]
+ else:
+ cot_prompts = ''
+ full_prompt = cot_prompts + self.prompt_template.format(query=input_d['input'])

- return {'data': [full_prompt], 'system_prompt': self.prompt_template}
+ return {'data': [full_prompt], 'system_prompt': self.system_prompt}

  def gen_prompts(self, data_dict: dict) -> dict:
  """
@@ -177,9 +180,11 @@

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
- gold = input_d.get('target')
+ gold = input_d.get('target', '')
+ # remove brackets
  if gold is None:
  logger.error(f'BBHAdapter: gold is None.')
+ gold = gold.replace('(', '').replace(')', '')
  return gold

  def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
@@ -228,8 +233,11 @@
  """
  Extract the answer from the model output for Free-form task.
  """
- res = ResponseParser.parse_first_option(ans)
- if res:
+ pattern = r'answer is\s+(.*?)\.'
+
+ match = re.search(pattern, ans)
+ if match:
+ res = match.group(1)
  return res

  ans_line = ans.split('answer is ')
evalscope/benchmarks/benchmark.py
@@ -17,12 +17,13 @@ class BenchmarkMeta:
  data_adapter: 'DataAdapter'
  model_adapter: BaseModelAdapter
  subset_list: List[str] = field(default_factory=list)
- metric_list: List[dict] = field(default_factory=list)
+ metric_list: List[str] = field(default_factory=list)
  few_shot_num: int = 0
  few_shot_random: bool = False
  train_split: Optional[str] = None
  eval_split: Optional[str] = None
  prompt_template: Optional[str] = None
+ system_prompt: Optional[str] = None

  def _update(self, args: dict):
  if args.get('local_path'):
@@ -40,7 +41,6 @@
  # cur_dict['metric_list'] = [metric['name'] for metric in self.metric_list]
  del cur_dict['data_adapter']
  del cur_dict['model_adapter']
- del cur_dict['metric_list']
  return cur_dict

  def get_data_adapter(self, config: dict = {}) -> 'DataAdapter':
evalscope/benchmarks/ceval/ceval_adapter.py
@@ -4,10 +4,9 @@

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import AverageAccuracy
- from evalscope.metrics.metrics import exact_match, weighted_mean
+ from evalscope.metrics.metrics import exact_match
  from evalscope.models import MultiChoiceModelAdapter
- from evalscope.utils import ResponseParser, normalize_score
+ from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger

  # flake8: noqa
@@ -130,10 +129,11 @@ SUBJECT_MAPPING = {
  dataset_id='modelscope/ceval-exam',
  model_adapter=MultiChoiceModelAdapter,
  subset_list=SUBSET_LIST,
- metric_list=[AverageAccuracy],
+ metric_list=['AverageAccuracy'],
  few_shot_num=0,
  train_split='dev',
  eval_split='val',
+ prompt_template='以下是中国关于{subset_name}考试的单项选择题,请选出其中的正确答案。\n{query}',
  )
  class CEVALAdapter(DataAdapter):

@@ -202,12 +202,12 @@
  else:
  context = ''

- full_prompt: str = context.strip() + self._format_example(input_d=input_d, include_answer=False)
+ query: str = context.strip() + self._format_example(input_d=input_d, include_answer=False)

  subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
- full_prompt = f'以下是中国关于{subject_name}考试的单项选择题,请选出其中的正确答案。\n' + full_prompt
+ full_prompt = self.prompt_template.format(subset_name=subject_name, query=query)

- return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+ return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
@@ -228,9 +228,9 @@
  if eval_type == EvalType.CHECKPOINT:
  return result
  elif eval_type == EvalType.SERVICE:
- return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
+ return ResponseParser.parse_first_option_with_choices(result, self.choices)
  elif eval_type == EvalType.CUSTOM:
- return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
+ return ResponseParser.parse_first_option_with_choices(result, self.choices)
  else:
  raise ValueError(f'Invalid eval_type: {eval_type}')

evalscope/benchmarks/cmmlu/cmmlu_adapter.py
@@ -5,9 +5,9 @@

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.metrics import exact_match
  from evalscope.models import MultiChoiceModelAdapter
- from evalscope.utils import ResponseParser, normalize_score
+ from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger

  # flake8: noqa
@@ -106,10 +106,11 @@ SUBJECT_MAPPING = {
  dataset_id='modelscope/cmmlu',
  model_adapter=MultiChoiceModelAdapter,
  subset_list=SUBSET_LIST,
- metric_list=[AverageAccuracy],
+ metric_list=['AverageAccuracy'],
  few_shot_num=5,
  train_split='dev',
  eval_split='test',
+ prompt_template='以下是关于{subset_name}的单项选择题,请直接给出正确答案的选项。\n{query}',
  )
  class CMMLUAdapter(DataAdapter):

@@ -165,16 +166,13 @@
  {'data': [(context, continuation), ...]}

  """
- prompt = '以下是关于{}的单项选择题。\n\n'.format(self._format_subject(subset_name))
  few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-
- context: str = '\n'.join(few_shot_prompts) + '\n'
+ context = '\n'.join(few_shot_prompts) + '\n'
  context += self._generate_prompt(input_d=input_d, include_answer=False)
- context = prompt + context

- full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
+ full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=context.strip())

- return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': prompt}
+ return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
@@ -195,9 +193,9 @@
  if eval_type == EvalType.CHECKPOINT:
  return result
  elif eval_type == EvalType.SERVICE:
- return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
+ return ResponseParser.parse_first_option_with_choices(result, self.choices)
  elif eval_type == EvalType.CUSTOM:
- return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
+ return ResponseParser.parse_first_option_with_choices(result, self.choices)
  else:
  raise ValueError(f'Invalid eval_type: {eval_type}')

evalscope/benchmarks/competition_math/competition_math_adapter.py
@@ -3,10 +3,11 @@
  import glob
  import json
  import os
+ from collections import defaultdict

  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import AverageAccuracy
- from evalscope.metrics.math_accuracy import is_equiv, last_boxed_only_string, remove_boxed
+ from evalscope.constants import AnswerKeys
+ from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
  from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.logger import get_logger

@@ -19,12 +20,12 @@
  name='competition_math',
  dataset_id='modelscope/competition_math',
  model_adapter=ChatGenerationModelAdapter,
- subset_list=['default'],
- metric_list=[AverageAccuracy],
+ subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+ metric_list=['AveragePass@1'],
  few_shot_num=4,
  train_split='train',
  eval_split='test',
- prompt_template='Put the final answer in \\boxed{}.',
+ prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
  )
  class CompetitionMathAdapter(DataAdapter):
  """ To be tested for all models. """
@@ -39,8 +40,13 @@

  super().__init__(**kwargs)

+ def load(self, **kwargs):
+ # default load all levels
+ kwargs['subset_list'] = ['default']
+ return super().load(**kwargs)
+
  def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
- data_dict: dict = {}
+ data_dict = defaultdict(dict)
  for subset_name in subset_list:
  for split_name in [self.train_split, self.eval_split]:
  if os.path.exists(dataset_name_or_path):
@@ -53,13 +59,25 @@
  if os.path.exists(file_path):
  with open(file_path, 'r') as f:
  split_data.append(json.load(f))
- if subset_name in data_dict:
- data_dict[subset_name].update({split_name: split_data})
- else:
- data_dict[subset_name] = {split_name: split_data}
+ data_dict[subset_name][split_name] = split_data

  return data_dict

+ def gen_prompts(self, data_dict: dict) -> dict:
+ res_dict: dict = defaultdict(list)
+
+ # use level as subset
+ for sub_name, sub_data_dict in data_dict.items():
+ for sample_d in sub_data_dict[self.eval_split]:
+ level = sample_d['level']
+ if level not in self.subset_list:
+ continue
+ prompt_d = self.gen_prompt(input_d=sample_d, few_shot_list=None)
+ prompt_d[AnswerKeys.RAW_INPUT] = sample_d
+ res_dict[level].append(prompt_d)
+
+ return res_dict
+
  def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
  """
  Generate the prompt for the model input.
@@ -75,13 +93,13 @@
  {'data': [prompt]}
  """
  use_fewshot = self.few_shot_num > 0
- full_prompt = self._generate_prompt(input_d, use_fewshot=use_fewshot)
-
- return {'data': [full_prompt], 'system_prompt': self.prompt_template}
+ query = self._generate_prompt(input_d, use_fewshot=use_fewshot)
+ full_prompt = self.prompt_template.format(query=query)
+ return {'data': [full_prompt], 'system_prompt': self.system_prompt}

  def get_gold_answer(self, input_d: dict) -> str:
  # Extract the gold answer from the input dict.
- return remove_boxed(last_boxed_only_string(input_d['solution']))
+ return strip_answer_string(extract_answer(input_d['solution']))

  def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
  """
@@ -96,18 +114,11 @@
  The parsed answer. Depending on the dataset. Usually a string for chat.
  """
  # Note: Use same extraction method for both of checkpoint/service/custom
- try:
- result = remove_boxed(last_boxed_only_string(result))
- except Exception:
- return None
+ result = strip_answer_string(extract_answer(result))
  return result

  def match(self, gold: str, pred: str) -> float:
- res = 0
- if is_equiv(pred, gold):
- res = 1
-
- return res
+ return math_equal(pred, gold)

  @classmethod
  def _generate_prompt(cls, input_d: dict, use_fewshot: bool = True) -> str:
evalscope/benchmarks/data_adapter.py
@@ -2,10 +2,10 @@
  import os.path
  import random
  from abc import ABC, abstractmethod
- from typing import Any, List, Optional
+ from typing import Any, List, Optional, Union

  from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
- from evalscope.metrics import Metric
+ from evalscope.metrics.named_metrics import metric_registry
  from evalscope.report import Report, ReportGenerator
  from evalscope.utils.logger import get_logger

@@ -16,12 +16,14 @@ class DataAdapter(ABC):

  def __init__(self,
  name: str,
+ dataset_id: str,
  subset_list: list,
- metric_list: List[Metric],
+ metric_list: List[str],
  few_shot_num: Optional[int] = 0,
  train_split: Optional[str] = None,
  eval_split: Optional[str] = None,
  prompt_template: Optional[str] = None,
+ system_prompt: Optional[str] = None,
  **kwargs):
  """
  Data Adapter for the benchmark. You need to implement the following methods:
@@ -31,6 +33,7 @@ class DataAdapter(ABC):
  - match
  Args:
  name: str, the name of the benchmark.
+ dataset_id: str, the dataset id on ModelScope or local path for the benchmark.
  subset_list: list of subset names for the dataset.
  metric_list: list, the metric list to evaluate the model on specific benchmark.
  few_shot_num: int, number of few-shot examples. Default: 0
@@ -41,17 +44,19 @@
  the form of A or B or C or D, do not output explanation:`
  """
  self.name = name
+ self.dataset_id = dataset_id
  self.subset_list = subset_list
  self.metric_list = metric_list
  self.few_shot_num = few_shot_num
  self.train_split = train_split
  self.eval_split = eval_split
  self.prompt_template = prompt_template
+ self.system_prompt = system_prompt
  self.config_kwargs = kwargs
  self.category_map = kwargs.get('category_map', {})

  def load(self,
- dataset_name_or_path: str,
+ dataset_name_or_path: str = None,
  subset_list: list = None,
  work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
  datasets_hub: str = HubType.MODELSCOPE,
@@ -64,7 +69,7 @@ class DataAdapter(ABC):
  train_dataset, test_dataset: Iterable dataset, object each item of which is a dict.

  """
- dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
+ dataset_name_or_path = os.path.expanduser(dataset_name_or_path or self.dataset_id)
  subset_list = subset_list or self.subset_list

  # Try to load dataset from local disk
@@ -156,7 +161,7 @@
  else:
  return data_list[:k]

- def compute_metric(self, review_res_list: list) -> List[dict]:
+ def compute_metric(self, review_res_list: Union[dict, list]) -> List[dict]:
  """
  Compute evaluation result by specific metrics.

@@ -170,14 +175,15 @@
  raise ValueError('No metric list found for the benchmark.')

  res_list = []
- for metric in self.metric_list:
+ for metric_str in self.metric_list:
+ metric = metric_registry.get(metric_str)
  metric_name = metric.name
  metric_func = metric.object
- res_list.append({
- 'metric_name': metric_name,
- 'score': metric_func(review_res_list),
- 'num': len(review_res_list)
- })
+ if isinstance(review_res_list, dict):
+ review_res = review_res_list.get(metric_name, [])
+ else:
+ review_res = review_res_list
+ res_list.append({'metric_name': metric_name, 'score': metric_func(review_res), 'num': len(review_res)})
  return res_list

  def gen_report(self, subset_score_map: dict, report_name: str = None, **kwargs) -> Report:
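A hedged sketch of the metric lookup the reworked compute_metric performs; the registry access pattern (metric_registry.get returning an object with .name and .object) comes from the hunk above, while the metric name and the per-sample scores are illustrative assumptions:

```python
from evalscope.metrics.named_metrics import metric_registry

metric = metric_registry.get('AverageAccuracy')  # name string from an adapter's metric_list
review_res = [1, 0, 1, 1]                        # assumed per-sample review results
print({'metric_name': metric.name, 'score': metric.object(review_res), 'num': len(review_res)})
```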