evalscope 0.8.2__tar.gz → 0.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of evalscope might be problematic.

Files changed (314)
  1. {evalscope-0.8.2/evalscope.egg-info → evalscope-0.9.0}/PKG-INFO +32 -15
  2. {evalscope-0.8.2 → evalscope-0.9.0}/README.md +31 -14
  3. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/__init__.py +2 -0
  4. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/arguments.py +10 -3
  5. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  6. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/utils/llm.py +1 -1
  7. evalscope-0.9.0/evalscope/benchmarks/__init__.py +23 -0
  8. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/arc/arc_adapter.py +23 -99
  9. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/bbh_adapter.py +19 -89
  10. evalscope-0.9.0/evalscope/benchmarks/benchmark.py +76 -0
  11. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/ceval/ceval_adapter.py +22 -46
  12. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +20 -41
  13. evalscope-0.9.0/evalscope/benchmarks/competition_math/competition_math_adapter.py +126 -0
  14. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/data_adapter.py +114 -85
  15. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/general_qa/general_qa_adapter.py +16 -19
  16. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +19 -98
  17. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -96
  18. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/humaneval/humaneval_adapter.py +16 -117
  19. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/mmlu/mmlu_adapter.py +26 -48
  20. evalscope-0.9.0/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  21. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/race/race_adapter.py +25 -53
  22. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +24 -97
  23. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +23 -33
  24. evalscope-0.9.0/evalscope/collections/__init__.py +3 -0
  25. evalscope-0.9.0/evalscope/collections/evaluator.py +178 -0
  26. evalscope-0.9.0/evalscope/collections/sampler.py +132 -0
  27. evalscope-0.9.0/evalscope/collections/schema.py +122 -0
  28. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/config.py +7 -5
  29. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/constants.py +7 -28
  30. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/evaluator/evaluator.py +66 -109
  31. evalscope-0.9.0/evalscope/evaluator/reviewer/__init__.py +1 -0
  32. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  33. evalscope-0.9.0/evalscope/metrics/__init__.py +7 -0
  34. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  35. evalscope-0.9.0/evalscope/metrics/math_accuracy.py +200 -0
  36. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/metrics/metrics.py +7 -4
  37. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/metrics/rouge_metric.py +13 -8
  38. evalscope-0.9.0/evalscope/models/__init__.py +16 -0
  39. evalscope-0.9.0/evalscope/models/base_adapter.py +52 -0
  40. evalscope-0.9.0/evalscope/models/chat_adapter.py +138 -0
  41. evalscope-0.9.0/evalscope/models/choice_adapter.py +211 -0
  42. evalscope-0.9.0/evalscope/models/custom_adapter.py +67 -0
  43. evalscope-0.9.0/evalscope/models/local_model.py +74 -0
  44. evalscope-0.9.0/evalscope/models/model.py +229 -0
  45. evalscope-0.9.0/evalscope/models/server_adapter.py +104 -0
  46. evalscope-0.9.0/evalscope/registry/__init__.py +1 -0
  47. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/run.py +37 -66
  48. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/run_arena.py +1 -1
  49. evalscope-0.9.0/evalscope/third_party/__init__.py +1 -0
  50. evalscope-0.9.0/evalscope/third_party/longbench_write/resources/__init__.py +1 -0
  51. evalscope-0.9.0/evalscope/third_party/longbench_write/tools/__init__.py +1 -0
  52. evalscope-0.9.0/evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
  53. evalscope-0.9.0/evalscope/tools/__init__.py +1 -0
  54. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/utils/__init__.py +1 -1
  55. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/utils/chat_service.py +4 -3
  56. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/utils/io_utils.py +8 -0
  57. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/utils/logger.py +4 -0
  58. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/utils/model_utils.py +10 -0
  59. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/utils/utils.py +3 -25
  60. evalscope-0.9.0/evalscope/version.py +4 -0
  61. {evalscope-0.8.2 → evalscope-0.9.0/evalscope.egg-info}/PKG-INFO +32 -15
  62. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope.egg-info/SOURCES.txt +14 -5
  63. evalscope-0.9.0/tests/__init__.py +1 -0
  64. evalscope-0.9.0/tests/cli/__init__.py +1 -0
  65. evalscope-0.9.0/tests/cli/test_collection.py +53 -0
  66. {evalscope-0.8.2 → evalscope-0.9.0}/tests/cli/test_run.py +43 -1
  67. evalscope-0.9.0/tests/perf/__init__.py +1 -0
  68. evalscope-0.9.0/tests/rag/__init__.py +0 -0
  69. {evalscope-0.8.2 → evalscope-0.9.0}/tests/rag/test_mteb.py +3 -2
  70. evalscope-0.9.0/tests/swift/__init__.py +1 -0
  71. evalscope-0.9.0/tests/vlm/__init__.py +1 -0
  72. evalscope-0.8.2/evalscope/benchmarks/__init__.py +0 -4
  73. evalscope-0.8.2/evalscope/benchmarks/arc/__init__.py +0 -6
  74. evalscope-0.8.2/evalscope/benchmarks/bbh/__init__.py +0 -5
  75. evalscope-0.8.2/evalscope/benchmarks/benchmark.py +0 -65
  76. evalscope-0.8.2/evalscope/benchmarks/ceval/__init__.py +0 -6
  77. evalscope-0.8.2/evalscope/benchmarks/cmmlu/__init__.py +0 -6
  78. evalscope-0.8.2/evalscope/benchmarks/competition_math/__init__.py +0 -6
  79. evalscope-0.8.2/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -468
  80. evalscope-0.8.2/evalscope/benchmarks/general_qa/__init__.py +0 -6
  81. evalscope-0.8.2/evalscope/benchmarks/gsm8k/__init__.py +0 -5
  82. evalscope-0.8.2/evalscope/benchmarks/hellaswag/__init__.py +0 -6
  83. evalscope-0.8.2/evalscope/benchmarks/humaneval/__init__.py +0 -5
  84. evalscope-0.8.2/evalscope/benchmarks/mmlu/__init__.py +0 -6
  85. evalscope-0.8.2/evalscope/benchmarks/race/__init__.py +0 -6
  86. evalscope-0.8.2/evalscope/benchmarks/trivia_qa/__init__.py +0 -6
  87. evalscope-0.8.2/evalscope/benchmarks/truthful_qa/__init__.py +0 -6
  88. evalscope-0.8.2/evalscope/metrics/math_accuracy.py +0 -57
  89. evalscope-0.8.2/evalscope/models/__init__.py +0 -3
  90. evalscope-0.8.2/evalscope/models/api/__init__.py +0 -3
  91. evalscope-0.8.2/evalscope/models/dummy_chat_model.py +0 -49
  92. evalscope-0.8.2/evalscope/models/model.py +0 -88
  93. evalscope-0.8.2/evalscope/models/model_adapter.py +0 -525
  94. evalscope-0.8.2/evalscope/models/openai_model.py +0 -103
  95. evalscope-0.8.2/evalscope/version.py +0 -4
  96. {evalscope-0.8.2 → evalscope-0.9.0}/LICENSE +0 -0
  97. {evalscope-0.8.2 → evalscope-0.9.0}/MANIFEST.in +0 -0
  98. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/__init__.py +0 -0
  99. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/base.py +0 -0
  100. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/opencompass/__init__.py +0 -0
  101. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  102. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/opencompass/backend_manager.py +0 -0
  103. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  104. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  105. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
  106. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/__init__.py +0 -0
  107. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/backend_manager.py +0 -0
  108. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
  109. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
  110. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
  111. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
  112. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  113. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
  114. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
  115. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
  116. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
  117. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  118. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
  119. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
  120. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
  121. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  122. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
  123. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
  124. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  125. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
  126. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  127. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  128. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
  129. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
  130. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
  131. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
  132. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
  133. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
  134. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
  135. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
  136. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
  137. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
  138. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
  139. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/utils/clip.py +0 -0
  140. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
  141. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/utils/tools.py +0 -0
  142. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  143. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
  144. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
  145. {evalscope-0.8.2/evalscope/cli → evalscope-0.9.0/evalscope/benchmarks/arc}/__init__.py +0 -0
  146. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  147. {evalscope-0.8.2/evalscope/evaluator/reviewer → evalscope-0.9.0/evalscope/benchmarks/bbh}/__init__.py +0 -0
  148. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  149. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  150. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  151. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  152. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  153. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  154. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  155. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  156. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  157. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  158. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  159. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  160. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  161. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  162. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  163. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  164. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  165. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  166. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  167. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  168. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  169. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  170. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  171. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  172. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  173. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  174. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  175. {evalscope-0.8.2/evalscope/metrics → evalscope-0.9.0/evalscope/benchmarks/ceval}/__init__.py +0 -0
  176. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  177. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/ceval/samples.jsonl +0 -0
  178. {evalscope-0.8.2/evalscope/registry → evalscope-0.9.0/evalscope/benchmarks/cmmlu}/__init__.py +0 -0
  179. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  180. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
  181. {evalscope-0.8.2/evalscope/third_party → evalscope-0.9.0/evalscope/benchmarks/competition_math}/__init__.py +0 -0
  182. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  183. {evalscope-0.8.2/evalscope/third_party/longbench_write/resources → evalscope-0.9.0/evalscope/benchmarks/general_qa}/__init__.py +0 -0
  184. {evalscope-0.8.2/evalscope/third_party/longbench_write/tools → evalscope-0.9.0/evalscope/benchmarks/gsm8k}/__init__.py +0 -0
  185. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  186. {evalscope-0.8.2/evalscope/third_party/toolbench_static/llm → evalscope-0.9.0/evalscope/benchmarks/hellaswag}/__init__.py +0 -0
  187. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  188. {evalscope-0.8.2/evalscope/tools → evalscope-0.9.0/evalscope/benchmarks/humaneval}/__init__.py +0 -0
  189. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  190. {evalscope-0.8.2/tests → evalscope-0.9.0/evalscope/benchmarks/mmlu}/__init__.py +0 -0
  191. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  192. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
  193. {evalscope-0.8.2/evalscope/perf → evalscope-0.9.0/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
  194. {evalscope-0.8.2/tests/cli → evalscope-0.9.0/evalscope/benchmarks/race}/__init__.py +0 -0
  195. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/race/race.py +0 -0
  196. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/race/samples.jsonl +0 -0
  197. {evalscope-0.8.2/tests/perf → evalscope-0.9.0/evalscope/benchmarks/trivia_qa}/__init__.py +0 -0
  198. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
  199. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  200. {evalscope-0.8.2/tests/swift → evalscope-0.9.0/evalscope/benchmarks/truthful_qa}/__init__.py +0 -0
  201. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  202. {evalscope-0.8.2/tests/vlm → evalscope-0.9.0/evalscope/cli}/__init__.py +0 -0
  203. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/cli/base.py +0 -0
  204. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/cli/cli.py +0 -0
  205. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/cli/start_eval.py +0 -0
  206. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/cli/start_perf.py +0 -0
  207. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/cli/start_server.py +0 -0
  208. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/evaluator/__init__.py +0 -0
  209. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/evaluator/rating_eval.py +0 -0
  210. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  211. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/metrics/code_metric.py +0 -0
  212. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -0
  213. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -0
  214. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/models/custom/__init__.py +0 -0
  215. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/models/custom/custom_model.py +0 -0
  216. {evalscope-0.8.2/evalscope/perf/utils → evalscope-0.9.0/evalscope/perf}/__init__.py +0 -0
  217. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/arguments.py +0 -0
  218. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/benchmark.py +0 -0
  219. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/http_client.py +0 -0
  220. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/main.py +0 -0
  221. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/__init__.py +0 -0
  222. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/api/__init__.py +0 -0
  223. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/api/base.py +0 -0
  224. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/api/custom_api.py +0 -0
  225. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
  226. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/api/openai_api.py +0 -0
  227. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/datasets/__init__.py +0 -0
  228. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/datasets/base.py +0 -0
  229. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/datasets/custom.py +0 -0
  230. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
  231. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
  232. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
  233. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/datasets/openqa.py +0 -0
  234. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
  235. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/plugin/registry.py +0 -0
  236. {evalscope-0.8.2/tests/rag → evalscope-0.9.0/evalscope/perf/utils}/__init__.py +0 -0
  237. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/utils/analysis_result.py +0 -0
  238. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/utils/benchmark_util.py +0 -0
  239. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/utils/db_util.py +0 -0
  240. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/utils/handler.py +0 -0
  241. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/perf/utils/local_server.py +0 -0
  242. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/config/cfg_arena.yaml +0 -0
  243. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
  244. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
  245. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/config/cfg_single.yaml +0 -0
  246. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
  247. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
  248. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
  249. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
  250. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/data/question.jsonl +0 -0
  251. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/tasks/arc.yaml +0 -0
  252. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/tasks/bbh.yaml +0 -0
  253. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
  254. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/tasks/ceval.yaml +0 -0
  255. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
  256. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/tasks/cmmlu.yaml +0 -0
  257. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
  258. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/tasks/general_qa.yaml +0 -0
  259. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/tasks/gsm8k.yaml +0 -0
  260. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/tasks/mmlu.yaml +0 -0
  261. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
  262. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/summarizer.py +0 -0
  263. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/README.md +0 -0
  264. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/__init__.py +0 -0
  265. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/default_task.json +0 -0
  266. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
  267. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/eval.py +0 -0
  268. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/infer.py +0 -0
  269. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  270. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  271. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  272. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  273. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  274. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  275. {evalscope-0.8.2/evalscope/models/api → evalscope-0.9.0/evalscope/third_party/longbench_write/tools}/openai_api.py +0 -0
  276. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/longbench_write/utils.py +0 -0
  277. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/toolbench_static/README.md +0 -0
  278. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  279. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/toolbench_static/config_default.json +0 -0
  280. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
  281. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/toolbench_static/eval.py +0 -0
  282. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/toolbench_static/infer.py +0 -0
  283. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  284. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
  285. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  286. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/tools/combine_reports.py +0 -0
  287. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/tools/gen_mmlu_subject_mapping.py +0 -0
  288. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/tools/rewrite_eval_results.py +0 -0
  289. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/utils/arena_utils.py +0 -0
  290. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope/utils/completion_parsers.py +0 -0
  291. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope.egg-info/dependency_links.txt +0 -0
  292. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope.egg-info/entry_points.txt +0 -0
  293. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope.egg-info/not-zip-safe +0 -0
  294. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope.egg-info/requires.txt +0 -0
  295. {evalscope-0.8.2 → evalscope-0.9.0}/evalscope.egg-info/top_level.txt +0 -0
  296. {evalscope-0.8.2 → evalscope-0.9.0}/requirements/docs.txt +0 -0
  297. {evalscope-0.8.2 → evalscope-0.9.0}/requirements/framework.txt +0 -0
  298. {evalscope-0.8.2 → evalscope-0.9.0}/requirements/inner.txt +0 -0
  299. {evalscope-0.8.2 → evalscope-0.9.0}/requirements/opencompass.txt +0 -0
  300. {evalscope-0.8.2 → evalscope-0.9.0}/requirements/perf.txt +0 -0
  301. {evalscope-0.8.2 → evalscope-0.9.0}/requirements/rag.txt +0 -0
  302. {evalscope-0.8.2 → evalscope-0.9.0}/requirements/tests.txt +0 -0
  303. {evalscope-0.8.2 → evalscope-0.9.0}/requirements/vlmeval.txt +0 -0
  304. {evalscope-0.8.2 → evalscope-0.9.0}/requirements.txt +0 -0
  305. {evalscope-0.8.2 → evalscope-0.9.0}/setup.cfg +0 -0
  306. {evalscope-0.8.2 → evalscope-0.9.0}/setup.py +0 -0
  307. {evalscope-0.8.2 → evalscope-0.9.0}/tests/perf/test_perf.py +0 -0
  308. {evalscope-0.8.2 → evalscope-0.9.0}/tests/rag/test_clip_benchmark.py +0 -0
  309. {evalscope-0.8.2 → evalscope-0.9.0}/tests/rag/test_ragas.py +0 -0
  310. {evalscope-0.8.2 → evalscope-0.9.0}/tests/swift/test_run_swift_eval.py +0 -0
  311. {evalscope-0.8.2 → evalscope-0.9.0}/tests/swift/test_run_swift_vlm_eval.py +0 -0
  312. {evalscope-0.8.2 → evalscope-0.9.0}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
  313. {evalscope-0.8.2 → evalscope-0.9.0}/tests/test_run_all.py +0 -0
  314. {evalscope-0.8.2 → evalscope-0.9.0}/tests/vlm/test_vlmeval.py +0 -0
{evalscope-0.8.2/evalscope.egg-info → evalscope-0.9.0}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.8.2
+ Version: 0.9.0
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -160,14 +160,16 @@ Requires-Dist: unicorn; extra == "all"
  > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!

  ## 📋 Contents
- - [Introduction](#introduction)
- - [News](#News)
- - [Installation](#installation)
- - [Quick Start](#quick-start)
+ - [Introduction](#-introduction)
+ - [News](#-news)
+ - [Installation](#️-installation)
+ - [Quick Start](#-quick-start)
  - [Evaluation Backend](#evaluation-backend)
- - [Custom Dataset Evaluation](#custom-dataset-evaluation)
- - [Model Serving Performance Evaluation](#Model-Serving-Performance-Evaluation)
- - [Arena Mode](#arena-mode)
+ - [Custom Dataset Evaluation](#️-custom-dataset-evaluation)
+ - [Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
+ - [Arena Mode](#-arena-mode)
+ - [Contribution](#️-contribution)
+ - [Roadmap](#-roadmap)


  ## 📝 Introduction
@@ -208,11 +210,15 @@ Please scan the QR code below to join our community groups:


  ## 🎉 News
+ - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
  - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
  - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
  - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
  - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
  - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
+
+ <details><summary>More</summary>
+
  - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
  - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
  - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -224,7 +230,7 @@ Please scan the QR code below to join our community groups:
  - 🔥 **[2024.06.13]** EvalScope seamlessly integrates with the fine-tuning framework SWIFT, providing full-chain support from LLM training to evaluation.
  - 🔥 **[2024.06.13]** Integrated the Agent evaluation dataset ToolBench.

-
+ </details>

  ## 🛠️ Installation
  ### Method 1: Install Using pip
@@ -414,7 +420,7 @@ EvalScope supports using third-party evaluation frameworks to initiate evaluatio
  - **ThirdParty**: Third-party evaluation tasks, such as [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) and [LongBench-Write](https://evalscope.readthedocs.io/en/latest/third_party/longwriter.html).


- ## Model Serving Performance Evaluation
+ ## 📈 Model Serving Performance Evaluation
  A stress testing tool focused on large language models, which can be customized to support various dataset formats and different API protocol formats.

  Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
@@ -439,19 +445,32 @@ Speed Benchmark Results:
  +---------------+-----------------+----------------+
  ```

- ## Custom Dataset Evaluation
+ ## 🖊️ Custom Dataset Evaluation
  EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html)


- ## Arena Mode
+ ## 🏟️ Arena Mode
  The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.

  Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)

+ ## 👷‍♂️ Contribution

+ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn), is continuously optimizing its benchmark evaluation features! We invite you to refer to the [Contribution Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html) to easily add your own evaluation benchmarks and share your contributions with the community. Let’s work together to support the growth of EvalScope and make our tools even better! Join us now!

+ <a href="https://github.com/modelscope/evalscope/graphs/contributors" target="_blank">
+ <table>
+ <tr>
+ <th colspan="2">
+ <br><img src="https://contrib.rocks/image?repo=modelscope/evalscope"><br><br>
+ </th>
+ </tr>
+ </table>
+ </a>

- ## TO-DO List
+ ## 🔜 Roadmap
+ - [ ] Support for better evaluation report visualization
+ - [x] Support for mixed evaluations across multiple datasets
  - [x] RAG evaluation
  - [x] VLM evaluation
  - [x] Agents evaluation
@@ -462,8 +481,6 @@ Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/lates
  - [ ] GAIA
  - [ ] GPQA
  - [x] MBPP
- - [ ] Auto-reviewer
- - [ ] Qwen-max


  ## Star History
{evalscope-0.8.2 → evalscope-0.9.0}/README.md

@@ -24,14 +24,16 @@
  > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!

  ## 📋 Contents
- - [Introduction](#introduction)
- - [News](#News)
- - [Installation](#installation)
- - [Quick Start](#quick-start)
+ - [Introduction](#-introduction)
+ - [News](#-news)
+ - [Installation](#️-installation)
+ - [Quick Start](#-quick-start)
  - [Evaluation Backend](#evaluation-backend)
- - [Custom Dataset Evaluation](#custom-dataset-evaluation)
- - [Model Serving Performance Evaluation](#Model-Serving-Performance-Evaluation)
- - [Arena Mode](#arena-mode)
+ - [Custom Dataset Evaluation](#️-custom-dataset-evaluation)
+ - [Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
+ - [Arena Mode](#-arena-mode)
+ - [Contribution](#️-contribution)
+ - [Roadmap](#-roadmap)


  ## 📝 Introduction
@@ -72,11 +74,15 @@ Please scan the QR code below to join our community groups:


  ## 🎉 News
+ - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
  - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
  - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
  - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
  - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
  - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
+
+ <details><summary>More</summary>
+
  - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
  - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
  - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -88,7 +94,7 @@ Please scan the QR code below to join our community groups:
  - 🔥 **[2024.06.13]** EvalScope seamlessly integrates with the fine-tuning framework SWIFT, providing full-chain support from LLM training to evaluation.
  - 🔥 **[2024.06.13]** Integrated the Agent evaluation dataset ToolBench.

-
+ </details>

  ## 🛠️ Installation
  ### Method 1: Install Using pip
@@ -278,7 +284,7 @@ EvalScope supports using third-party evaluation frameworks to initiate evaluatio
  - **ThirdParty**: Third-party evaluation tasks, such as [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) and [LongBench-Write](https://evalscope.readthedocs.io/en/latest/third_party/longwriter.html).


- ## Model Serving Performance Evaluation
+ ## 📈 Model Serving Performance Evaluation
  A stress testing tool focused on large language models, which can be customized to support various dataset formats and different API protocol formats.

  Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
@@ -303,19 +309,32 @@ Speed Benchmark Results:
  +---------------+-----------------+----------------+
  ```

- ## Custom Dataset Evaluation
+ ## 🖊️ Custom Dataset Evaluation
  EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html)


- ## Arena Mode
+ ## 🏟️ Arena Mode
  The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.

  Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)

+ ## 👷‍♂️ Contribution

+ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn), is continuously optimizing its benchmark evaluation features! We invite you to refer to the [Contribution Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html) to easily add your own evaluation benchmarks and share your contributions with the community. Let’s work together to support the growth of EvalScope and make our tools even better! Join us now!

+ <a href="https://github.com/modelscope/evalscope/graphs/contributors" target="_blank">
+ <table>
+ <tr>
+ <th colspan="2">
+ <br><img src="https://contrib.rocks/image?repo=modelscope/evalscope"><br><br>
+ </th>
+ </tr>
+ </table>
+ </a>

- ## TO-DO List
+ ## 🔜 Roadmap
+ - [ ] Support for better evaluation report visualization
+ - [x] Support for mixed evaluations across multiple datasets
  - [x] RAG evaluation
  - [x] VLM evaluation
  - [x] Agents evaluation
@@ -326,8 +345,6 @@ Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/lates
  - [ ] GAIA
  - [ ] GPQA
  - [x] MBPP
- - [ ] Auto-reviewer
- - [ ] Qwen-max


  ## Star History
{evalscope-0.8.2 → evalscope-0.9.0}/evalscope/__init__.py

@@ -1,3 +1,5 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

+ from evalscope.config import TaskConfig
+ from evalscope.run import run_task
  from .version import __release_datetime__, __version__
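
With these two re-exports, an evaluation can be driven from Python as well as from the CLI. A minimal, hedged sketch that only exercises the new top-level imports; the concrete `TaskConfig` fields needed for a real run are not part of this diff, so none are shown:

```python
# Sketch based solely on the re-exports added above; constructing and running a
# real TaskConfig requires model/dataset fields documented in the user guide.
from evalscope import TaskConfig, run_task

print(TaskConfig)  # config class re-exported from evalscope.config
print(run_task)    # entry point re-exported from evalscope.run
```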
{evalscope-0.8.2 → evalscope-0.9.0}/evalscope/arguments.py

@@ -1,6 +1,8 @@
  import argparse
  import json

+ from evalscope.constants import EvalBackend, EvalStage, EvalType
+

  class ParseStrArgsAction(argparse.Action):

@@ -47,10 +49,13 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.') # noqa: E501

      # Evaluation-related arguments
-     parser.add_argument('--eval-type', type=str, help='The type for evaluating.')
-     parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.')
+     parser.add_argument('--eval-type', type=str, help='The type for evaluating.',
+                         choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
+     parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
+                         choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL]) # noqa: E501
      parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.') # noqa: E501
-     parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.')
+     parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
+                         choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
      parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')

      # Cache and working directory arguments
@@ -62,6 +67,8 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.') # noqa: E501
      parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
      parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
+     parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
+     parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
      # yapf: enable
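The new `choices` constraints tie the CLI flags to the `EvalType`, `EvalBackend`, and `EvalStage` constants imported at the top of the file, and `--api-key`/`--api-url` let the evaluator target a remote API model. A hedged sketch of how those pieces line up; only names visible in this diff are used, the concrete string values behind the constants are not shown here, and the endpoint URL is a placeholder:

```python
# Illustrative mapping of the new/updated flags to their constants.
from evalscope.constants import EvalBackend, EvalStage, EvalType

service_eval_args = {
    '--eval-type': EvalType.SERVICE,           # one of CHECKPOINT / CUSTOM / SERVICE
    '--eval-backend': EvalBackend.NATIVE,      # or OPEN_COMPASS / VLM_EVAL_KIT / RAG_EVAL
    '--stage': EvalStage.ALL,                  # or INFER / REVIEW
    '--api-url': 'http://127.0.0.1:8000/v1',   # placeholder endpoint, not from the diff
    '--api-key': 'EMPTY',                      # default value per the diff
}
print(service_eval_args)
```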
 
{evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py

@@ -3,7 +3,6 @@ Code adapated from https://github.com/mlfoundations/open_clip/blob/main/src/trai
  Thanks to the authors of OpenCLIP
  """

- import logging
  import torch
  import torch.nn.functional as F
  from contextlib import suppress
{evalscope-0.8.2 → evalscope-0.9.0}/evalscope/backend/rag_eval/utils/llm.py

@@ -6,7 +6,7 @@ from modelscope.utils.hf_util import GenerationConfig
  from typing import Any, Dict, Iterator, List, Mapping, Optional

  from evalscope.constants import DEFAULT_MODEL_REVISION
- from evalscope.models.model_adapter import ChatGenerationModelAdapter
+ from evalscope.models import ChatGenerationModelAdapter


  class LLM:
evalscope-0.9.0/evalscope/benchmarks/__init__.py (new file)

@@ -0,0 +1,23 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import glob
+ import importlib
+ import os
+
+ from evalscope.benchmarks.benchmark import Benchmark, BenchmarkMeta
+ from evalscope.benchmarks.data_adapter import DataAdapter
+ from evalscope.utils import get_logger
+
+ logger = get_logger()
+
+ # Using glob to find all files matching the pattern
+ pattern = os.path.join(os.path.dirname(__file__), '*', '*_adapter.py')
+ files = glob.glob(pattern, recursive=False)
+
+ for file_path in files:
+     if file_path.endswith('.py') and not os.path.basename(file_path).startswith('_'):
+         # Convert file path to a module path
+         relative_path = os.path.relpath(file_path, os.path.dirname(__file__))
+         module_path = relative_path[:-3].replace(os.path.sep, '.') # strip '.py' and convert to module path
+         full_path = f'evalscope.benchmarks.{module_path}'
+         importlib.import_module(full_path)
+         # print(f'Importing {full_path}')
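
This new `__init__.py` globs every `*_adapter.py` under `evalscope/benchmarks/<name>/` and imports it, so the `@Benchmark.register(...)` decorator used by the refactored adapters (see the ARC and BBH diffs below) runs at import time and each benchmark self-registers. A hedged sketch of an adapter written against this pattern; the decorator arguments mirror the ARC registration below, while the benchmark name, dataset id, and class body are hypothetical:

```python
# Hypothetical adapter following the 0.9.0 registration pattern shown in the
# ARC diff below; only the decorator argument names are taken from this diff.
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import WeightedAverageAccuracy
from evalscope.models import MultiChoiceModelAdapter


@Benchmark.register(
    name='my_mcq_benchmark',              # hypothetical benchmark name
    dataset_id='my-org/my_mcq_dataset',   # hypothetical ModelScope dataset id
    model_adapter=MultiChoiceModelAdapter,
    subset_list=['default'],
    metric_list=[WeightedAverageAccuracy],
    few_shot_num=0,
    train_split='train',
    eval_split='test',
    prompt_template='',
)
class MyMCQAdapter(DataAdapter):
    """Placed as evalscope/benchmarks/my_mcq_benchmark/my_mcq_benchmark_adapter.py
    so the glob above picks it up; real adapters also implement data loading,
    prompt generation, and answer parsing."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
```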
{evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/arc/arc_adapter.py

@@ -3,40 +3,35 @@
  import json
  import os

- from evalscope.benchmarks.data_adapter import DataAdapter
- from evalscope.metrics.metrics import exact_match, weighted_mean
- from evalscope.utils import ResponseParser, normalize_score
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import EvalType
+ from evalscope.metrics import WeightedAverageAccuracy, exact_match
+ from evalscope.models import MultiChoiceModelAdapter
+ from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger

  # flake8: noqa

  logger = get_logger()

- DATASET_ID = 'modelscope/ai2_arc'
-
- # task_list = ['ARC-Easy', 'ARC-Challenge']
- SUBSET_LIST = ['ARC-Challenge']
-

+ @Benchmark.register(
+     name='arc',
+     dataset_id='modelscope/ai2_arc',
+     model_adapter=MultiChoiceModelAdapter,
+     subset_list=['ARC-Easy', 'ARC-Challenge'],
+     metric_list=[WeightedAverageAccuracy],
+     few_shot_num=0,
+     train_split='train',
+     eval_split='test',
+     prompt_template='',
+ )
  class ARCAdapter(DataAdapter):

      choices = ['A', 'B', 'C', 'D']

-     def __init__(self,
-                  subset_list: list = None,
-                  metric_list: list = None,
-                  few_shot_num: int = None,
-                  train_split: str = 'train',
-                  eval_split: str = 'test',
-                  prompt_template: str = '',
-                  **kwargs):
-
-         if subset_list is None:
-             subset_list = SUBSET_LIST
-
-         if metric_list is None:
-             metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
+     def __init__(self, **kwargs):
+         few_shot_num = kwargs.get('few_shot_num', None)
          if few_shot_num is None:
              # Use 0-shot by default
              logger.info(f'Set 0-shot examples by system for ARC.')
@@ -45,14 +40,7 @@ class ARCAdapter(DataAdapter):
          if few_shot_num != 0:
              logger.warning(f'few_shot_num is recommended to set 0 for ARC, got {few_shot_num}.')

-         super().__init__(
-             subset_list=subset_list,
-             metric_list=metric_list,
-             few_shot_num=few_shot_num,
-             train_split=train_split,
-             eval_split=eval_split,
-             prompt_template=prompt_template,
-             **kwargs)
+         super().__init__(**kwargs)

      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
          """
@@ -132,7 +120,7 @@ class ARCAdapter(DataAdapter):
          # Get the gold choice
          return input_d.get('answerKey', '')

-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
          """
          Parse the model output to get the answer. Could be the best choice index.

@@ -144,12 +132,12 @@
          Returns:
              The parsed answer. Depending on the dataset. Usually a string for chat.
          """
-         if eval_type == 'checkpoint':
+         if eval_type == EvalType.CHECKPOINT:
              return result
-         elif eval_type == 'service':
+         elif eval_type == EvalType.SERVICE:
              return ResponseParser.parse_first_option_with_choices(
                  text=result, options=self.choices) # TODO: to be checked !
-         elif eval_type == 'custom':
+         elif eval_type == EvalType.CUSTOM:
              return ResponseParser.parse_first_option_with_choices(
                  text=result, options=self.choices) # TODO: to be checked !
          else:
@@ -158,70 +146,6 @@
      def match(self, gold: str, pred: str) -> float:
          return exact_match(gold=gold, pred=pred)

-     def compute_metric(self, review_res_list: list) -> float:
-         """
-         Compute evaluation result by specific metric.
-
-         Args:
-             review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-         Returns:
-             The metric score.
-         """
-         items = [(score, 1.0) for score in review_res_list]
-         return weighted_mean(items)
-
-     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-         """
-         Generate the report for the model output.
-
-         Args:
-             subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-             report_name: The user-defined report name.
-
-         Returns: A dict of metric calculation results. The format is like:
-         {
-             "name":"ARC",
-             "metric":"WeightedAverageAccuracy",
-             "score":0.3389,
-             "category":[
-                 {
-                     "name":"DEFAULT",
-                     "score":0.4128,
-                     "subset":[
-                         {
-                             "name":"ARC-Easy",
-                             "score":0.5632
-                         },
-                         {
-                             "name":"ARC-Challenge",
-                             "score":0.3157
-                         }
-                     ]
-                 }
-             ],
-             "total_num":7800
-         }
-         """
-         total_num: int = sum([num for _, num in subset_score_map.values()])
-         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-         cate_avg_list = [{
-             'name': subset_name,
-             'score': normalize_score(score=score)
-         } for subset_name, (score, _) in subset_score_map.items()]
-
-         category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-         res_map = dict(
-             name=report_name or 'arc',
-             metric=self.metric_list[0]['name'],
-             score=weighted_avg_acc,
-             category=[category_d],
-             total_num=total_num)
-
-         return res_map
-
      @classmethod
      def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:

{evalscope-0.8.2 → evalscope-0.9.0}/evalscope/benchmarks/bbh/bbh_adapter.py

@@ -5,18 +5,17 @@ import os
  import random
  import re

- from evalscope.benchmarks.data_adapter import DataAdapter
+ from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import AnswerKeys
- from evalscope.metrics.metrics import exact_match, weighted_mean
- from evalscope.utils import ResponseParser, normalize_score
+ from evalscope.metrics import WeightedAverageAccuracy, exact_match
+ from evalscope.models.chat_adapter import ChatGenerationModelAdapter
+ from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger

  # flake8: noqa

  logger = get_logger()

- DATASET_ID = 'modelscope/bbh'
-
  # BBH multiple choice subset list
  MULTIPLE_CHOICE = 'multiple_choice'
  MULTIPLE_CHOICE_LIST = [
@@ -59,41 +58,32 @@ TASK_TYPE = 'task_type'
  SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST


+ @Benchmark.register(
+     name='bbh',
+     dataset_id='modelscope/bbh',
+     model_adapter=ChatGenerationModelAdapter,
+     subset_list=SUBSET_LIST,
+     metric_list=[WeightedAverageAccuracy],
+     few_shot_num=3,
+     train_split=None,
+     eval_split='test',
+     prompt_template='',
+ )
  class BBHAdapter(DataAdapter):
      """
      Adapter for BBH free-form and multiple-choices sub-tasks.
      """

-     def __init__(self,
-                  subset_list: list = None,
-                  metric_list: list = None,
-                  few_shot_num: int = None,
-                  train_split: str = None,
-                  eval_split: str = 'test',
-                  **kwargs):
-
-         if subset_list is None:
-             subset_list = SUBSET_LIST
+     def __init__(self, **kwargs):

-         if metric_list is None:
-             metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-         if few_shot_num is None:
-             logger.info(f'Set 3-shot examples by system for BBH.')
-             few_shot_num = 3
+         few_shot_num = kwargs.get('few_shot_num', 3)

          if few_shot_num != 3 and few_shot_num != 0:
              logger.error(f'BBH uses 3-shot examples with CoT or 0-shot by system, but got {few_shot_num}. '
                           f'Use 3-shot by default.')
-             few_shot_num = 3
+             kwargs['few_shot_num'] = 3

-         super().__init__(
-             subset_list=subset_list,
-             metric_list=metric_list,
-             few_shot_num=few_shot_num,
-             train_split=train_split,
-             eval_split=eval_split,
-             **kwargs)
+         super().__init__(**kwargs)

      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
          data_dict = {}
@@ -217,66 +207,6 @@ class BBHAdapter(DataAdapter):
      def match(self, gold: str, pred: str) -> float:
          return exact_match(gold=gold, pred=pred)

-     def compute_metric(self, review_res_list: list) -> float:
-         """
-         Compute evaluation result by specific metric.
-
-         Args:
-             review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-         Returns:
-             The metric score.
-         """
-         items = [(score, 1.0) for score in review_res_list]
-         return weighted_mean(items)
-
-     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-         """
-         Generate the report for the model output.
-
-         Args:
-             subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-             report_name: The user-defined report name.
-
-         Returns: A dict of metric calculation results. The format is like:
-         {
-             "name":"BBH",
-             "metric":"WeightedAverageAccuracy",
-             "score":0.3389,
-             "category":[
-                 {
-                     "name":"DEFAULT",
-                     "score":0.3389,
-                     "subset":[
-                         {
-                             "name":"BBH",
-                             "score":0.3389
-                         },
-                     ]
-                 }
-             ],
-             "total_num":100
-         }
-         """
-         total_num: int = sum([num for _, num in subset_score_map.values()])
-         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-         cate_avg_list = [{
-             'name': subset_name,
-             'score': normalize_score(score=score)
-         } for subset_name, (score, _) in subset_score_map.items()]
-
-         category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-         res_map = dict(
-             name=report_name or 'bbh',
-             metric=self.metric_list[0]['name'],
-             score=weighted_avg_acc,
-             category=[category_d],
-             total_num=total_num)
-
-         return res_map
-
      @classmethod
      def _extract_mc_answer(cls, ans: str) -> str:
          """