evalscope 0.5.5__tar.gz → 0.5.5rc1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (204)
  1. {evalscope-0.5.5 → evalscope-0.5.5rc1}/PKG-INFO +30 -24
  2. {evalscope-0.5.5 → evalscope-0.5.5rc1}/README.md +25 -14
  3. evalscope-0.5.5rc1/evalscope/backend/opencompass/__init__.py +3 -0
  4. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/backend/vlm_eval_kit/backend_manager.py +1 -0
  5. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/backend/vlm_eval_kit/custom_dataset.py +1 -1
  6. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/evaluator/evaluator.py +0 -1
  7. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/run.py +0 -4
  8. evalscope-0.5.5rc1/evalscope/utils/logger.py +64 -0
  9. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/utils/task_utils.py +0 -3
  10. evalscope-0.5.5rc1/evalscope/version.py +4 -0
  11. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope.egg-info/PKG-INFO +30 -24
  12. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope.egg-info/SOURCES.txt +0 -16
  13. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope.egg-info/requires.txt +4 -10
  14. evalscope-0.5.5/evalscope/backend/rag_eval/__init__.py +0 -3
  15. evalscope-0.5.5/evalscope/backend/rag_eval/backend_manager.py +0 -68
  16. evalscope-0.5.5/evalscope/backend/rag_eval/cmteb/__init__.py +0 -4
  17. evalscope-0.5.5/evalscope/backend/rag_eval/cmteb/arguments.py +0 -59
  18. evalscope-0.5.5/evalscope/backend/rag_eval/cmteb/base.py +0 -89
  19. evalscope-0.5.5/evalscope/backend/rag_eval/cmteb/task_template.py +0 -83
  20. evalscope-0.5.5/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -302
  21. evalscope-0.5.5/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -252
  22. evalscope-0.5.5/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -113
  23. evalscope-0.5.5/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -153
  24. evalscope-0.5.5/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -345
  25. evalscope-0.5.5/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -302
  26. evalscope-0.5.5/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -64
  27. evalscope-0.5.5/evalscope/backend/rag_eval/ragas/__init__.py +0 -2
  28. evalscope-0.5.5/evalscope/backend/rag_eval/ragas/arguments.py +0 -37
  29. evalscope-0.5.5/evalscope/backend/rag_eval/ragas/task_template.py +0 -117
  30. evalscope-0.5.5/evalscope/preprocess/tokenizers/__init__.py +0 -0
  31. evalscope-0.5.5/evalscope/utils/logger.py +0 -94
  32. evalscope-0.5.5/evalscope/version.py +0 -4
  33. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/__init__.py +0 -0
  34. {evalscope-0.5.5/evalscope/backend/opencompass → evalscope-0.5.5rc1/evalscope/backend}/__init__.py +0 -0
  35. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/backend/base.py +0 -0
  36. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  37. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/backend/opencompass/backend_manager.py +0 -0
  38. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  39. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  40. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
  41. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  42. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/__init__.py +0 -0
  43. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/arc/__init__.py +0 -0
  44. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  45. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
  46. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/__init__.py +0 -0
  47. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
  48. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  49. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  50. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  51. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  52. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  53. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  54. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  55. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  56. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  57. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  58. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  59. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  60. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  61. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  62. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  63. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  64. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  65. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  66. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  67. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  68. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  69. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  70. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  71. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  72. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  73. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  74. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  75. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/benchmark.py +0 -0
  76. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/ceval/__init__.py +0 -0
  77. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
  78. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  79. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  80. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  81. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
  82. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  83. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  84. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
  85. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/data_adapter.py +0 -0
  86. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  87. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/general_qa/general_qa_adapter.py +0 -0
  88. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  89. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  90. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
  91. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  92. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  93. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
  94. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  95. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  96. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
  97. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  98. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  99. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
  100. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/race/__init__.py +0 -0
  101. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/race/race.py +0 -0
  102. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/race/race_adapter.py +0 -0
  103. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  104. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  105. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
  106. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  107. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  108. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
  109. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/cache.py +0 -0
  110. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/cli/__init__.py +0 -0
  111. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/cli/base.py +0 -0
  112. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/cli/cli.py +0 -0
  113. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/cli/start_perf.py +0 -0
  114. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/cli/start_server.py +0 -0
  115. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/config.py +0 -0
  116. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/constants.py +0 -0
  117. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/evaluator/__init__.py +0 -0
  118. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/evaluator/rating_eval.py +0 -0
  119. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/evaluator/reviewer/__init__.py +0 -0
  120. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
  121. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/metrics/__init__.py +0 -0
  122. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  123. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
  124. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/metrics/code_metric.py +0 -0
  125. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/metrics/math_accuracy.py +0 -0
  126. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/metrics/metrics.py +0 -0
  127. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/metrics/rouge_metric.py +0 -0
  128. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/models/__init__.py +0 -0
  129. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/models/api/__init__.py +0 -0
  130. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/models/api/openai_api.py +0 -0
  131. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/models/custom/__init__.py +0 -0
  132. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/models/custom/custom_model.py +0 -0
  133. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/models/dummy_chat_model.py +0 -0
  134. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/models/model.py +0 -0
  135. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/models/model_adapter.py +0 -0
  136. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/models/openai_model.py +0 -0
  137. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/models/template.py +0 -0
  138. {evalscope-0.5.5/evalscope/backend → evalscope-0.5.5rc1/evalscope/perf}/__init__.py +0 -0
  139. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/_logging.py +0 -0
  140. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/api_plugin_base.py +0 -0
  141. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/custom_api.py +0 -0
  142. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/dashscope_api.py +0 -0
  143. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/dataset_plugin_base.py +0 -0
  144. {evalscope-0.5.5/evalscope/perf → evalscope-0.5.5rc1/evalscope/perf/datasets}/__init__.py +0 -0
  145. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/datasets/line_by_line.py +0 -0
  146. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/datasets/longalpaca_12k.py +0 -0
  147. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/datasets/openqa.py +0 -0
  148. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/how_to_analysis_result.py +0 -0
  149. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/http_client.py +0 -0
  150. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/openai_api.py +0 -0
  151. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/plugin_registry.py +0 -0
  152. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/query_parameters.py +0 -0
  153. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/perf/server_sent_event.py +0 -0
  154. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/preprocess/__init__.py +0 -0
  155. {evalscope-0.5.5/evalscope/perf/datasets → evalscope-0.5.5rc1/evalscope/preprocess/tokenizers}/__init__.py +0 -0
  156. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -0
  157. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/__init__.py +0 -0
  158. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/tasks/arc.yaml +0 -0
  159. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/tasks/bbh.yaml +0 -0
  160. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
  161. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/tasks/ceval.yaml +0 -0
  162. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
  163. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/tasks/cmmlu.yaml +0 -0
  164. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
  165. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/tasks/general_qa.yaml +0 -0
  166. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/tasks/gsm8k.yaml +0 -0
  167. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/tasks/mmlu.yaml +0 -0
  168. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
  169. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/run_arena.py +0 -0
  170. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/run_ms.py +0 -0
  171. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/summarizer.py +0 -0
  172. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/__init__.py +0 -0
  173. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/__init__.py +0 -0
  174. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/eval.py +0 -0
  175. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/infer.py +0 -0
  176. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  177. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
  178. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  179. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  180. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  181. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  182. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
  183. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  184. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/longbench_write/utils.py +0 -0
  185. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  186. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/toolbench_static/eval.py +0 -0
  187. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/toolbench_static/infer.py +0 -0
  188. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
  189. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  190. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  191. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/tools/__init__.py +0 -0
  192. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/tools/combine_reports.py +0 -0
  193. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/tools/gen_mmlu_subject_mapping.py +0 -0
  194. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/tools/rewrite_eval_results.py +0 -0
  195. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/utils/__init__.py +0 -0
  196. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/utils/arena_utils.py +0 -0
  197. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/utils/completion_parsers.py +0 -0
  198. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/utils/task_cfg_parser.py +0 -0
  199. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/utils/utils.py +0 -0
  200. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope.egg-info/dependency_links.txt +0 -0
  201. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope.egg-info/entry_points.txt +0 -0
  202. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope.egg-info/not-zip-safe +0 -0
  203. {evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope.egg-info/top_level.txt +0 -0
  204. {evalscope-0.5.5 → evalscope-0.5.5rc1}/setup.cfg +0 -0

{evalscope-0.5.5 → evalscope-0.5.5rc1}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.5.5
+ Version: 0.5.5rc1
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -24,7 +24,7 @@ Requires-Dist: editdistance
  Requires-Dist: jsonlines
  Requires-Dist: matplotlib
  Requires-Dist: modelscope[framework]
- Requires-Dist: nltk>=3.9
+ Requires-Dist: nltk
  Requires-Dist: openai
  Requires-Dist: pandas
  Requires-Dist: plotly
@@ -34,7 +34,7 @@ Requires-Dist: pyyaml
  Requires-Dist: regex
  Requires-Dist: requests
  Requires-Dist: requests-toolbelt
- Requires-Dist: rouge-score>=0.1.0
+ Requires-Dist: rouge-score
  Requires-Dist: sacrebleu
  Requires-Dist: scikit-learn
  Requires-Dist: seaborn
@@ -51,9 +51,6 @@ Provides-Extra: opencompass
  Requires-Dist: ms-opencompass>=0.1.1; extra == "opencompass"
  Provides-Extra: vlmeval
  Requires-Dist: ms-vlmeval>=0.0.5; extra == "vlmeval"
- Provides-Extra: rag
- Requires-Dist: ragas; extra == "rag"
- Requires-Dist: mteb>=0.14.16; extra == "rag"
  Provides-Extra: inner
  Requires-Dist: absl-py; extra == "inner"
  Requires-Dist: accelerate; extra == "inner"
@@ -91,7 +88,7 @@ Requires-Dist: editdistance; extra == "all"
  Requires-Dist: jsonlines; extra == "all"
  Requires-Dist: matplotlib; extra == "all"
  Requires-Dist: modelscope[framework]; extra == "all"
- Requires-Dist: nltk>=3.9; extra == "all"
+ Requires-Dist: nltk; extra == "all"
  Requires-Dist: openai; extra == "all"
  Requires-Dist: pandas; extra == "all"
  Requires-Dist: plotly; extra == "all"
@@ -101,7 +98,7 @@ Requires-Dist: pyyaml; extra == "all"
  Requires-Dist: regex; extra == "all"
  Requires-Dist: requests; extra == "all"
  Requires-Dist: requests-toolbelt; extra == "all"
- Requires-Dist: rouge-score>=0.1.0; extra == "all"
+ Requires-Dist: rouge-score; extra == "all"
  Requires-Dist: sacrebleu; extra == "all"
  Requires-Dist: scikit-learn; extra == "all"
  Requires-Dist: seaborn; extra == "all"
@@ -116,8 +113,6 @@ Requires-Dist: jieba; extra == "all"
  Requires-Dist: rouge-chinese; extra == "all"
  Requires-Dist: ms-opencompass>=0.1.1; extra == "all"
  Requires-Dist: ms-vlmeval>=0.0.5; extra == "all"
- Requires-Dist: ragas; extra == "all"
- Requires-Dist: mteb>=0.14.16; extra == "all"
 
  English | [简体中文](README_zh.md)
 
@@ -150,11 +145,30 @@ English | [简体中文](README_zh.md)
 
  ## 📝 Introduction
 
- EvalScope is the official model evaluation and performance benchmarking framework launched by the [ModelScope](https://modelscope.cn/) community. It comes with built-in common benchmarks and evaluation metrics, such as MMLU, CMMLU, C-Eval, GSM8K, ARC, HellaSwag, TruthfulQA, MATH, and HumanEval. EvalScope supports various types of model evaluations, including LLMs, multimodal LLMs, embedding models, and reranker models. It is also applicable to multiple evaluation scenarios, such as end-to-end RAG evaluation, arena mode, and model inference performance stress testing. Moreover, with the seamless integration of the ms-swift training framework, evaluations can be initiated with a single click, providing full end-to-end support from model training to evaluation 🚀
+ Large Model (including Large Language Models, Multi-modal Large Language Models) evaluation has become a critical process for assessing and improving LLMs. To better support the evaluation of large models, we propose the EvalScope framework.
+
+ ### Framework Features
+ - **Benchmark Datasets**: Preloaded with several commonly used test benchmarks, including MMLU, CMMLU, C-Eval, GSM8K, ARC, HellaSwag, TruthfulQA, MATH, HumanEval, etc.
+ - **Evaluation Metrics**: Implements various commonly used evaluation metrics.
+ - **Model Access**: A unified model access mechanism that is compatible with the Generate and Chat interfaces of multiple model families.
+ - **Automated Evaluation**: Includes automatic evaluation of objective questions and complex task evaluation using expert models.
+ - **Evaluation Reports**: Automatically generates evaluation reports.
+ - **Arena Mode**: Used for comparisons between models and objective evaluation of models, supporting various evaluation modes, including:
+   - **Single mode**: Scoring a single model.
+   - **Pairwise-baseline mode**: Comparing against a baseline model.
+   - **Pairwise (all) mode**: Pairwise comparison among all models.
+ - **Visualization Tools**: Provides intuitive displays of evaluation results.
+ - **Model Performance Evaluation**: Offers a performance testing tool for model inference services and detailed statistics, see [Model Performance Evaluation Documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test.html).
+ - **OpenCompass Integration**: Supports OpenCompass as the evaluation backend, providing advanced encapsulation and task simplification, allowing for easier task submission for evaluation.
+ - **VLMEvalKit Integration**: Supports VLMEvalKit as the evaluation backend, facilitating the initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
+ - **Full-Link Support**: Through seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, provides a one-stop development process for model training, model deployment, model evaluation, and report viewing, enhancing user development efficiency.
+
+
+ <details><summary>Overall Architecture</summary>
 
  <p align="center">
    <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
-   <br>EvalScope Framework.
+   <br>Fig 1. EvalScope Framework.
  </p>
 
  The architecture includes the following modules:
@@ -164,15 +178,14 @@ The architecture includes the following modules:
    - **Native**: EvalScope’s own **default evaluation framework**, supporting various evaluation modes, including single model evaluation, arena mode, baseline model comparison mode, etc.
    - **OpenCompass**: Supports [OpenCompass](https://github.com/open-compass/opencompass) as the evaluation backend, providing advanced encapsulation and task simplification, allowing you to submit tasks for evaluation more easily.
    - **VLMEvalKit**: Supports [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) as the evaluation backend, enabling easy initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
-   - **RAGEval**: Supports RAG evaluation, supporting independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
    - **ThirdParty**: Other third-party evaluation tasks, such as ToolBench.
  4. **Performance Evaluator**: Model performance evaluation, responsible for measuring model inference service performance, including performance testing, stress testing, performance report generation, and visualization.
  5. **Evaluation Report**: The final generated evaluation report summarizes the model's performance, which can be used for decision-making and further model optimization.
  6. **Visualization**: Visualization results help users intuitively understand evaluation results, facilitating analysis and comparison of different model performances.
+ </details>
 
 
  ## 🎉 News
- - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
  - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
  - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
  - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -343,10 +356,9 @@ run_task(task_cfg=your_task_cfg)
  ## Evaluation Backend
  EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
  - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
- - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
- - [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/vlmevalkit_backend.html)
- - **RAGEval**: Initiate RAG evaluation tasks through EvalScope, supporting independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html): [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/index.html)
- - **ThirdParty**: Third-party evaluation tasks, such as [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) and [LongBench-Write](https://evalscope.readthedocs.io/en/latest/third_party/longwriter.html).
+ - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/opencompass_backend.html)
+ - [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/vlmevalkit_backend.html)
+ - **ThirdParty**: The third-party task, e.g. [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html), you can contribute your own evaluation task to EvalScope as third-party backend.
 
  ## Custom Dataset Evaluation
  EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset.html)
@@ -375,8 +387,6 @@ Refer to : [Leaderboard](https://modelscope.cn/leaderboard/58/ranking?type=free)
 
 
  ## TO-DO List
- - [x] RAG evaluation
- - [x] VLM evaluation
  - [x] Agents evaluation
  - [x] vLLM
  - [ ] Distributed evaluating
@@ -388,7 +398,3 @@ Refer to : [Leaderboard](https://modelscope.cn/leaderboard/58/ranking?type=free)
  - [ ] Auto-reviewer
  - [ ] Qwen-max
 
-
- ## Star History
-
- [![Star History Chart](https://api.star-history.com/svg?repos=modelscope/evalscope&type=Date)](https://star-history.com/#modelscope/evalscope&Date)

{evalscope-0.5.5 → evalscope-0.5.5rc1}/README.md

@@ -29,11 +29,30 @@ English | [简体中文](README_zh.md)
 
  ## 📝 Introduction
 
- EvalScope is the official model evaluation and performance benchmarking framework launched by the [ModelScope](https://modelscope.cn/) community. It comes with built-in common benchmarks and evaluation metrics, such as MMLU, CMMLU, C-Eval, GSM8K, ARC, HellaSwag, TruthfulQA, MATH, and HumanEval. EvalScope supports various types of model evaluations, including LLMs, multimodal LLMs, embedding models, and reranker models. It is also applicable to multiple evaluation scenarios, such as end-to-end RAG evaluation, arena mode, and model inference performance stress testing. Moreover, with the seamless integration of the ms-swift training framework, evaluations can be initiated with a single click, providing full end-to-end support from model training to evaluation 🚀
+ Large Model (including Large Language Models, Multi-modal Large Language Models) evaluation has become a critical process for assessing and improving LLMs. To better support the evaluation of large models, we propose the EvalScope framework.
+
+ ### Framework Features
+ - **Benchmark Datasets**: Preloaded with several commonly used test benchmarks, including MMLU, CMMLU, C-Eval, GSM8K, ARC, HellaSwag, TruthfulQA, MATH, HumanEval, etc.
+ - **Evaluation Metrics**: Implements various commonly used evaluation metrics.
+ - **Model Access**: A unified model access mechanism that is compatible with the Generate and Chat interfaces of multiple model families.
+ - **Automated Evaluation**: Includes automatic evaluation of objective questions and complex task evaluation using expert models.
+ - **Evaluation Reports**: Automatically generates evaluation reports.
+ - **Arena Mode**: Used for comparisons between models and objective evaluation of models, supporting various evaluation modes, including:
+   - **Single mode**: Scoring a single model.
+   - **Pairwise-baseline mode**: Comparing against a baseline model.
+   - **Pairwise (all) mode**: Pairwise comparison among all models.
+ - **Visualization Tools**: Provides intuitive displays of evaluation results.
+ - **Model Performance Evaluation**: Offers a performance testing tool for model inference services and detailed statistics, see [Model Performance Evaluation Documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test.html).
+ - **OpenCompass Integration**: Supports OpenCompass as the evaluation backend, providing advanced encapsulation and task simplification, allowing for easier task submission for evaluation.
+ - **VLMEvalKit Integration**: Supports VLMEvalKit as the evaluation backend, facilitating the initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
+ - **Full-Link Support**: Through seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, provides a one-stop development process for model training, model deployment, model evaluation, and report viewing, enhancing user development efficiency.
+
+
+ <details><summary>Overall Architecture</summary>
 
  <p align="center">
    <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
-   <br>EvalScope Framework.
+   <br>Fig 1. EvalScope Framework.
  </p>
 
  The architecture includes the following modules:
@@ -43,15 +62,14 @@ The architecture includes the following modules:
    - **Native**: EvalScope’s own **default evaluation framework**, supporting various evaluation modes, including single model evaluation, arena mode, baseline model comparison mode, etc.
    - **OpenCompass**: Supports [OpenCompass](https://github.com/open-compass/opencompass) as the evaluation backend, providing advanced encapsulation and task simplification, allowing you to submit tasks for evaluation more easily.
    - **VLMEvalKit**: Supports [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) as the evaluation backend, enabling easy initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
-   - **RAGEval**: Supports RAG evaluation, supporting independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
    - **ThirdParty**: Other third-party evaluation tasks, such as ToolBench.
  4. **Performance Evaluator**: Model performance evaluation, responsible for measuring model inference service performance, including performance testing, stress testing, performance report generation, and visualization.
  5. **Evaluation Report**: The final generated evaluation report summarizes the model's performance, which can be used for decision-making and further model optimization.
  6. **Visualization**: Visualization results help users intuitively understand evaluation results, facilitating analysis and comparison of different model performances.
+ </details>
 
 
  ## 🎉 News
- - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
  - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
  - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
  - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -222,10 +240,9 @@ run_task(task_cfg=your_task_cfg)
  ## Evaluation Backend
  EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
  - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
- - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
- - [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/vlmevalkit_backend.html)
- - **RAGEval**: Initiate RAG evaluation tasks through EvalScope, supporting independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html): [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/index.html)
- - **ThirdParty**: Third-party evaluation tasks, such as [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) and [LongBench-Write](https://evalscope.readthedocs.io/en/latest/third_party/longwriter.html).
+ - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/opencompass_backend.html)
+ - [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/vlmevalkit_backend.html)
+ - **ThirdParty**: The third-party task, e.g. [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html), you can contribute your own evaluation task to EvalScope as third-party backend.
 
  ## Custom Dataset Evaluation
  EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset.html)
@@ -254,8 +271,6 @@ Refer to : [Leaderboard](https://modelscope.cn/leaderboard/58/ranking?type=free)
 
 
  ## TO-DO List
- - [x] RAG evaluation
- - [x] VLM evaluation
  - [x] Agents evaluation
  - [x] vLLM
  - [ ] Distributed evaluating
@@ -267,7 +282,3 @@ Refer to : [Leaderboard](https://modelscope.cn/leaderboard/58/ranking?type=free)
  - [ ] Auto-reviewer
  - [ ] Qwen-max
 
-
- ## Star History
-
- [![Star History Chart](https://api.star-history.com/svg?repos=modelscope/evalscope&type=Date)](https://star-history.com/#modelscope/evalscope&Date)
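
The "Evaluation Backend" section above describes launching OpenCompass or VLMEvalKit evaluations through `run_task`. Below is a minimal, hypothetical sketch of what such a call could look like; the `task_cfg` keys and values shown are assumptions for illustration, not copied from this diff or the evalscope documentation.

```python
# Hypothetical sketch: launching an OpenCompass-backed evaluation via run_task,
# per the "Evaluation Backend" section quoted above. The 'eval_backend' and
# 'eval_config' keys and their contents are assumptions for illustration.
from evalscope.run import run_task

your_task_cfg = {
    'eval_backend': 'OpenCompass',             # or 'VLMEvalKit' / 'Native' (assumed values)
    'eval_config': {                           # passed through to the backend manager
        'datasets': ['gsm8k'],                 # hypothetical dataset selection
        'models': [{'path': 'qwen-7b-chat'}],  # hypothetical model entry
    },
}

run_task(task_cfg=your_task_cfg)
```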

evalscope-0.5.5rc1/evalscope/backend/opencompass/__init__.py

@@ -0,0 +1,3 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.backend.opencompass.backend_manager import OpenCompassBackendManager

{evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/backend/vlm_eval_kit/backend_manager.py

@@ -4,6 +4,7 @@ from evalscope.backend.base import BackendManager
  from evalscope.utils.logger import get_logger
  from functools import partial
  import subprocess
+ from dataclasses import dataclass
  import copy
 
  logger = get_logger()

{evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/backend/vlm_eval_kit/custom_dataset.py

@@ -8,7 +8,7 @@ class CustomDataset:
 
      def load_data(self, dataset):
          # customize the loading of the dataset
-         data_path = os.path.join(os.path.expanduser("~/LMUData"), f'{dataset}.tsv')
+         data_path = os.path.join("~/LMUData", f'{dataset}.tsv')
          return load(data_path)
 
 
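
For context, the one-line change above drops `os.path.expanduser`, so the constructed path keeps a literal `~` rather than being expanded to the user's home directory. A small illustrative snippet of the difference (the dataset name is hypothetical, and whether the downstream `load()` call expands `~` itself is not shown in this diff):

```python
# Illustration of the behavioral difference in the load_data change above.
import os

dataset = 'my_dataset'  # hypothetical dataset name

old_path = os.path.join(os.path.expanduser('~/LMUData'), f'{dataset}.tsv')
# e.g. '/home/user/LMUData/my_dataset.tsv' -- absolute path, '~' expanded

new_path = os.path.join('~/LMUData', f'{dataset}.tsv')
# '~/LMUData/my_dataset.tsv' -- literal '~', left for the caller to expand
```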

{evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/evaluator/evaluator.py

@@ -174,7 +174,6 @@ class Evaluator(object):
          """
          assert self.data_adapter is not None, 'data_adapter must be provided when calling func get_answers() !'
          assert self.model_adapter is not None, 'model must be provided when calling func get_answers() !'
-         assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'
 
          answers_list = []
          pred_dir: str = self.outputs_structure.get(OutputsStructure.PREDICTIONS_DIR)

{evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/run.py

@@ -207,10 +207,6 @@ def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig]]) -> Union[
          from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
          vlm_eval_kit_backend_manager = VLMEvalKitBackendManager(config=eval_config)
          vlm_eval_kit_backend_manager.run()
-     elif eval_backend == EvalBackend.RAG_EVAL.value:
-         from evalscope.backend.rag_eval import RAGEvalBackendManager
-         rag_eval_backend_manager = RAGEvalBackendManager(config=eval_config)
-         rag_eval_backend_manager.run()
      # TODO: Add other evaluation backends
      elif eval_backend == EvalBackend.THIRD_PARTY.value:
          raise NotImplementedError(f'Not implemented for evaluation backend {eval_backend}')
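
After this removal, the backend dispatch in `run_task` no longer has a RAGEval branch. A condensed, hypothetical sketch of the remaining control flow (only the branches visible in this hunk; not copied verbatim from `evalscope/run.py`, and the wrapper function name is invented for illustration):

```python
# Condensed sketch of the eval_backend dispatch after the RAGEval branch is
# removed; dispatch_backend is a hypothetical wrapper, the real logic lives
# inside run_task in evalscope/run.py.
from evalscope.utils.task_utils import EvalBackend


def dispatch_backend(eval_backend: str, eval_config: dict) -> None:
    if eval_backend == EvalBackend.VLM_EVAL_KIT.value:
        from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
        VLMEvalKitBackendManager(config=eval_config).run()
    # TODO: Add other evaluation backends
    elif eval_backend == EvalBackend.THIRD_PARTY.value:
        raise NotImplementedError(f'Not implemented for evaluation backend {eval_backend}')
```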

evalscope-0.5.5rc1/evalscope/utils/logger.py

@@ -0,0 +1,64 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ import logging
+ from typing import Optional
+
+ init_loggers = {}
+
+ formatter = logging.Formatter(
+     '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+
+ def get_logger(log_file: Optional[str] = None,
+                log_level: int = logging.INFO,
+                file_mode: str = 'w'):
+     """ Get logging logger
+
+     Args:
+         log_file: Log filename, if specified, file handler will be added to
+             logger
+         log_level: Logging level.
+         file_mode: Specifies the mode to open the file, if filename is
+             specified (if filemode is unspecified, it defaults to 'w').
+     """
+
+     logger_name = __name__.split('.')[0]
+     logger = logging.getLogger(logger_name)
+
+     if logger_name in init_loggers:
+         add_file_handler_if_needed(logger, log_file, file_mode, log_level)
+         return logger
+
+     for handler in logger.root.handlers:
+         if type(handler) is logging.StreamHandler:
+             handler.setLevel(logging.ERROR)
+
+     stream_handler = logging.StreamHandler()
+     handlers = [stream_handler]
+
+     if log_file is not None:
+         file_handler = logging.FileHandler(log_file, file_mode)
+         handlers.append(file_handler)
+
+     for handler in handlers:
+         handler.setFormatter(formatter)
+         handler.setLevel(log_level)
+         logger.addHandler(handler)
+
+     logger.setLevel(log_level)
+
+     init_loggers[logger_name] = True
+
+     return logger
+
+
+ def add_file_handler_if_needed(logger, log_file, file_mode, log_level):
+     for handler in logger.handlers:
+         if isinstance(handler, logging.FileHandler):
+             return
+
+     if log_file is not None:
+         file_handler = logging.FileHandler(log_file, file_mode)
+         file_handler.setFormatter(formatter)
+         file_handler.setLevel(log_level)
+         logger.addHandler(file_handler)
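
A brief usage sketch for the `get_logger` helper added above (the log file name is a hypothetical example):

```python
# Minimal usage sketch for the get_logger helper in evalscope/utils/logger.py;
# 'eval.log' is a hypothetical path.
from evalscope.utils.logger import get_logger

logger = get_logger(log_file='eval.log', file_mode='w')  # stream + file handler
logger.info('starting evaluation')

# Repeated calls return the same package-level logger; add_file_handler_if_needed
# only attaches another FileHandler when none is present yet.
same_logger = get_logger(log_file='eval.log')
assert same_logger is logger
```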

{evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope/utils/task_utils.py

@@ -11,9 +11,6 @@ class EvalBackend(Enum):
 
      # Use VLM Eval Kit as the multi-modal model evaluation backend
      VLM_EVAL_KIT = 'VLMEvalKit'
-
-     # Use RAGEval as the RAG evaluation backend
-     RAG_EVAL = 'RAGEval'
 
      # Use third-party evaluation backend/modules
      THIRD_PARTY = 'ThirdParty'
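
A small sketch of the practical effect of this removal (only the members visible in this hunk are referenced):

```python
# Sketch: with RAG_EVAL removed from the EvalBackend enum in 0.5.5rc1,
# referencing the old member raises AttributeError.
from evalscope.utils.task_utils import EvalBackend

print(EvalBackend.VLM_EVAL_KIT.value)  # 'VLMEvalKit'
print(EvalBackend.THIRD_PARTY.value)   # 'ThirdParty'

try:
    EvalBackend.RAG_EVAL
except AttributeError:
    print('RAG_EVAL backend is no longer defined in 0.5.5rc1')
```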

evalscope-0.5.5rc1/evalscope/version.py

@@ -0,0 +1,4 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ __version__ = '0.5.5rc1'
+ __release_datetime__ = '2024-09-29 08:00:00'
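
The new `version.py` exposes the release metadata; a minimal sketch of reading it:

```python
# Minimal sketch: reading the version metadata defined in evalscope/version.py.
from evalscope.version import __version__, __release_datetime__

print(__version__)           # '0.5.5rc1'
print(__release_datetime__)  # '2024-09-29 08:00:00'
```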

{evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.5.5
+ Version: 0.5.5rc1
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -24,7 +24,7 @@ Requires-Dist: editdistance
  Requires-Dist: jsonlines
  Requires-Dist: matplotlib
  Requires-Dist: modelscope[framework]
- Requires-Dist: nltk>=3.9
+ Requires-Dist: nltk
  Requires-Dist: openai
  Requires-Dist: pandas
  Requires-Dist: plotly
@@ -34,7 +34,7 @@ Requires-Dist: pyyaml
  Requires-Dist: regex
  Requires-Dist: requests
  Requires-Dist: requests-toolbelt
- Requires-Dist: rouge-score>=0.1.0
+ Requires-Dist: rouge-score
  Requires-Dist: sacrebleu
  Requires-Dist: scikit-learn
  Requires-Dist: seaborn
@@ -51,9 +51,6 @@ Provides-Extra: opencompass
  Requires-Dist: ms-opencompass>=0.1.1; extra == "opencompass"
  Provides-Extra: vlmeval
  Requires-Dist: ms-vlmeval>=0.0.5; extra == "vlmeval"
- Provides-Extra: rag
- Requires-Dist: ragas; extra == "rag"
- Requires-Dist: mteb>=0.14.16; extra == "rag"
  Provides-Extra: inner
  Requires-Dist: absl-py; extra == "inner"
  Requires-Dist: accelerate; extra == "inner"
@@ -91,7 +88,7 @@ Requires-Dist: editdistance; extra == "all"
  Requires-Dist: jsonlines; extra == "all"
  Requires-Dist: matplotlib; extra == "all"
  Requires-Dist: modelscope[framework]; extra == "all"
- Requires-Dist: nltk>=3.9; extra == "all"
+ Requires-Dist: nltk; extra == "all"
  Requires-Dist: openai; extra == "all"
  Requires-Dist: pandas; extra == "all"
  Requires-Dist: plotly; extra == "all"
@@ -101,7 +98,7 @@ Requires-Dist: pyyaml; extra == "all"
  Requires-Dist: regex; extra == "all"
  Requires-Dist: requests; extra == "all"
  Requires-Dist: requests-toolbelt; extra == "all"
- Requires-Dist: rouge-score>=0.1.0; extra == "all"
+ Requires-Dist: rouge-score; extra == "all"
  Requires-Dist: sacrebleu; extra == "all"
  Requires-Dist: scikit-learn; extra == "all"
  Requires-Dist: seaborn; extra == "all"
@@ -116,8 +113,6 @@ Requires-Dist: jieba; extra == "all"
  Requires-Dist: rouge-chinese; extra == "all"
  Requires-Dist: ms-opencompass>=0.1.1; extra == "all"
  Requires-Dist: ms-vlmeval>=0.0.5; extra == "all"
- Requires-Dist: ragas; extra == "all"
- Requires-Dist: mteb>=0.14.16; extra == "all"
 
  English | [简体中文](README_zh.md)
 
@@ -150,11 +145,30 @@ English | [简体中文](README_zh.md)
 
  ## 📝 Introduction
 
- EvalScope is the official model evaluation and performance benchmarking framework launched by the [ModelScope](https://modelscope.cn/) community. It comes with built-in common benchmarks and evaluation metrics, such as MMLU, CMMLU, C-Eval, GSM8K, ARC, HellaSwag, TruthfulQA, MATH, and HumanEval. EvalScope supports various types of model evaluations, including LLMs, multimodal LLMs, embedding models, and reranker models. It is also applicable to multiple evaluation scenarios, such as end-to-end RAG evaluation, arena mode, and model inference performance stress testing. Moreover, with the seamless integration of the ms-swift training framework, evaluations can be initiated with a single click, providing full end-to-end support from model training to evaluation 🚀
+ Large Model (including Large Language Models, Multi-modal Large Language Models) evaluation has become a critical process for assessing and improving LLMs. To better support the evaluation of large models, we propose the EvalScope framework.
+
+ ### Framework Features
+ - **Benchmark Datasets**: Preloaded with several commonly used test benchmarks, including MMLU, CMMLU, C-Eval, GSM8K, ARC, HellaSwag, TruthfulQA, MATH, HumanEval, etc.
+ - **Evaluation Metrics**: Implements various commonly used evaluation metrics.
+ - **Model Access**: A unified model access mechanism that is compatible with the Generate and Chat interfaces of multiple model families.
+ - **Automated Evaluation**: Includes automatic evaluation of objective questions and complex task evaluation using expert models.
+ - **Evaluation Reports**: Automatically generates evaluation reports.
+ - **Arena Mode**: Used for comparisons between models and objective evaluation of models, supporting various evaluation modes, including:
+   - **Single mode**: Scoring a single model.
+   - **Pairwise-baseline mode**: Comparing against a baseline model.
+   - **Pairwise (all) mode**: Pairwise comparison among all models.
+ - **Visualization Tools**: Provides intuitive displays of evaluation results.
+ - **Model Performance Evaluation**: Offers a performance testing tool for model inference services and detailed statistics, see [Model Performance Evaluation Documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test.html).
+ - **OpenCompass Integration**: Supports OpenCompass as the evaluation backend, providing advanced encapsulation and task simplification, allowing for easier task submission for evaluation.
+ - **VLMEvalKit Integration**: Supports VLMEvalKit as the evaluation backend, facilitating the initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
+ - **Full-Link Support**: Through seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, provides a one-stop development process for model training, model deployment, model evaluation, and report viewing, enhancing user development efficiency.
+
+
+ <details><summary>Overall Architecture</summary>
 
  <p align="center">
    <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
-   <br>EvalScope Framework.
+   <br>Fig 1. EvalScope Framework.
  </p>
 
  The architecture includes the following modules:
@@ -164,15 +178,14 @@ The architecture includes the following modules:
    - **Native**: EvalScope’s own **default evaluation framework**, supporting various evaluation modes, including single model evaluation, arena mode, baseline model comparison mode, etc.
    - **OpenCompass**: Supports [OpenCompass](https://github.com/open-compass/opencompass) as the evaluation backend, providing advanced encapsulation and task simplification, allowing you to submit tasks for evaluation more easily.
    - **VLMEvalKit**: Supports [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) as the evaluation backend, enabling easy initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
-   - **RAGEval**: Supports RAG evaluation, supporting independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
    - **ThirdParty**: Other third-party evaluation tasks, such as ToolBench.
  4. **Performance Evaluator**: Model performance evaluation, responsible for measuring model inference service performance, including performance testing, stress testing, performance report generation, and visualization.
  5. **Evaluation Report**: The final generated evaluation report summarizes the model's performance, which can be used for decision-making and further model optimization.
  6. **Visualization**: Visualization results help users intuitively understand evaluation results, facilitating analysis and comparison of different model performances.
+ </details>
 
 
  ## 🎉 News
- - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
  - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
  - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
  - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -343,10 +356,9 @@ run_task(task_cfg=your_task_cfg)
  ## Evaluation Backend
  EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
  - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
- - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
- - [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/vlmevalkit_backend.html)
- - **RAGEval**: Initiate RAG evaluation tasks through EvalScope, supporting independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html): [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/index.html)
- - **ThirdParty**: Third-party evaluation tasks, such as [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) and [LongBench-Write](https://evalscope.readthedocs.io/en/latest/third_party/longwriter.html).
+ - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/opencompass_backend.html)
+ - [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/vlmevalkit_backend.html)
+ - **ThirdParty**: The third-party task, e.g. [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html), you can contribute your own evaluation task to EvalScope as third-party backend.
 
  ## Custom Dataset Evaluation
  EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset.html)
@@ -375,8 +387,6 @@ Refer to : [Leaderboard](https://modelscope.cn/leaderboard/58/ranking?type=free)
 
 
  ## TO-DO List
- - [x] RAG evaluation
- - [x] VLM evaluation
  - [x] Agents evaluation
  - [x] vLLM
  - [ ] Distributed evaluating
@@ -388,7 +398,3 @@ Refer to : [Leaderboard](https://modelscope.cn/leaderboard/58/ranking?type=free)
  - [ ] Auto-reviewer
  - [ ] Qwen-max
 
-
- ## Star History
-
- [![Star History Chart](https://api.star-history.com/svg?repos=modelscope/evalscope&type=Date)](https://star-history.com/#modelscope/evalscope&Date)

{evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope.egg-info/SOURCES.txt

@@ -23,22 +23,6 @@ evalscope/backend/opencompass/backend_manager.py
  evalscope/backend/opencompass/tasks/__init__.py
  evalscope/backend/opencompass/tasks/eval_api.py
  evalscope/backend/opencompass/tasks/eval_datasets.py
- evalscope/backend/rag_eval/__init__.py
- evalscope/backend/rag_eval/backend_manager.py
- evalscope/backend/rag_eval/cmteb/__init__.py
- evalscope/backend/rag_eval/cmteb/arguments.py
- evalscope/backend/rag_eval/cmteb/base.py
- evalscope/backend/rag_eval/cmteb/task_template.py
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py
- evalscope/backend/rag_eval/cmteb/tasks/STS.py
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py
- evalscope/backend/rag_eval/ragas/__init__.py
- evalscope/backend/rag_eval/ragas/arguments.py
- evalscope/backend/rag_eval/ragas/task_template.py
  evalscope/backend/vlm_eval_kit/__init__.py
  evalscope/backend/vlm_eval_kit/backend_manager.py
  evalscope/backend/vlm_eval_kit/custom_dataset.py

{evalscope-0.5.5 → evalscope-0.5.5rc1}/evalscope.egg-info/requires.txt

@@ -7,7 +7,7 @@ editdistance
  jsonlines
  matplotlib
  modelscope[framework]
- nltk>=3.9
+ nltk
  openai
  pandas
  plotly
@@ -17,7 +17,7 @@ pyyaml
  regex
  requests
  requests-toolbelt
- rouge-score>=0.1.0
+ rouge-score
  sacrebleu
  scikit-learn
  seaborn
@@ -41,7 +41,7 @@ editdistance
  jsonlines
  matplotlib
  modelscope[framework]
- nltk>=3.9
+ nltk
  openai
  pandas
  plotly
@@ -51,7 +51,7 @@ pyyaml
  regex
  requests
  requests-toolbelt
- rouge-score>=0.1.0
+ rouge-score
  sacrebleu
  scikit-learn
  seaborn
@@ -66,8 +66,6 @@ jieba
  rouge-chinese
  ms-opencompass>=0.1.1
  ms-vlmeval>=0.0.5
- ragas
- mteb>=0.14.16
 
  [inner]
  absl-py
@@ -99,9 +97,5 @@ transformers_stream_generator
  [opencompass]
  ms-opencompass>=0.1.1
 
- [rag]
- ragas
- mteb>=0.14.16
-
  [vlmeval]
  ms-vlmeval>=0.0.5
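
With the `rag` extra dropped and the version pins on `nltk` and `rouge-score` relaxed, the declared requirements of an installed `evalscope` distribution can be inspected from Python. A sketch using only the standard library (assumes the package is installed in the current environment):

```python
# Sketch: listing the declared dependencies and extras of an installed
# evalscope distribution via importlib.metadata (Python 3.8+).
from importlib import metadata

dist = metadata.distribution('evalscope')
print(dist.version)                             # e.g. '0.5.5rc1'
print(dist.metadata.get_all('Provides-Extra'))  # no 'rag' extra in 0.5.5rc1

for req in metadata.requires('evalscope') or []:
    print(req)                                  # e.g. 'nltk' instead of 'nltk>=3.9'
```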

evalscope-0.5.5/evalscope/backend/rag_eval/__init__.py

@@ -1,3 +0,0 @@
- from evalscope.backend.rag_eval.utils.embedding import EmbeddingModel
- from evalscope.backend.rag_eval.utils.llm import LLM
- from evalscope.backend.rag_eval.backend_manager import RAGEvalBackendManager