evalscope 0.8.2.tar.gz → 0.10.0.tar.gz

This diff compares the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.

Potentially problematic release: this version of evalscope has been flagged as possibly problematic.

Files changed (331)
  1. {evalscope-0.8.2/evalscope.egg-info → evalscope-0.10.0}/PKG-INFO +115 -21
  2. {evalscope-0.8.2 → evalscope-0.10.0}/README.md +109 -20
  3. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/__init__.py +2 -0
  4. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/arguments.py +11 -3
  5. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  6. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/utils/llm.py +1 -1
  7. evalscope-0.10.0/evalscope/benchmarks/__init__.py +23 -0
  8. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/arc/arc_adapter.py +24 -102
  9. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  10. evalscope-0.10.0/evalscope/benchmarks/benchmark.py +76 -0
  11. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  12. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  13. evalscope-0.10.0/evalscope/benchmarks/competition_math/competition_math_adapter.py +126 -0
  14. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/data_adapter.py +115 -87
  15. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
  16. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
  17. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
  18. evalscope-0.10.0/evalscope/benchmarks/humaneval/humaneval_adapter.py +104 -0
  19. evalscope-0.10.0/evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  20. evalscope-0.10.0/evalscope/benchmarks/ifeval/instructions.py +1478 -0
  21. evalscope-0.10.0/evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  22. evalscope-0.10.0/evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  23. evalscope-0.10.0/evalscope/benchmarks/ifeval/utils.py +134 -0
  24. evalscope-0.10.0/evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  25. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  26. evalscope-0.10.0/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  27. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/race/race_adapter.py +26 -123
  28. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  29. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  30. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/cli/cli.py +2 -0
  31. evalscope-0.10.0/evalscope/cli/start_app.py +29 -0
  32. evalscope-0.10.0/evalscope/collections/__init__.py +3 -0
  33. evalscope-0.10.0/evalscope/collections/evaluator.py +198 -0
  34. evalscope-0.10.0/evalscope/collections/sampler.py +138 -0
  35. evalscope-0.10.0/evalscope/collections/schema.py +126 -0
  36. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/config.py +7 -5
  37. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/constants.py +9 -26
  38. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/evaluator/evaluator.py +87 -121
  39. evalscope-0.10.0/evalscope/evaluator/reviewer/__init__.py +1 -0
  40. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  41. evalscope-0.10.0/evalscope/metrics/__init__.py +4 -0
  42. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  43. evalscope-0.10.0/evalscope/metrics/math_accuracy.py +200 -0
  44. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/metrics/metrics.py +18 -6
  45. evalscope-0.10.0/evalscope/metrics/named_metrics.py +17 -0
  46. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/metrics/rouge_metric.py +13 -8
  47. evalscope-0.10.0/evalscope/models/__init__.py +16 -0
  48. evalscope-0.10.0/evalscope/models/base_adapter.py +52 -0
  49. evalscope-0.10.0/evalscope/models/chat_adapter.py +138 -0
  50. evalscope-0.10.0/evalscope/models/choice_adapter.py +211 -0
  51. evalscope-0.10.0/evalscope/models/custom_adapter.py +67 -0
  52. evalscope-0.10.0/evalscope/models/local_model.py +74 -0
  53. evalscope-0.10.0/evalscope/models/model.py +229 -0
  54. evalscope-0.10.0/evalscope/models/server_adapter.py +111 -0
  55. evalscope-0.10.0/evalscope/perf/__init__.py +1 -0
  56. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/main.py +0 -1
  57. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/plugin/api/custom_api.py +1 -1
  58. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/plugin/api/openai_api.py +1 -1
  59. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  60. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  61. evalscope-0.10.0/evalscope/perf/utils/__init__.py +0 -0
  62. evalscope-0.10.0/evalscope/registry/__init__.py +1 -0
  63. evalscope-0.10.0/evalscope/report/__init__.py +5 -0
  64. evalscope-0.10.0/evalscope/report/app.py +506 -0
  65. evalscope-0.10.0/evalscope/report/combinator.py +73 -0
  66. evalscope-0.10.0/evalscope/report/generator.py +80 -0
  67. evalscope-0.10.0/evalscope/report/utils.py +133 -0
  68. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/run.py +48 -72
  69. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/run_arena.py +1 -1
  70. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/summarizer.py +1 -1
  71. evalscope-0.10.0/evalscope/third_party/__init__.py +1 -0
  72. evalscope-0.10.0/evalscope/third_party/longbench_write/resources/__init__.py +1 -0
  73. evalscope-0.10.0/evalscope/third_party/longbench_write/tools/__init__.py +1 -0
  74. evalscope-0.10.0/evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
  75. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/utils/__init__.py +1 -1
  76. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/utils/chat_service.py +5 -4
  77. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/utils/io_utils.py +8 -0
  78. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/utils/logger.py +5 -0
  79. evalscope-0.10.0/evalscope/utils/model_utils.py +24 -0
  80. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/utils/utils.py +3 -25
  81. evalscope-0.10.0/evalscope/version.py +4 -0
  82. {evalscope-0.8.2 → evalscope-0.10.0/evalscope.egg-info}/PKG-INFO +115 -21
  83. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope.egg-info/SOURCES.txt +31 -9
  84. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope.egg-info/requires.txt +6 -0
  85. evalscope-0.10.0/requirements/app.txt +2 -0
  86. {evalscope-0.8.2 → evalscope-0.10.0}/setup.py +2 -0
  87. evalscope-0.10.0/tests/__init__.py +1 -0
  88. evalscope-0.10.0/tests/cli/__init__.py +1 -0
  89. evalscope-0.10.0/tests/cli/test_collection.py +57 -0
  90. {evalscope-0.8.2 → evalscope-0.10.0}/tests/cli/test_run.py +52 -1
  91. evalscope-0.10.0/tests/perf/__init__.py +1 -0
  92. evalscope-0.10.0/tests/rag/__init__.py +0 -0
  93. {evalscope-0.8.2 → evalscope-0.10.0}/tests/rag/test_mteb.py +3 -2
  94. evalscope-0.10.0/tests/swift/__init__.py +1 -0
  95. evalscope-0.10.0/tests/vlm/__init__.py +1 -0
  96. evalscope-0.8.2/evalscope/benchmarks/__init__.py +0 -4
  97. evalscope-0.8.2/evalscope/benchmarks/arc/__init__.py +0 -6
  98. evalscope-0.8.2/evalscope/benchmarks/bbh/__init__.py +0 -5
  99. evalscope-0.8.2/evalscope/benchmarks/benchmark.py +0 -65
  100. evalscope-0.8.2/evalscope/benchmarks/ceval/__init__.py +0 -6
  101. evalscope-0.8.2/evalscope/benchmarks/cmmlu/__init__.py +0 -6
  102. evalscope-0.8.2/evalscope/benchmarks/competition_math/__init__.py +0 -6
  103. evalscope-0.8.2/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -468
  104. evalscope-0.8.2/evalscope/benchmarks/general_qa/__init__.py +0 -6
  105. evalscope-0.8.2/evalscope/benchmarks/gsm8k/__init__.py +0 -5
  106. evalscope-0.8.2/evalscope/benchmarks/hellaswag/__init__.py +0 -6
  107. evalscope-0.8.2/evalscope/benchmarks/humaneval/__init__.py +0 -5
  108. evalscope-0.8.2/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -206
  109. evalscope-0.8.2/evalscope/benchmarks/mmlu/__init__.py +0 -6
  110. evalscope-0.8.2/evalscope/benchmarks/race/__init__.py +0 -6
  111. evalscope-0.8.2/evalscope/benchmarks/trivia_qa/__init__.py +0 -6
  112. evalscope-0.8.2/evalscope/benchmarks/truthful_qa/__init__.py +0 -6
  113. evalscope-0.8.2/evalscope/metrics/math_accuracy.py +0 -57
  114. evalscope-0.8.2/evalscope/models/__init__.py +0 -3
  115. evalscope-0.8.2/evalscope/models/api/__init__.py +0 -3
  116. evalscope-0.8.2/evalscope/models/dummy_chat_model.py +0 -49
  117. evalscope-0.8.2/evalscope/models/model.py +0 -88
  118. evalscope-0.8.2/evalscope/models/model_adapter.py +0 -525
  119. evalscope-0.8.2/evalscope/models/openai_model.py +0 -103
  120. evalscope-0.8.2/evalscope/tools/combine_reports.py +0 -133
  121. evalscope-0.8.2/evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  122. evalscope-0.8.2/evalscope/utils/model_utils.py +0 -11
  123. evalscope-0.8.2/evalscope/version.py +0 -4
  124. {evalscope-0.8.2 → evalscope-0.10.0}/LICENSE +0 -0
  125. {evalscope-0.8.2 → evalscope-0.10.0}/MANIFEST.in +0 -0
  126. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/__init__.py +0 -0
  127. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/base.py +0 -0
  128. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/opencompass/__init__.py +0 -0
  129. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  130. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/opencompass/backend_manager.py +0 -0
  131. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  132. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  133. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
  134. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/__init__.py +0 -0
  135. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/backend_manager.py +0 -0
  136. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
  137. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
  138. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
  139. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
  140. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  141. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
  142. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
  143. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
  144. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
  145. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  146. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
  147. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
  148. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
  149. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  150. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
  151. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
  152. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  153. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
  154. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  155. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  156. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
  157. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
  158. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
  159. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
  160. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
  161. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
  162. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
  163. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
  164. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
  165. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
  166. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
  167. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/utils/clip.py +0 -0
  168. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
  169. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/utils/tools.py +0 -0
  170. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  171. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
  172. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
  173. {evalscope-0.8.2/evalscope/cli → evalscope-0.10.0/evalscope/benchmarks/arc}/__init__.py +0 -0
  174. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  175. {evalscope-0.8.2/evalscope/evaluator/reviewer → evalscope-0.10.0/evalscope/benchmarks/bbh}/__init__.py +0 -0
  176. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  177. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  178. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  179. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  180. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  181. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  182. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  183. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  184. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  185. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  186. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  187. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  188. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  189. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  190. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  191. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  192. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  193. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  194. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  195. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  196. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  197. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  198. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  199. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  200. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  201. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  202. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  203. {evalscope-0.8.2/evalscope/metrics → evalscope-0.10.0/evalscope/benchmarks/ceval}/__init__.py +0 -0
  204. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  205. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/ceval/samples.jsonl +0 -0
  206. {evalscope-0.8.2/evalscope/registry → evalscope-0.10.0/evalscope/benchmarks/cmmlu}/__init__.py +0 -0
  207. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  208. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
  209. {evalscope-0.8.2/evalscope/third_party → evalscope-0.10.0/evalscope/benchmarks/competition_math}/__init__.py +0 -0
  210. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  211. {evalscope-0.8.2/evalscope/third_party/longbench_write/resources → evalscope-0.10.0/evalscope/benchmarks/general_qa}/__init__.py +0 -0
  212. {evalscope-0.8.2/evalscope/third_party/longbench_write/tools → evalscope-0.10.0/evalscope/benchmarks/gsm8k}/__init__.py +0 -0
  213. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  214. {evalscope-0.8.2/evalscope/third_party/toolbench_static/llm → evalscope-0.10.0/evalscope/benchmarks/hellaswag}/__init__.py +0 -0
  215. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  216. {evalscope-0.8.2/evalscope/tools → evalscope-0.10.0/evalscope/benchmarks/humaneval}/__init__.py +0 -0
  217. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  218. {evalscope-0.8.2/evalscope/perf → evalscope-0.10.0/evalscope/benchmarks/ifeval}/__init__.py +0 -0
  219. {evalscope-0.8.2/evalscope/perf/utils → evalscope-0.10.0/evalscope/benchmarks/iquiz}/__init__.py +0 -0
  220. {evalscope-0.8.2/tests → evalscope-0.10.0/evalscope/benchmarks/mmlu}/__init__.py +0 -0
  221. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  222. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
  223. {evalscope-0.8.2/tests/rag → evalscope-0.10.0/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
  224. {evalscope-0.8.2/tests/cli → evalscope-0.10.0/evalscope/benchmarks/race}/__init__.py +0 -0
  225. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/race/race.py +0 -0
  226. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/race/samples.jsonl +0 -0
  227. {evalscope-0.8.2/tests/perf → evalscope-0.10.0/evalscope/benchmarks/trivia_qa}/__init__.py +0 -0
  228. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
  229. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  230. {evalscope-0.8.2/tests/swift → evalscope-0.10.0/evalscope/benchmarks/truthful_qa}/__init__.py +0 -0
  231. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  232. {evalscope-0.8.2/tests/vlm → evalscope-0.10.0/evalscope/cli}/__init__.py +0 -0
  233. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/cli/base.py +0 -0
  234. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/cli/start_eval.py +0 -0
  235. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/cli/start_perf.py +0 -0
  236. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/cli/start_server.py +0 -0
  237. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/evaluator/__init__.py +0 -0
  238. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/evaluator/rating_eval.py +0 -0
  239. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  240. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/metrics/code_metric.py +0 -0
  241. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -0
  242. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -0
  243. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/models/custom/__init__.py +0 -0
  244. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/models/custom/custom_model.py +0 -0
  245. /evalscope-0.8.2/evalscope/tools/rewrite_eval_results.py → /evalscope-0.10.0/evalscope/models/custom/dummy_model.py +0 -0
  246. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/arguments.py +0 -0
  247. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/benchmark.py +0 -0
  248. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/http_client.py +0 -0
  249. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/plugin/__init__.py +0 -0
  250. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/plugin/api/__init__.py +0 -0
  251. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/plugin/api/base.py +0 -0
  252. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
  253. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/plugin/datasets/__init__.py +0 -0
  254. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/plugin/datasets/base.py +0 -0
  255. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/plugin/datasets/custom.py +0 -0
  256. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
  257. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/plugin/datasets/openqa.py +0 -0
  258. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
  259. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/plugin/registry.py +0 -0
  260. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/utils/analysis_result.py +0 -0
  261. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/utils/benchmark_util.py +0 -0
  262. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/utils/db_util.py +0 -0
  263. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/utils/handler.py +0 -0
  264. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/perf/utils/local_server.py +0 -0
  265. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/registry/config/cfg_arena.yaml +0 -0
  266. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
  267. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
  268. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/registry/config/cfg_single.yaml +0 -0
  269. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
  270. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
  271. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
  272. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
  273. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/registry/data/question.jsonl +0 -0
  274. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/registry/tasks/arc.yaml +0 -0
  275. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/registry/tasks/bbh.yaml +0 -0
  276. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
  277. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/registry/tasks/ceval.yaml +0 -0
  278. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
  279. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/registry/tasks/cmmlu.yaml +0 -0
  280. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
  281. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/registry/tasks/general_qa.yaml +0 -0
  282. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/registry/tasks/gsm8k.yaml +0 -0
  283. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/registry/tasks/mmlu.yaml +0 -0
  284. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
  285. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/third_party/longbench_write/README.md +0 -0
  286. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/third_party/longbench_write/__init__.py +0 -0
  287. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/third_party/longbench_write/default_task.json +0 -0
  288. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
  289. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/third_party/longbench_write/eval.py +0 -0
  290. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/third_party/longbench_write/infer.py +0 -0
  291. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  292. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  293. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  294. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  295. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  296. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  297. {evalscope-0.8.2/evalscope/models/api → evalscope-0.10.0/evalscope/third_party/longbench_write/tools}/openai_api.py +0 -0
  298. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/third_party/longbench_write/utils.py +0 -0
  299. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/third_party/toolbench_static/README.md +0 -0
  300. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  301. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/third_party/toolbench_static/config_default.json +0 -0
  302. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
  303. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/third_party/toolbench_static/eval.py +0 -0
  304. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/third_party/toolbench_static/infer.py +0 -0
  305. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  306. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
  307. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  308. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/utils/arena_utils.py +0 -0
  309. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope/utils/completion_parsers.py +0 -0
  310. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope.egg-info/dependency_links.txt +0 -0
  311. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope.egg-info/entry_points.txt +0 -0
  312. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope.egg-info/not-zip-safe +0 -0
  313. {evalscope-0.8.2 → evalscope-0.10.0}/evalscope.egg-info/top_level.txt +0 -0
  314. {evalscope-0.8.2 → evalscope-0.10.0}/requirements/docs.txt +0 -0
  315. {evalscope-0.8.2 → evalscope-0.10.0}/requirements/framework.txt +0 -0
  316. {evalscope-0.8.2 → evalscope-0.10.0}/requirements/inner.txt +0 -0
  317. {evalscope-0.8.2 → evalscope-0.10.0}/requirements/opencompass.txt +0 -0
  318. {evalscope-0.8.2 → evalscope-0.10.0}/requirements/perf.txt +0 -0
  319. {evalscope-0.8.2 → evalscope-0.10.0}/requirements/rag.txt +0 -0
  320. {evalscope-0.8.2 → evalscope-0.10.0}/requirements/tests.txt +0 -0
  321. {evalscope-0.8.2 → evalscope-0.10.0}/requirements/vlmeval.txt +0 -0
  322. {evalscope-0.8.2 → evalscope-0.10.0}/requirements.txt +0 -0
  323. {evalscope-0.8.2 → evalscope-0.10.0}/setup.cfg +0 -0
  324. {evalscope-0.8.2 → evalscope-0.10.0}/tests/perf/test_perf.py +0 -0
  325. {evalscope-0.8.2 → evalscope-0.10.0}/tests/rag/test_clip_benchmark.py +0 -0
  326. {evalscope-0.8.2 → evalscope-0.10.0}/tests/rag/test_ragas.py +0 -0
  327. {evalscope-0.8.2 → evalscope-0.10.0}/tests/swift/test_run_swift_eval.py +0 -0
  328. {evalscope-0.8.2 → evalscope-0.10.0}/tests/swift/test_run_swift_vlm_eval.py +0 -0
  329. {evalscope-0.8.2 → evalscope-0.10.0}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
  330. {evalscope-0.8.2 → evalscope-0.10.0}/tests/test_run_all.py +0 -0
  331. {evalscope-0.8.2 → evalscope-0.10.0}/tests/vlm/test_vlmeval.py +0 -0
{evalscope-0.8.2/evalscope.egg-info → evalscope-0.10.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.8.2
+ Version: 0.10.0
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -63,6 +63,9 @@ Requires-Dist: numpy; extra == "perf"
  Requires-Dist: sse_starlette; extra == "perf"
  Requires-Dist: transformers; extra == "perf"
  Requires-Dist: unicorn; extra == "perf"
+ Provides-Extra: app
+ Requires-Dist: gradio>=5.4.0; extra == "app"
+ Requires-Dist: plotly>=5.23.0; extra == "app"
  Provides-Extra: inner
  Requires-Dist: absl-py; extra == "inner"
  Requires-Dist: accelerate; extra == "inner"
@@ -133,6 +136,8 @@ Requires-Dist: numpy; extra == "all"
  Requires-Dist: sse_starlette; extra == "all"
  Requires-Dist: transformers; extra == "all"
  Requires-Dist: unicorn; extra == "all"
+ Requires-Dist: gradio>=5.4.0; extra == "all"
+ Requires-Dist: plotly>=5.23.0; extra == "all"

  <p align="center">
  <br>
@@ -160,14 +165,16 @@ Requires-Dist: unicorn; extra == "all"
  > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!

  ## 📋 Contents
- - [Introduction](#introduction)
- - [News](#News)
- - [Installation](#installation)
- - [Quick Start](#quick-start)
+ - [Introduction](#-introduction)
+ - [News](#-news)
+ - [Installation](#️-installation)
+ - [Quick Start](#-quick-start)
  - [Evaluation Backend](#evaluation-backend)
- - [Custom Dataset Evaluation](#custom-dataset-evaluation)
- - [Model Serving Performance Evaluation](#Model-Serving-Performance-Evaluation)
- - [Arena Mode](#arena-mode)
+ - [Custom Dataset Evaluation](#️-custom-dataset-evaluation)
+ - [Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
+ - [Arena Mode](#-arena-mode)
+ - [Contribution](#️-contribution)
+ - [Roadmap](#-roadmap)


  ## 📝 Introduction
@@ -208,11 +215,17 @@ Please scan the QR code below to join our community groups:


  ## 🎉 News
+ - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visulization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
+ - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
+ - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
  - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
  - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
  - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
  - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
  - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
+
+ <details><summary>More</summary>
+
  - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
  - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
  - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -224,7 +237,7 @@ Please scan the QR code below to join our community groups:
  - 🔥 **[2024.06.13]** EvalScope seamlessly integrates with the fine-tuning framework SWIFT, providing full-chain support from LLM training to evaluation.
  - 🔥 **[2024.06.13]** Integrated the Agent evaluation dataset ToolBench.

-
+ </details>

  ## 🛠️ Installation
  ### Method 1: Install Using pip
@@ -368,15 +381,85 @@ run_task(task_cfg="config.json")
  - `--limit`: Maximum amount of evaluation data for each dataset. If not specified, it defaults to evaluating all data. Can be used for quick validation

  ### Output Results
+ ```text
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+ | Model Name | Dataset Name | Metric Name | Category Name | Subset Name | Num | Score |
+ +=======================+================+=================+=================+===============+=======+=========+
+ | Qwen2.5-0.5B-Instruct | gsm8k | AverageAccuracy | default | main | 5 | 0.4 |
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+ | Qwen2.5-0.5B-Instruct | ai2_arc | AverageAccuracy | default | ARC-Easy | 5 | 0.8 |
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+ | Qwen2.5-0.5B-Instruct | ai2_arc | AverageAccuracy | default | ARC-Challenge | 5 | 0.4 |
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
  ```
- +-----------------------+-------------------+-----------------+
- | Model | ai2_arc | gsm8k |
- +=======================+===================+=================+
- | Qwen2.5-0.5B-Instruct | (ai2_arc/acc) 0.6 | (gsm8k/acc) 0.6 |
- +-----------------------+-------------------+-----------------+
+
+ ## 📈 Visualization of Evaluation Results
+
+ 1. Install the dependencies required for visualization, including gradio, plotly, etc.
+ ```bash
+ pip install 'evalscope[app]'
+ ```
+
+ 2. Start the Visualization Service
+
+ Run the following command to start the visualization service.
+ ```bash
+ evalscope app
  ```
+ You can access the visualization service in the browser if the following output appears.
+ ```text
+ * Running on local URL: http://127.0.0.1:7861
+
+ To create a public link, set `share=True` in `launch()`.
+ ```
+
+ <table>
+ <tr>
+ <td style="text-align: center;">
+ <img src="docs/zh/get_started/images/setting.png" alt="Setting" style="width: 100%;" />
+ <p>Setting Interface</p>
+ </td>
+ <td style="text-align: center;">
+ <img src="docs/zh/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
+ <p>Model Comparison</p>
+ </td>
+ </tr>
+ <tr>
+ <td style="text-align: center;">
+ <img src="docs/zh/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
+ <p>Report Overview</p>
+ </td>
+ <td style="text-align: center;">
+ <img src="docs/zh/get_started/images/report_details.png" alt="Report Details" style="width: 100%;" />
+ <p>Report Details</p>
+ </td>
+ </tr>
+ </table>
+
+ For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visulization.html)
+
+ ## 🌐 Evaluation of Specified Model API
+
+ Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:
+
+ For example, to launch a model service using [vLLM](https://github.com/vllm-project/vllm):
+
+ ```shell
+ export VLLM_USE_MODELSCOPE=True && python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-0.5B-Instruct --served-model-name qwen2.5 --trust_remote_code --port 8801
+ ```
+ Then, you can use the following command to evaluate the model API service:
+ ```shell
+ evalscope eval \
+ --model qwen2.5 \
+ --api-url http://127.0.0.1:8801/v1/chat/completions \
+ --api-key EMPTY \
+ --eval-type service \
+ --datasets gsm8k \
+ --limit 10
+ ```
+
+ ## ⚙️ Custom Parameter Evaluation

- ## ⚙️ Complex Evaluation
  For more customized evaluations, such as customizing model parameters or dataset parameters, you can use the following command. The evaluation startup method is the same as simple evaluation. Below shows how to start the evaluation using the `eval` command:

  ```shell
@@ -414,7 +497,7 @@ EvalScope supports using third-party evaluation frameworks to initiate evaluatio
  - **ThirdParty**: Third-party evaluation tasks, such as [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) and [LongBench-Write](https://evalscope.readthedocs.io/en/latest/third_party/longwriter.html).


- ## Model Serving Performance Evaluation
+ ## 📈 Model Serving Performance Evaluation
  A stress testing tool focused on large language models, which can be customized to support various dataset formats and different API protocol formats.

  Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
@@ -439,19 +522,32 @@ Speed Benchmark Results:
  +---------------+-----------------+----------------+
  ```

- ## Custom Dataset Evaluation
+ ## 🖊️ Custom Dataset Evaluation
  EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html)


- ## Arena Mode
+ ## 🏟️ Arena Mode
  The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.

  Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)

+ ## 👷‍♂️ Contribution

+ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn), is continuously optimizing its benchmark evaluation features! We invite you to refer to the [Contribution Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html) to easily add your own evaluation benchmarks and share your contributions with the community. Let’s work together to support the growth of EvalScope and make our tools even better! Join us now!

+ <a href="https://github.com/modelscope/evalscope/graphs/contributors" target="_blank">
+ <table>
+ <tr>
+ <th colspan="2">
+ <br><img src="https://contrib.rocks/image?repo=modelscope/evalscope"><br><br>
+ </th>
+ </tr>
+ </table>
+ </a>

- ## TO-DO List
+ ## 🔜 Roadmap
+ - [ ] Support for better evaluation report visualization
+ - [x] Support for mixed evaluations across multiple datasets
  - [x] RAG evaluation
  - [x] VLM evaluation
  - [x] Agents evaluation
@@ -462,8 +558,6 @@ Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/lates
  - [ ] GAIA
  - [ ] GPQA
  - [x] MBPP
- - [ ] Auto-reviewer
- - [ ] Qwen-max


  ## Star History

{evalscope-0.8.2 → evalscope-0.10.0}/README.md
@@ -24,14 +24,16 @@
  > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!

  ## 📋 Contents
- - [Introduction](#introduction)
- - [News](#News)
- - [Installation](#installation)
- - [Quick Start](#quick-start)
+ - [Introduction](#-introduction)
+ - [News](#-news)
+ - [Installation](#️-installation)
+ - [Quick Start](#-quick-start)
  - [Evaluation Backend](#evaluation-backend)
- - [Custom Dataset Evaluation](#custom-dataset-evaluation)
- - [Model Serving Performance Evaluation](#Model-Serving-Performance-Evaluation)
- - [Arena Mode](#arena-mode)
+ - [Custom Dataset Evaluation](#️-custom-dataset-evaluation)
+ - [Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
+ - [Arena Mode](#-arena-mode)
+ - [Contribution](#️-contribution)
+ - [Roadmap](#-roadmap)


  ## 📝 Introduction
@@ -72,11 +74,17 @@ Please scan the QR code below to join our community groups:


  ## 🎉 News
+ - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visulization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
+ - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
+ - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
  - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
  - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
  - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
  - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
  - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
+
+ <details><summary>More</summary>
+
  - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
  - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
  - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -88,7 +96,7 @@ Please scan the QR code below to join our community groups:
  - 🔥 **[2024.06.13]** EvalScope seamlessly integrates with the fine-tuning framework SWIFT, providing full-chain support from LLM training to evaluation.
  - 🔥 **[2024.06.13]** Integrated the Agent evaluation dataset ToolBench.

-
+ </details>

  ## 🛠️ Installation
  ### Method 1: Install Using pip
@@ -232,15 +240,85 @@ run_task(task_cfg="config.json")
  - `--limit`: Maximum amount of evaluation data for each dataset. If not specified, it defaults to evaluating all data. Can be used for quick validation

  ### Output Results
+ ```text
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+ | Model Name | Dataset Name | Metric Name | Category Name | Subset Name | Num | Score |
+ +=======================+================+=================+=================+===============+=======+=========+
+ | Qwen2.5-0.5B-Instruct | gsm8k | AverageAccuracy | default | main | 5 | 0.4 |
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+ | Qwen2.5-0.5B-Instruct | ai2_arc | AverageAccuracy | default | ARC-Easy | 5 | 0.8 |
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+ | Qwen2.5-0.5B-Instruct | ai2_arc | AverageAccuracy | default | ARC-Challenge | 5 | 0.4 |
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
  ```
- +-----------------------+-------------------+-----------------+
- | Model | ai2_arc | gsm8k |
- +=======================+===================+=================+
- | Qwen2.5-0.5B-Instruct | (ai2_arc/acc) 0.6 | (gsm8k/acc) 0.6 |
- +-----------------------+-------------------+-----------------+
+
+ ## 📈 Visualization of Evaluation Results
+
+ 1. Install the dependencies required for visualization, including gradio, plotly, etc.
+ ```bash
+ pip install 'evalscope[app]'
+ ```
+
+ 2. Start the Visualization Service
+
+ Run the following command to start the visualization service.
+ ```bash
+ evalscope app
  ```
+ You can access the visualization service in the browser if the following output appears.
+ ```text
+ * Running on local URL: http://127.0.0.1:7861
+
+ To create a public link, set `share=True` in `launch()`.
+ ```
+
+ <table>
+ <tr>
+ <td style="text-align: center;">
+ <img src="docs/zh/get_started/images/setting.png" alt="Setting" style="width: 100%;" />
+ <p>Setting Interface</p>
+ </td>
+ <td style="text-align: center;">
+ <img src="docs/zh/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
+ <p>Model Comparison</p>
+ </td>
+ </tr>
+ <tr>
+ <td style="text-align: center;">
+ <img src="docs/zh/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
+ <p>Report Overview</p>
+ </td>
+ <td style="text-align: center;">
+ <img src="docs/zh/get_started/images/report_details.png" alt="Report Details" style="width: 100%;" />
+ <p>Report Details</p>
+ </td>
+ </tr>
+ </table>
+
+ For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visulization.html)
+
+ ## 🌐 Evaluation of Specified Model API
+
+ Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:
+
+ For example, to launch a model service using [vLLM](https://github.com/vllm-project/vllm):
+
+ ```shell
+ export VLLM_USE_MODELSCOPE=True && python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-0.5B-Instruct --served-model-name qwen2.5 --trust_remote_code --port 8801
+ ```
+ Then, you can use the following command to evaluate the model API service:
+ ```shell
+ evalscope eval \
+ --model qwen2.5 \
+ --api-url http://127.0.0.1:8801/v1/chat/completions \
+ --api-key EMPTY \
+ --eval-type service \
+ --datasets gsm8k \
+ --limit 10
+ ```
+
+ ## ⚙️ Custom Parameter Evaluation

- ## ⚙️ Complex Evaluation
  For more customized evaluations, such as customizing model parameters or dataset parameters, you can use the following command. The evaluation startup method is the same as simple evaluation. Below shows how to start the evaluation using the `eval` command:

  ```shell
@@ -278,7 +356,7 @@ EvalScope supports using third-party evaluation frameworks to initiate evaluatio
  - **ThirdParty**: Third-party evaluation tasks, such as [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) and [LongBench-Write](https://evalscope.readthedocs.io/en/latest/third_party/longwriter.html).


- ## Model Serving Performance Evaluation
+ ## 📈 Model Serving Performance Evaluation
  A stress testing tool focused on large language models, which can be customized to support various dataset formats and different API protocol formats.

  Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
@@ -303,19 +381,32 @@ Speed Benchmark Results:
  +---------------+-----------------+----------------+
  ```

- ## Custom Dataset Evaluation
+ ## 🖊️ Custom Dataset Evaluation
  EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html)


- ## Arena Mode
+ ## 🏟️ Arena Mode
  The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.

  Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)

+ ## 👷‍♂️ Contribution

+ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn), is continuously optimizing its benchmark evaluation features! We invite you to refer to the [Contribution Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html) to easily add your own evaluation benchmarks and share your contributions with the community. Let’s work together to support the growth of EvalScope and make our tools even better! Join us now!

+ <a href="https://github.com/modelscope/evalscope/graphs/contributors" target="_blank">
+ <table>
+ <tr>
+ <th colspan="2">
+ <br><img src="https://contrib.rocks/image?repo=modelscope/evalscope"><br><br>
+ </th>
+ </tr>
+ </table>
+ </a>

- ## TO-DO List
+ ## 🔜 Roadmap
+ - [ ] Support for better evaluation report visualization
+ - [x] Support for mixed evaluations across multiple datasets
  - [x] RAG evaluation
  - [x] VLM evaluation
  - [x] Agents evaluation
@@ -326,8 +417,6 @@ Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/lates
  - [ ] GAIA
  - [ ] GPQA
  - [x] MBPP
- - [ ] Auto-reviewer
- - [ ] Qwen-max


  ## Star History

{evalscope-0.8.2 → evalscope-0.10.0}/evalscope/__init__.py
@@ -1,3 +1,5 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

+ from evalscope.config import TaskConfig
+ from evalscope.run import run_task
  from .version import __release_datetime__, __version__

{evalscope-0.8.2 → evalscope-0.10.0}/evalscope/arguments.py
@@ -1,6 +1,8 @@
  import argparse
  import json

+ from evalscope.constants import EvalBackend, EvalStage, EvalType
+

  class ParseStrArgsAction(argparse.Action):

@@ -31,6 +33,7 @@ def add_argument(parser: argparse.ArgumentParser):
  # yapf: disable
  # Model-related arguments
  parser.add_argument('--model', type=str, required=False, help='The model id on modelscope, or local model dir.')
+ parser.add_argument('--model-id', type=str, required=False, help='The model id for model name in report.')
  parser.add_argument('--model-args', type=str, action=ParseStrArgsAction, help='The model args, should be a string.')

  # Template-related arguments
@@ -47,10 +50,13 @@ def add_argument(parser: argparse.ArgumentParser):
  parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.') # noqa: E501

  # Evaluation-related arguments
- parser.add_argument('--eval-type', type=str, help='The type for evaluating.')
- parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.')
+ parser.add_argument('--eval-type', type=str, help='The type for evaluating.',
+                     choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
+ parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
+                     choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL]) # noqa: E501
  parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.') # noqa: E501
- parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.')
+ parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
+                     choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
  parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')

  # Cache and working directory arguments
@@ -62,6 +68,8 @@ def add_argument(parser: argparse.ArgumentParser):
  parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.') # noqa: E501
  parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
  parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
+ parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
+ parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
  # yapf: enable


{evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py
@@ -3,7 +3,6 @@ Code adapated from https://github.com/mlfoundations/open_clip/blob/main/src/trai
  Thanks to the authors of OpenCLIP
  """

- import logging
  import torch
  import torch.nn.functional as F
  from contextlib import suppress

{evalscope-0.8.2 → evalscope-0.10.0}/evalscope/backend/rag_eval/utils/llm.py
@@ -6,7 +6,7 @@ from modelscope.utils.hf_util import GenerationConfig
  from typing import Any, Dict, Iterator, List, Mapping, Optional

  from evalscope.constants import DEFAULT_MODEL_REVISION
- from evalscope.models.model_adapter import ChatGenerationModelAdapter
+ from evalscope.models import ChatGenerationModelAdapter


  class LLM:

evalscope-0.10.0/evalscope/benchmarks/__init__.py
@@ -0,0 +1,23 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import glob
+ import importlib
+ import os
+
+ from evalscope.benchmarks.benchmark import Benchmark, BenchmarkMeta
+ from evalscope.benchmarks.data_adapter import DataAdapter
+ from evalscope.utils import get_logger
+
+ logger = get_logger()
+
+ # Using glob to find all files matching the pattern
+ pattern = os.path.join(os.path.dirname(__file__), '*', '*_adapter.py')
+ files = glob.glob(pattern, recursive=False)
+
+ for file_path in files:
+     if file_path.endswith('.py') and not os.path.basename(file_path).startswith('_'):
+         # Convert file path to a module path
+         relative_path = os.path.relpath(file_path, os.path.dirname(__file__))
+         module_path = relative_path[:-3].replace(os.path.sep, '.')  # strip '.py' and convert to module path
+         full_path = f'evalscope.benchmarks.{module_path}'
+         importlib.import_module(full_path)
+         # print(f'Importing {full_path}')
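
The last two hunks change how the package is driven from Python: `evalscope/__init__.py` now re-exports `TaskConfig` and `run_task` at the package root, and the new `evalscope/benchmarks/__init__.py` auto-imports every `*_adapter.py` module so benchmarks register themselves on import, replacing the per-benchmark `__init__.py` files deleted in this release. A minimal usage sketch follows; it assumes `TaskConfig` accepts keyword fields mirroring the CLI flags shown in the README excerpt above (`--model`, `--datasets`, `--limit`), which this diff does not itself confirm.

```python
# Hedged sketch, not part of the diff: drive an evaluation through the new
# top-level exports. The TaskConfig field names below are assumptions that
# mirror the CLI flags shown in the README excerpt (--model, --datasets, --limit).
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # assumed counterpart of --model
    datasets=['gsm8k'],                  # assumed counterpart of --datasets
    limit=5,                             # assumed counterpart of --limit
)
run_task(task_cfg=task_cfg)

# The README excerpt also shows run_task accepting a config file path:
# run_task(task_cfg="config.json")
```

Importing `evalscope.benchmarks` triggers the glob-based adapter discovery shown in the final hunk, so individual adapter modules no longer need to be imported by hand.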