evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- __version__ = '0.17.1'
- __release_datetime__ = '2025-07-18 17:00:00'
+ __version__ = '1.0.1'
+ __release_datetime__ = '2025-09-05 14:00:00'
{evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.17.1
+ Version: 1.0.1
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -17,9 +17,10 @@ Classifier: Programming Language :: Python :: 3.12
  Requires-Python: >=3.9
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: accelerate
- Requires-Dist: datasets==3.2.0
- Requires-Dist: immutabledict
+ Requires-Dist: colorlog
+ Requires-Dist: datasets==3.6.0
+ Requires-Dist: docstring-parser
+ Requires-Dist: dotenv
  Requires-Dist: jieba
  Requires-Dist: jsonlines
  Requires-Dist: langdetect
@@ -28,11 +29,13 @@ Requires-Dist: matplotlib
  Requires-Dist: modelscope[framework]>=1.27
  Requires-Dist: nltk>=3.9
  Requires-Dist: openai
+ Requires-Dist: overrides
  Requires-Dist: pandas
  Requires-Dist: pillow
- Requires-Dist: pyarrow
+ Requires-Dist: pydantic
  Requires-Dist: pyyaml>=5.1
  Requires-Dist: requests
+ Requires-Dist: rich
  Requires-Dist: rouge-chinese
  Requires-Dist: rouge-score>=0.1.0
  Requires-Dist: sacrebleu
@@ -40,7 +43,6 @@ Requires-Dist: scikit-learn
  Requires-Dist: seaborn
  Requires-Dist: sympy
  Requires-Dist: tabulate
- Requires-Dist: torch
  Requires-Dist: tqdm
  Requires-Dist: transformers>=4.33
  Requires-Dist: word2number
@@ -50,11 +52,14 @@ Requires-Dist: iopath; extra == "aigc"
  Requires-Dist: omegaconf; extra == "aigc"
  Requires-Dist: open-clip-torch; extra == "aigc"
  Requires-Dist: opencv-python; extra == "aigc"
+ Requires-Dist: peft>=0.17; extra == "aigc"
+ Requires-Dist: torch; extra == "aigc"
  Requires-Dist: torchvision; extra == "aigc"
  Provides-Extra: all
- Requires-Dist: accelerate; extra == "all"
- Requires-Dist: datasets==3.2.0; extra == "all"
- Requires-Dist: immutabledict; extra == "all"
+ Requires-Dist: colorlog; extra == "all"
+ Requires-Dist: datasets==3.6.0; extra == "all"
+ Requires-Dist: docstring-parser; extra == "all"
+ Requires-Dist: dotenv; extra == "all"
  Requires-Dist: jieba; extra == "all"
  Requires-Dist: jsonlines; extra == "all"
  Requires-Dist: langdetect; extra == "all"
@@ -63,11 +68,13 @@ Requires-Dist: matplotlib; extra == "all"
  Requires-Dist: modelscope[framework]>=1.27; extra == "all"
  Requires-Dist: nltk>=3.9; extra == "all"
  Requires-Dist: openai; extra == "all"
+ Requires-Dist: overrides; extra == "all"
  Requires-Dist: pandas; extra == "all"
  Requires-Dist: pillow; extra == "all"
- Requires-Dist: pyarrow; extra == "all"
+ Requires-Dist: pydantic; extra == "all"
  Requires-Dist: pyyaml>=5.1; extra == "all"
  Requires-Dist: requests; extra == "all"
+ Requires-Dist: rich; extra == "all"
  Requires-Dist: rouge-chinese; extra == "all"
  Requires-Dist: rouge-score>=0.1.0; extra == "all"
  Requires-Dist: sacrebleu; extra == "all"
@@ -75,7 +82,6 @@ Requires-Dist: scikit-learn; extra == "all"
  Requires-Dist: seaborn; extra == "all"
  Requires-Dist: sympy; extra == "all"
  Requires-Dist: tabulate; extra == "all"
- Requires-Dist: torch; extra == "all"
  Requires-Dist: tqdm; extra == "all"
  Requires-Dist: transformers>=4.33; extra == "all"
  Requires-Dist: word2number; extra == "all"
@@ -87,11 +93,11 @@ Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "all"
  Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "all"
  Requires-Dist: mteb==1.38.20; extra == "all"
  Requires-Dist: ragas==0.2.14; extra == "all"
+ Requires-Dist: torch; extra == "all"
  Requires-Dist: webdataset>0.2.0; extra == "all"
  Requires-Dist: aiohttp; extra == "all"
  Requires-Dist: fastapi; extra == "all"
  Requires-Dist: numpy; extra == "all"
- Requires-Dist: rich; extra == "all"
  Requires-Dist: sse-starlette; extra == "all"
  Requires-Dist: transformers; extra == "all"
  Requires-Dist: uvicorn; extra == "all"
@@ -102,8 +108,9 @@ Requires-Dist: iopath; extra == "all"
  Requires-Dist: omegaconf; extra == "all"
  Requires-Dist: open-clip-torch; extra == "all"
  Requires-Dist: opencv-python; extra == "all"
+ Requires-Dist: peft>=0.17; extra == "all"
  Requires-Dist: torchvision; extra == "all"
- Requires-Dist: bfcl-eval; extra == "all"
+ Requires-Dist: bfcl-eval==2025.6.16; extra == "all"
  Requires-Dist: human-eval; extra == "all"
  Requires-Dist: pytest; extra == "all"
  Requires-Dist: pytest-cov; extra == "all"
@@ -112,7 +119,7 @@ Provides-Extra: app
  Requires-Dist: gradio==5.4.0; extra == "app"
  Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
  Provides-Extra: dev
- Requires-Dist: bfcl-eval; extra == "dev"
+ Requires-Dist: bfcl-eval==2025.6.16; extra == "dev"
  Requires-Dist: human-eval; extra == "dev"
  Requires-Dist: pytest; extra == "dev"
  Requires-Dist: pytest-cov; extra == "dev"
@@ -141,6 +148,7 @@ Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
  Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
  Requires-Dist: mteb==1.38.20; extra == "rag"
  Requires-Dist: ragas==0.2.14; extra == "rag"
+ Requires-Dist: torch; extra == "rag"
  Requires-Dist: webdataset>0.2.0; extra == "rag"
  Provides-Extra: vlmeval
  Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
@@ -175,9 +183,9 @@ Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
  - [📝 Introduction](#-introduction)
  - [☎ User Groups](#-user-groups)
  - [🎉 News](#-news)
- - [🛠️ Installation](#️-installation)
-   - [Method 1: Install Using pip](#method-1-install-using-pip)
-   - [Method 2: Install from Source](#method-2-install-from-source)
+ - [🛠️ Environment Setup](#️-environment-setup)
+   - [Method 1. Install via pip](#method-1-install-via-pip)
+   - [Method 2. Install from source](#method-2-install-from-source)
  - [🚀 Quick Start](#-quick-start)
    - [Method 1. Using Command Line](#method-1-using-command-line)
    - [Method 2. Using Python Code](#method-2-using-python-code)
@@ -258,6 +266,15 @@ Please scan the QR code below to join our community groups:


  ## 🎉 News
+
+ > [!IMPORTANT]
+ > **Version 1.0 Refactoring**
+ >
+ > Version 1.0 introduces a major overhaul of the evaluation framework, establishing a new, more modular and extensible API layer under `evalscope/api`. Key improvements include standardized data models for benchmarks, samples, and results; a registry-based design for components such as benchmarks and metrics; and a rewritten core evaluator that orchestrates the new architecture. Existing benchmark adapters have been migrated to this API, resulting in cleaner, more consistent, and easier-to-maintain implementations.
+
+ - 🔥 **[2025.09.05]** Added support for vision-language multimodal model evaluation tasks, such as MathVista and MMMU. For more supported datasets, please [refer to the documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/vlm.html).
+ - 🔥 **[2025.09.04]** Added support for image editing task evaluation, including the [GEdit-Bench](https://modelscope.cn/datasets/stepfun-ai/GEdit-Bench) benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/image_edit.html).
+ - 🔥 **[2025.08.22]** Version 1.0 Refactoring. Break changes, please [refer to](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#switching-to-version-v1-0).
  - 🔥 **[2025.07.18]** The model stress testing now supports randomly generating image-text data for multimodal model evaluation. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#id4).
  - 🔥 **[2025.07.16]** Support for [τ-bench](https://github.com/sierra-research/tau-bench) has been added, enabling the evaluation of AI Agent performance and reliability in real-world scenarios involving dynamic user and tool interactions. For usage instructions, please refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/llm.html#bench).
  - 🔥 **[2025.07.14]** Support for "Humanity's Last Exam" ([Humanity's-Last-Exam](https://modelscope.cn/datasets/cais/hle)), a highly challenging evaluation benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/llm.html#humanity-s-last-exam).
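The "Version 1.0 Refactoring" note added in the hunk above centers on a registry-based design for benchmarks and metrics under the new `evalscope/api` package (see `evalscope/api/registry.py` and `evalscope/api/benchmark/` in the file list). The sketch below illustrates the general registry pattern such a design implies; the decorator, class, and method names are illustrative assumptions only and are not taken from the evalscope 1.0 API.

```python
# Illustrative sketch of a registry-based benchmark adapter. All names here
# (register_benchmark, GSM8KAdapter, get_benchmark) are assumptions for
# demonstration and may differ from the actual evalscope 1.0 API.
BENCHMARK_REGISTRY = {}

def register_benchmark(name):
    """Decorator that records an adapter class under a benchmark name."""
    def decorator(cls):
        BENCHMARK_REGISTRY[name] = cls
        return cls
    return decorator

@register_benchmark('gsm8k')
class GSM8KAdapter:
    """Adapter that turns raw dataset rows into standardized samples."""

    def load(self):
        # Read the dataset and yield samples in a standardized shape.
        return [{'input': 'Q: 2 + 2 = ?', 'target': '4'}]

    def score(self, prediction, target):
        # Exact-match scoring for this toy example.
        return float(prediction.strip() == target)

def get_benchmark(name):
    """Look up an adapter by name, as a registry-based evaluator would."""
    return BENCHMARK_REGISTRY[name]()

if __name__ == '__main__':
    adapter = get_benchmark('gsm8k')
    sample = adapter.load()[0]
    print(adapter.score('4', sample['target']))  # -> 1.0
```

In a design like this, adding a benchmark means registering a new adapter class rather than editing the core evaluator, which is consistent with the migration of adapters described in the note.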
@@ -265,16 +282,16 @@ Please scan the QR code below to join our community groups:
  - 🔥 **[2025.06.28]** Optimized custom dataset evaluation: now supports evaluation without reference answers. Enhanced LLM judge usage, with built-in modes for "scoring directly without reference answers" and "checking answer consistency with reference answers". See [reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#qa) for details.
  - 🔥 **[2025.06.19]** Added support for the [BFCL-v3](https://modelscope.cn/datasets/AI-ModelScope/bfcl_v3) benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
  - 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
- - 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
+ - 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/index.html).
  - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
  - 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
+ <details><summary>More</summary>
+
  - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
  - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
  - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
  - 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
- <details><summary>More</summary>
-
- - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
+ - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/index.html)
  - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
  - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
  - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
@@ -306,58 +323,87 @@ Please scan the QR code below to join our community groups:

  </details>

- ## 🛠️ Installation
- ### Method 1: Install Using pip
- We recommend using conda to manage your environment and installing dependencies with pip:
+ ## 🛠️ Environment Setup
+
+ ### Method 1. Install via pip
+
+ We recommend using conda to manage your environment and pip to install dependencies. This allows you to use the latest evalscope PyPI package.

  1. Create a conda environment (optional)
+ ```shell
+ # Python 3.10 is recommended
+ conda create -n evalscope python=3.10
+
+ # Activate the conda environment
+ conda activate evalscope
+ ```
+ 2. Install dependencies via pip
+ ```shell
+ pip install evalscope
+ ```
+ 3. Install additional dependencies (optional)
+ - To use model service inference benchmarking features, install the perf dependency:
  ```shell
- # It is recommended to use Python 3.10
- conda create -n evalscope python=3.10
- # Activate the conda environment
- conda activate evalscope
+ pip install 'evalscope[perf]'
  ```
-
- 2. Install dependencies using pip
+ - To use visualization features, install the app dependency:
+ ```shell
+ pip install 'evalscope[app]'
+ ```
+ - If you need to use other evaluation backends, you can install OpenCompass, VLMEvalKit, or RAGEval as needed:
  ```shell
- pip install evalscope # Install Native backend (default)
- # Additional options
- pip install 'evalscope[opencompass]' # Install OpenCompass backend
- pip install 'evalscope[vlmeval]' # Install VLMEvalKit backend
- pip install 'evalscope[rag]' # Install RAGEval backend
- pip install 'evalscope[perf]' # Install dependencies for the model performance testing module
- pip install 'evalscope[app]' # Install dependencies for visualization
- pip install 'evalscope[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+ pip install 'evalscope[opencompass]'
+ pip install 'evalscope[vlmeval]'
+ pip install 'evalscope[rag]'
+ ```
+ - To install all dependencies:
+ ```shell
+ pip install 'evalscope[all]'
  ```

- > [!WARNING]
- > As the project has been renamed to `evalscope`, for versions `v0.4.3` or earlier, you can install using the following command:
+ > [!NOTE]
+ > The project has been renamed to `evalscope`. For version `v0.4.3` or earlier, you can install it with:
  > ```shell
- > pip install llmuses<=0.4.3
+ > pip install llmuses<=0.4.3
  > ```
- > To import relevant dependencies using `llmuses`:
- > ``` python
+ > Then, import related dependencies using `llmuses`:
+ > ```python
  > from llmuses import ...
  > ```

- ### Method 2: Install from Source
- 1. Download the source code
- ```shell
- git clone https://github.com/modelscope/evalscope.git
- ```
+ ### Method 2. Install from source

+ Installing from source allows you to use the latest code and makes it easier for further development and debugging.
+
+ 1. Clone the source code
+ ```shell
+ git clone https://github.com/modelscope/evalscope.git
+ ```
  2. Install dependencies
- ```shell
- cd evalscope/
- pip install -e . # Install Native backend
- # Additional options
- pip install -e '.[opencompass]' # Install OpenCompass backend
- pip install -e '.[vlmeval]' # Install VLMEvalKit backend
- pip install -e '.[rag]' # Install RAGEval backend
- pip install -e '.[perf]' # Install Perf dependencies
- pip install -e '.[app]' # Install visualization dependencies
- pip install -e '.[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+ ```shell
+ cd evalscope/
+
+ pip install -e .
+ ```
+ 3. Install additional dependencies
+ - To use model service inference benchmarking features, install the perf dependency:
+ ```shell
+ pip install '.[perf]'
+ ```
+ - To use visualization features, install the app dependency:
+ ```shell
+ pip install '.[app]'
+ ```
+ - If you need to use other evaluation backends, you can install OpenCompass, VLMEvalKit, or RAGEval as needed:
+ ```shell
+ pip install '.[opencompass]'
+ pip install '.[vlmeval]'
+ pip install '.[rag]'
+ ```
+ - To install all dependencies:
+ ```shell
+ pip install '.[all]'
+ ```


  ## 🚀 Quick Start
@@ -444,7 +490,7 @@ run_task(task_cfg="config.json")

  ### Basic Parameter
  - `--model`: Specifies the `model_id` of the model in [ModelScope](https://modelscope.cn/), which can be automatically downloaded, e.g., [Qwen/Qwen2.5-0.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-0.5B-Instruct/summary); or use the local path of the model, e.g., `/path/to/model`
- - `--datasets`: Dataset names, supports inputting multiple datasets separated by spaces. Datasets will be automatically downloaded from modelscope. For supported datasets, refer to the [Dataset List](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
+ - `--datasets`: Dataset names, supports inputting multiple datasets separated by spaces. Datasets will be automatically downloaded from modelscope. For supported datasets, refer to the [Dataset List](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/index.html)
  - `--limit`: Maximum amount of evaluation data for each dataset. If not specified, it defaults to evaluating all data. Can be used for quick validation

  ### Output Results
@@ -533,7 +579,7 @@ For more customized evaluations, such as customizing model parameters or dataset
  evalscope eval \
  --model Qwen/Qwen3-0.6B \
  --model-args '{"revision": "master", "precision": "torch.float16", "device_map": "auto"}' \
- --generation-config '{"do_sample":true,"temperature":0.6,"max_new_tokens":512,"chat_template_kwargs":{"enable_thinking": false}}' \
+ --generation-config '{"do_sample":true,"temperature":0.6,"max_tokens":512,"chat_template_kwargs":{"enable_thinking": false}}' \
  --dataset-args '{"gsm8k": {"few_shot_num": 0, "few_shot_random": false}}' \
  --datasets gsm8k \
  --limit 10
@@ -547,7 +593,7 @@ evalscope eval \
  - `--generation-config`: Generation parameters, passed as a JSON string and parsed as a dictionary:
    - `do_sample`: Whether to use sampling
    - `temperature`: Generation temperature
-   - `max_new_tokens`: Maximum length of generated tokens
+   - `max_tokens`: Maximum length of generated tokens
    - `chat_template_kwargs`: Model inference template parameters
  - `--dataset-args`: Settings for the evaluation dataset, passed as a JSON string where the key is the dataset name and the value is the parameters. Note that these need to correspond one-to-one with the values in the `--datasets` parameter:
    - `few_shot_num`: Number of few-shot examples
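The parameters in the hunk above map directly onto a task configuration when evaluations are driven from Python via `run_task` (shown earlier in the README as `run_task(task_cfg="config.json")`). Below is a minimal sketch of such a configuration; it assumes `TaskConfig` accepts keyword arguments mirroring the CLI flags (`model`, `datasets`, `dataset_args`, `generation_config`, `limit`), which should be verified against the evalscope 1.0 documentation.

```python
# Minimal sketch: field names are assumed to mirror the CLI flags documented
# above and are not verified against the evalscope 1.0 API.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen3-0.6B',
    datasets=['gsm8k'],
    dataset_args={'gsm8k': {'few_shot_num': 0, 'few_shot_random': False}},
    generation_config={
        'do_sample': True,
        'temperature': 0.6,
        'max_tokens': 512,  # renamed from max_new_tokens in the 1.0 README
        'chat_template_kwargs': {'enable_thinking': False},
    },
    limit=10,  # evaluate only the first 10 samples for a quick validation run
)

run_task(task_cfg=task_cfg)
```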