evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/models/local_model.py DELETED
@@ -1,128 +0,0 @@
- import importlib
- from abc import ABC, abstractmethod
- from typing import TYPE_CHECKING, Optional
-
- from evalscope.constants import DEFAULT_MODEL_CACHE_DIR, DEFAULT_MODEL_REVISION, EvalType, ModelTask
- from evalscope.utils.logger import get_logger
- from evalscope.utils.model_utils import get_device
-
- if TYPE_CHECKING:
-     from evalscope.config import TaskConfig
-
- logger = get_logger()
-
-
- class LocalModel(ABC):
-
-     def __init__(self,
-                  model_id: str,
-                  model_revision: str = None,
-                  device_map: str = None,
-                  torch_dtype: str = 'auto',
-                  cache_dir: str = None,
-                  **kwargs):
-
-         self.model_id = model_id
-         self.model_revision = model_revision or DEFAULT_MODEL_REVISION
-         self.device = device_map or get_device()
-         self.cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR
-         self.kwargs = kwargs
-         self.model = None
-         self.tokenizer = None
-
-         if isinstance(torch_dtype, str) and torch_dtype != 'auto':
-             import torch
-             torch_dtype = eval(torch_dtype)
-         self.torch_dtype = torch_dtype
-
-         self.model_cfg = {
-             'model_id': self.model_id,
-             'device_map': self.device,
-             'torch_dtype': str(self.torch_dtype),
-         }
-
-     @abstractmethod
-     def load_model(self):
-         pass
-
-
- class LocalChatModel(LocalModel):
-
-     def __init__(self, **kwargs):
-         super().__init__(**kwargs)
-
-     def load_model(self):
-         from modelscope import AutoModelForCausalLM, AutoTokenizer
-
-         logger.info(f'Loading model {self.model_id} ...')
-
-         self.tokenizer = AutoTokenizer.from_pretrained(
-             self.model_id,
-             revision=self.model_revision,
-             trust_remote_code=True,
-             cache_dir=self.cache_dir,
-         )
-
-         # Fix no padding
-         if self.tokenizer.pad_token is None:
-             self.tokenizer.pad_token = self.tokenizer.eos_token
-
-         self.model = AutoModelForCausalLM.from_pretrained(
-             self.model_id,
-             revision=self.model_revision,
-             device_map=self.device,
-             trust_remote_code=True,
-             torch_dtype=self.torch_dtype,
-             cache_dir=self.cache_dir,
-         )
-
-
- class LocalImageModel(LocalModel):
-
-     def __init__(self, **kwargs):
-         super().__init__(**kwargs)
-
-         self.pipeline_cls = self.kwargs.pop('pipeline_cls', None)
-         # default to DiffusionPipeline if not specified
-         if self.pipeline_cls is None:
-             if 'flux' in self.model_id.lower():
-                 self.pipeline_cls = 'FluxPipeline'
-             else:
-                 self.pipeline_cls = 'DiffusionPipeline'
-
-     def load_model(self):
-         # from modelscope import pipeline_cls
-         module = getattr(importlib.import_module('modelscope'), self.pipeline_cls)
-
-         logger.info(f'Loading model {self.model_id} with {self.pipeline_cls} ...')
-
-         self.model = module.from_pretrained(
-             self.model_id,
-             revision=self.model_revision,
-             torch_dtype=self.torch_dtype,
-             cache_dir=self.cache_dir,
-             **self.kwargs,
-         )
-
-         self.model.to(self.device)
-
-     def __call__(self, *args, **kwargs):
-         return self.model(*args, **kwargs)
-
-
- def get_local_model(task_cfg: 'TaskConfig') -> Optional[LocalModel]:
-     """Get the base local model for the task. If the task is not checkpoint-based, return None.
-     Avoids loading model multiple times for different datasets.
-     """
-     if task_cfg.eval_type != EvalType.CHECKPOINT:
-         return None
-     elif task_cfg.model_task == ModelTask.TEXT_GENERATION:
-         base_model = LocalChatModel(model_id=task_cfg.model, **task_cfg.model_args)
-         base_model.load_model()
-         return base_model
-     elif task_cfg.model_task == ModelTask.IMAGE_GENERATION:
-         base_model = LocalImageModel(model_id=task_cfg.model, **task_cfg.model_args)
-         base_model.load_model()
-         return base_model
-     else:
-         raise ValueError(f'Unsupported model task: {task_cfg.model_task} for model checkpoint.')
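This module is removed in 1.0.1, while the file list above shows new model entry points such as evalscope/models/modelscope.py and evalscope/models/model_apis.py being added. For orientation, a minimal sketch of how this 0.17.1 loader was typically driven, assuming a checkpoint-style TaskConfig that carries the eval_type, model_task and model_args fields read by get_local_model (illustrative only, not part of the diff):

    from evalscope.config import TaskConfig
    from evalscope.constants import EvalType, ModelTask
    from evalscope.models.local_model import get_local_model

    # Assumed 0.17.1-style configuration; the field names mirror those
    # accessed by get_local_model() in the deleted module above.
    task_cfg = TaskConfig(
        model='qwen/Qwen2.5-0.5B-Instruct',
        eval_type=EvalType.CHECKPOINT,
        model_task=ModelTask.TEXT_GENERATION,
    )
    local_model = get_local_model(task_cfg)  # loads and returns a LocalChatModel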
evalscope/models/register.py DELETED
@@ -1,41 +0,0 @@
- MODEL_ADAPTERS = {}
-
-
- def register_model_adapter(name):
-     """
-     Decorator to register a model adapter with a given name.
-     :param name: The name of the model adapter.
-     """
-
-     def decorator(adapter):
-         if name in MODEL_ADAPTERS:
-             raise ValueError(f"Model adapter '{name}' is already registered.")
-         MODEL_ADAPTERS[name] = adapter
-         return adapter
-
-     return decorator
-
-
- def get_model_adapter(name):
-     """
-     Retrieve a registered model adapter by name.
-     :param name: The name of the model adapter.
-     :return: The model adapter class or function.
-     """
-     if name not in MODEL_ADAPTERS:
-         raise ValueError(
-             f"Model adapter '{name}' is not registered. Available model adapters: {list(MODEL_ADAPTERS.keys())}")
-     return MODEL_ADAPTERS[name]
-
-
- def register_model_adapter_class(cls, name=None):
-     """
-     Register a model adapter class.
-     :param cls: The model adapter class to register
-     :param name: Optional name for the model adapter. If not provided, the class name will be used.
-     """
-     if name is None:
-         name = cls.__name__
-     if name in MODEL_ADAPTERS:
-         raise ValueError(f"Model adapter class '{name}' is already registered.")
-     MODEL_ADAPTERS[name] = cls
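Removing this module drops the 0.17.1 decorator-based adapter registry; per the file list, evalscope/api/registry.py is added in 1.0.1. A minimal sketch of how the removed functions were used (illustrative; the adapter name and class below are hypothetical):

    from evalscope.models.register import register_model_adapter, get_model_adapter

    @register_model_adapter('my_chat_adapter')  # hypothetical adapter name
    class MyChatAdapter:
        # Placeholder adapter registered under 'my_chat_adapter'.
        pass

    adapter_cls = get_model_adapter('my_chat_adapter')
    assert adapter_cls is MyChatAdapter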
tests/cli/test_run.py DELETED
@@ -1,489 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- from dotenv import dotenv_values
-
- from tests.utils import test_level_list
-
- env = dotenv_values('.env')
-
- import os
- import subprocess
- import unittest
-
- from evalscope.config import TaskConfig
- from evalscope.constants import EvalStage, EvalType, JudgeStrategy, OutputType
- from evalscope.run import run_task
- from evalscope.utils.import_utils import is_module_installed
- from evalscope.utils.logger import get_logger
-
- os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
-
- logger = get_logger()
-
-
- class TestRun(unittest.TestCase):
-
-     def setUp(self) -> None:
-         logger.info('Init env for evalscope native run UTs ...\n')
-         self._check_env('evalscope')
-
-     def tearDown(self) -> None:
-         pass
-
-     @staticmethod
-     def _check_env(module_name: str):
-         if is_module_installed(module_name):
-             logger.info(f'{module_name} is installed.')
-         else:
-             raise ModuleNotFoundError(f'run: pip install {module_name}')
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_simple_eval(self):
-         model = 'qwen/Qwen2-0.5B-Instruct'
-         datasets = 'arc'  # arc ceval
-         limit = 10
-
-         cmd_simple = f'evalscope eval ' \
-                      f'--model {model} ' \
-                      f'--datasets {datasets} ' \
-                      f'--limit {limit}'
-
-         logger.info(f'Start to run command: {cmd_simple}')
-         run_res = subprocess.run(cmd_simple, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-
-         assert run_res.returncode == 0, f'Failed to run command: {cmd_simple}'
-         logger.info(f'>>test_run_simple_eval stdout: {run_res.stdout}')
-         logger.error(f'>>test_run_simple_eval stderr: {run_res.stderr}')
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_eval_with_args(self):
-         model = 'qwen/Qwen2-0.5B-Instruct'
-         datasets = 'arc'  # arc ceval
-         limit = 5
-         dataset_args = '{"ceval": {"few_shot_num": 0, "few_shot_random": false}}'
-
-         cmd_with_args = f'evalscope eval ' \
-                         f'--model {model} ' \
-                         f'--datasets {datasets} ' \
-                         f'--limit {limit} ' \
-                         f'--generation-config do_sample=true,temperature=0.6,max_length=65535,max_new_tokens=65535,max_tokens=65535,n=1,top_p=0.95,top_k=20 ' \
-                         f"""--dataset-args \'{dataset_args}\' """
-
-         logger.info(f'Start to run command: {cmd_with_args}')
-         run_res = subprocess.run(cmd_with_args, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-
-         assert run_res.returncode == 0, f'Failed to run command: {cmd_with_args}'
-         logger.info(f'>>test_run_eval_with_args stdout: {run_res.stdout}')
-         logger.error(f'>>test_run_eval_with_args stderr: {run_res.stderr}')
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_yaml_config(self):
-         from evalscope import run_task
-
-         run_task(task_cfg='examples/tasks/eval_native.yaml')
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_task(self):
-         task_cfg = TaskConfig(
-             model='qwen/Qwen2.5-0.5B-Instruct',
-             datasets=[
-                 'iquiz',
-                 # 'ifeval',
-                 # 'mmlu',
-                 # 'mmlu_pro',
-                 # 'musr',
-                 # 'process_bench',
-                 # 'race',
-                 # 'trivia_qa',
-                 # 'cmmlu',
-                 # 'humaneval',
-                 # 'super_gpqa',
-                 # 'gsm8k',
-                 # 'bbh',
-                 # 'competition_math',
-                 # 'math_500',
-                 'aime24',
-                 'gpqa',
-                 # 'arc',
-                 # 'ceval',
-                 # 'hellaswag',
-                 # 'general_mcq',
-                 # 'general_qa'
-             ],
-             dataset_args={
-                 'mmlu': {
-                     'subset_list': ['elementary_mathematics'],
-                     'few_shot_num': 0
-                 },
-                 'mmlu_pro': {
-                     'subset_list': ['math', 'health'],
-                     'few_shot_num': 4
-                 },
-                 'ceval': {
-                     'subset_list': [
-                         'computer_network', 'operating_system', 'computer_architecture'
-                     ],
-                     'few_shot_num': 0
-                 },
-                 'cmmlu': {
-                     'subset_list': ['elementary_chinese'],
-                     'few_shot_num': 0
-                 },
-                 'bbh': {
-                     'subset_list': ['word_sorting', 'movie_recommendation'],
-                 },
-                 'gpqa': {
-                     'subset_list': ['gpqa_diamond'],
-                     'few_shot_num': 0
-                 },
-                 'humaneval': {
-                     'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
-                 },
-                 'competition_math': {
-                     'subset_list': ['Level 1']
-                 },
-                 'process_bench': {
-                     'subset_list': ['gsm8k'],
-                 },
-                 'musr': {
-                     'subset_list': ['murder_mysteries'],
-                 },
-                 'general_mcq': {
-                     'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
-                     'subset_list': [
-                         'example'  # evaluation dataset name, the * in the *_dev.csv above
-                     ],
-                     'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
-                 },
-                 'general_qa': {
-                     'local_path': 'custom_eval/text/qa',  # path to the custom dataset
-                     'subset_list': [
-                         'example',  # evaluation dataset name, the * in the *_dev.csv above
-                         # 'test'
-                     ],
-                     'metric_list': ['AverageBLEU']
-                 },
-                 'super_gpqa': {
-                     'subset_list': ['Philosophy', 'Education'],
-                     'few_shot_num': 0
-                 },
-                 'ifeval': {
-                     'filters': {
-                         'remove_until': '</think>'
-                     }
-                 }
-             },
-             limit=2,
-             eval_batch_size=2,
-             generation_config={
-                 'max_new_tokens': 2048,
-                 'temperature': 0.7,
-                 'num_return_sequences': 1,
-             },
-             # debug=True
-         )
-         run_task(task_cfg=task_cfg)
-
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_one_task(self):
-         from evalscope.config import TaskConfig
-
-         task_cfg = TaskConfig(
-             model='Qwen/Qwen3-1.7B',
-             datasets=[
-                 # 'iquiz',
-                 # 'math_500',
-                 # 'aime24',
-                 # 'competition_math',
-                 # 'mmlu',
-                 # 'simple_qa',
-                 'truthful_qa',
-             ],
-             dataset_args={
-                 'competition_math': {
-                     'subset_list': ['Level 4', 'Level 5']
-                 },
-                 'mmlu': {
-                     'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
-                     'few_shot_num': 0
-                 },
-             },
-             limit=5,
-             eval_batch_size=5,
-             generation_config={
-                 'max_new_tokens': 1000,  # maximum number of new tokens; use a large value to avoid truncated output
-                 'temperature': 0.7,  # sampling temperature (recommended in the Qwen report)
-                 'top_p': 0.8,  # top-p sampling (recommended in the Qwen report)
-                 'top_k': 20,  # top-k sampling (recommended in the Qwen report)
-                 'chat_template_kwargs': {'enable_thinking': False}  # disable thinking mode
-             },
-             judge_strategy=JudgeStrategy.AUTO,
-         )
-
-         run_task(task_cfg=task_cfg)
-
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_task_loop(self):
-         os.environ['CUDA_VISIBLE_DEVICES'] = '2'
-         from evalscope.config import TaskConfig
-
-         task_cfg1 = TaskConfig(
-             model='Qwen/Qwen2.5-0.5B-Instruct',
-             model_id='model1',
-             datasets=['iquiz'],
-             limit=10
-         )
-         task_cfg2 = TaskConfig(
-             model='Qwen/Qwen2.5-0.5B-Instruct',
-             model_id='model2',
-             datasets=['iquiz'],
-             limit=10
-         )
-         task_cfg3 = TaskConfig(
-             model='Qwen/Qwen2.5-0.5B-Instruct',
-             model_id='model3',
-             datasets=['iquiz'],
-             limit=10
-         )
-
-         run_task(task_cfg=[task_cfg1, task_cfg2, task_cfg3])
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_server_model(self):
-         from evalscope.config import TaskConfig
-
-         task_cfg = TaskConfig(
-             model='qwen-plus',
-             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-             api_key=env.get('DASHSCOPE_API_KEY'),
-             eval_type=EvalType.SERVICE,
-             datasets=[
-                 # 'iquiz',
-                 # 'ifeval',
-                 # 'mmlu',
-                 # 'mmlu_pro',
-                 # 'musr',
-                 # 'process_bench',
-                 # 'race',
-                 'trivia_qa',
-                 # 'cmmlu',
-                 # 'humaneval',
-                 # 'gsm8k',
-                 # 'bbh',
-                 # 'competition_math',
-                 # 'math_500',
-                 # 'aime24',
-                 # 'gpqa',
-                 # 'arc',
-                 # 'ceval',
-                 # 'hellaswag',
-                 # 'general_mcq',
-                 # 'general_qa',
-                 # 'super_gpqa',
-                 # 'mmlu_redux',
-                 # 'maritime_bench',
-                 # 'drop',
-                 # 'winogrande',
-                 # 'tool_bench',
-                 # 'frames',
-                 # 'bfcl_v3',
-                 # 'truthful_qa',
-                 # 'tau_bench',
-                 # 'hle'
-             ],
-             dataset_args={
-                 'mmlu': {
-                     'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
-                     'few_shot_num': 0
-                 },
-                 'mmlu_pro': {
-                     'subset_list': ['math', 'health'],
-                     'few_shot_num': 0
-                 },
-                 'ceval': {
-                     'subset_list': [
-                         'computer_network', 'operating_system', 'computer_architecture'
-                     ],
-                     'few_shot_num': 0
-                 },
-                 'cmmlu': {
-                     'subset_list': ['elementary_chinese'],
-                     'few_shot_num': 0
-                 },
-                 'bbh': {
-                     'subset_list': ['word_sorting', 'movie_recommendation'],
-                 },
-                 'gpqa': {
-                     # 'subset_list': ['gpqa_diamond'],
-                     'few_shot_num': 0,
-                     'local_path': './data/data/gpqa',
-                 },
-                 'humaneval': {
-                     'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
-                 },
-                 'competition_math': {
-                     'subset_list': ['Level 1']
-                 },
-                 'process_bench': {
-                     'subset_list': ['gsm8k'],
-                 },
-                 'musr': {
-                     'subset_list': ['murder_mysteries'],
-                 },
-                 'general_mcq': {
-                     'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
-                     'subset_list': [
-                         'example'  # evaluation dataset name, the * in the *_dev.csv above
-                     ],
-                     'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
-                 },
-                 'general_qa': {
-                     'local_path': 'custom_eval/text/qa',  # path to the custom dataset
-                     'subset_list': [
-                         'example',  # evaluation dataset name, the * in the *_dev.csv above
-                         # 'test'
-                     ],
-                     'metric_list': ['AverageRouge']
-                 },
-                 'super_gpqa': {
-                     'subset_list': ['Philosophy', 'Education'],
-                     'few_shot_num': 0
-                 },
-                 'mmlu_redux': {
-                     'subset_list': ['abstract_algebra']
-                 },
-                 'frames': {
-                     'local_path': 'data/iic/frames',
-                 },
-                 'bfcl_v3': {
-                     'subset_list': ['parallel'],
-                     'extra_params': {
-                         # 'is_fc_model': False,
-                     }
-                 },
-                 'tau_bench': {
-                     'extra_params': {
-                         'user_model': 'qwen-plus',
-                         'api_key': env.get('DASHSCOPE_API_KEY'),
-                         'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                     }
-                 },
-                 'hle': {
-                     'subset_list': ['Math', 'Other'],
-                 },
-             },
-             eval_batch_size=10,
-             limit=10,
-             # debug=True,
-             stream=True,
-             generation_config={
-                 'temperature': 0.6,
-                 'n': 1,
-                 'max_tokens': 4096,
-                 # 'extra_headers':{'key': 'value'},
-             },
-             ignore_errors=False,
-         )
-
-         run_task(task_cfg=task_cfg)
-
-
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_judge_model(self):
-         from evalscope.config import TaskConfig
-
-         task_cfg = TaskConfig(
-             model='qwen-plus',
-             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-             api_key=env.get('DASHSCOPE_API_KEY'),
-             eval_type=EvalType.SERVICE,
-             datasets=[
-                 # 'math_500',
-                 # 'aime24',
-                 # 'competition_math',
-                 # 'arc',
-                 # 'gsm8k',
-                 # 'truthful_qa',
-                 # 'simple_qa',
-                 # 'chinese_simpleqa',
-                 # 'live_code_bench',
-                 # 'humaneval',
-                 # 'general_qa',
-                 # 'alpaca_eval',
-                 # 'arena_hard',
-                 # 'frames',
-                 # 'docmath',
-                 # 'needle_haystack',
-                 # 'ifeval',
-                 'hle'
-             ],
-             dataset_args={
-                 'needle_haystack': {
-                     'subset_list': ['english'],
-                     'extra_params': {
-                         'show_score': True,
-                     }
-                 },
-                 'competition_math': {
-                     'subset_list': ['Level 4']
-                 },
-                 'live_code_bench': {
-                     'extra_params': {
-                         'start_date': '2024-08-01',
-                         'end_date': '2025-02-28'
-                     },
-                     'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/code_generation_lite'
-                 },
-                 'general_qa': {
-                     'local_path': 'custom_eval/text/qa',  # path to the custom dataset
-                     'subset_list': [
-                         'example',  # evaluation dataset name, the * in the *_dev.csv above
-                         # 'test'
-                     ]
-                 },
-                 'chinese_simpleqa': {
-                     'subset_list': [
-                         '中华文化'
-                     ]
-                 },
-                 'frames': {
-                     'local_path': '/root/.cache/modelscope/hub/datasets/iic/frames'
-                 },
-                 'hle': {
-                     'subset_list': ['Math', 'Other'],
-                 },
-             },
-             eval_batch_size=10,
-             limit=3,
-             judge_strategy=JudgeStrategy.LLM,
-             judge_worker_num=5,
-             judge_model_args={
-                 'model_id': 'qwen2.5-72b-instruct',
-                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                 'api_key': env.get('DASHSCOPE_API_KEY'),
-                 'generation_config': {
-                     'temperature': 0.0,
-                     'max_tokens': 4096
-                 }
-             },
-             generation_config={
-                 'max_new_tokens': 20000,
-                 'temperature': 0.0,
-                 'seed': 42,
-                 'n': 1
-             },
-             timeout=60000,
-             stream=True,
-             use_cache='outputs/20250714_150626'
-             # analysis_report=True,
-             # debug=True,
-             # use_cache='outputs/20250616_161931'
-         )
-
-         run_task(task_cfg=task_cfg)
-
-
- if __name__ == '__main__':
-     unittest.main()