evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope has been flagged as potentially problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/run.py CHANGED
@@ -13,9 +13,6 @@ from evalscope.utils.io_utils import OutputsStructure
 from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.utils.model_utils import seed_everything
 
-if TYPE_CHECKING:
-    from evalscope.models import LocalModel
-
 logger = get_logger()
 
 
@@ -109,27 +106,43 @@ def get_backend_manager_class(eval_backend: EvalBackend):
     raise NotImplementedError(f'Not implemented for evaluation backend {eval_backend}')
 
 
-def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
+def evaluate_model(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
     """Evaluate the model based on the provided task configuration."""
-    from evalscope.models import get_local_model
+    from evalscope.api.evaluator import Evaluator
+    from evalscope.api.model import get_model_with_task_config
+    from evalscope.api.registry import get_benchmark
+    from evalscope.evaluator import DefaultEvaluator
     from evalscope.report import gen_table
 
     # Initialize evaluator
     eval_results = {}
-    base_model = get_local_model(task_cfg)
-    evaluators = []
-    for dataset_name in task_cfg.datasets:
-        evaluator = create_evaluator(task_cfg, dataset_name, outputs, base_model)
+    # Initialize model
+    model = get_model_with_task_config(task_config=task_config)
+    # Initialize evaluators for each dataset
+    evaluators: List[Evaluator] = []
+    for dataset_name in task_config.datasets:
+        # Create evaluator for each dataset
+        benchmark = get_benchmark(dataset_name, task_config)
+        evaluator = DefaultEvaluator(
+            task_config=task_config,
+            model=model,
+            benchmark=benchmark,
+            outputs=outputs,
+        )
         evaluators.append(evaluator)
 
+        # Update task_config.dataset_args with benchmark metadata, except for DataCollection
+        if dataset_name != DataCollection.NAME:
+            task_config.dataset_args[dataset_name] = benchmark.to_dict()
+
     # dump task_cfg to outputs.configs_dir after creating evaluators
-    task_cfg.dump_yaml(outputs.configs_dir)
-    logger.info(task_cfg)
+    task_config.dump_yaml(outputs.configs_dir)
+    logger.info(task_config)
 
     # Run evaluation for each evaluator
     for evaluator in evaluators:
         res_dict = evaluator.eval()
-        eval_results[evaluator.dataset_name] = res_dict
+        eval_results[evaluator.benchmark.name] = res_dict
 
     # Make overall report
     try:
@@ -137,48 +150,21 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
         logger.info(f'Overall report table: \n{report_table} \n')
     except Exception:
         logger.error('Failed to generate report table.')
-
     # Clean up
-    if base_model is not None:
+    if model is not None:
         import gc
-        import torch
 
-        del base_model
+        del model
         del evaluators
-        torch.cuda.empty_cache()
         gc.collect()
 
-    return eval_results
-
+        from evalscope.utils.import_utils import check_import
+        if check_import('torch'):
+            import torch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
 
-def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure, base_model: 'LocalModel'):
-    """Create an evaluator object for the specified dataset."""
-    from evalscope.benchmarks import Benchmark, BenchmarkMeta
-    from evalscope.evaluator import Evaluator
-    from evalscope.models import initialize_model_adapter
-
-    benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
-
-    if dataset_name == DataCollection.NAME:
-        # EvaluatorCollection is a collection of evaluators
-        from evalscope.collections import EvaluatorCollection
-        data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
-        return EvaluatorCollection(task_cfg, data_adapter, outputs, base_model)
-
-    # Initialize data adapter first to update config
-    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
-    # Initialize model adapter
-    model_adapter = initialize_model_adapter(task_cfg, data_adapter, base_model)
-
-    # update task_cfg.dataset_args
-    task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()
-
-    return Evaluator(
-        data_adapter=data_adapter,
-        model_adapter=model_adapter,
-        outputs=outputs,
-        task_cfg=task_cfg,
-    )
+    return eval_results
 
 
 def main():
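For orientation, the per-dataset flow that replaces create_evaluator now runs entirely through the evalscope.api registry. A minimal sketch of that flow, mirroring only the calls visible in the hunks above (task_config and outputs are assumed to be prepared by the caller; signatures are taken from this diff, not verified against the released API):

    from evalscope.api.model import get_model_with_task_config
    from evalscope.api.registry import get_benchmark
    from evalscope.evaluator import DefaultEvaluator

    model = get_model_with_task_config(task_config=task_config)   # one model shared across datasets
    results = {}
    for dataset_name in task_config.datasets:
        benchmark = get_benchmark(dataset_name, task_config)       # registry lookup replaces Benchmark.get()
        evaluator = DefaultEvaluator(task_config=task_config, model=model, benchmark=benchmark, outputs=outputs)
        results[evaluator.benchmark.name] = evaluator.eval()       # results keyed by benchmark name, not dataset_name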
evalscope/summarizer.py CHANGED
@@ -80,7 +80,7 @@ class Summarizer:
 
                 summary_file_path = summary_files[0]
                 # Example: [{'dataset': 'gsm8k', 'version': '1d7fe4', 'metric': 'accuracy', 'mode': 'gen', 'qwen-7b-chat': '53.98'} # noqa: E501
-                summary_res: List[dict] = csv_to_list(file_path=summary_file_path)
+                summary_res: List[dict] = csv_to_list(summary_file_path)
                 final_res_list.extend(summary_res)
             elif eval_backend == EvalBackend.VLM_EVAL_KIT:
                 eval_config = Summarizer.parse_eval_config(candidate_task)
evalscope/third_party/toolbench_static/llm/swift_infer.py CHANGED
@@ -1,9 +1,5 @@
-import os
 from dataclasses import dataclass
-from swift.llm import InferEngine, InferRequest, PtEngine, RequestConfig, get_template
 
-# 设置GPU环境变量
-os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 
 @dataclass
 class SwiftInferArgs:
evalscope/utils/__init__.py CHANGED
@@ -7,9 +7,22 @@ from .import_utils import _LazyModule
 if TYPE_CHECKING:
     from .argument_utils import BaseArgument, get_supported_params, parse_int_or_float
     from .deprecation_utils import deprecated
+    from .function_utils import run_once, thread_safe
     from .import_utils import get_module_path, is_module_installed
-    from .io_utils import (OutputsStructure, csv_to_jsonl, csv_to_list, dict_to_yaml, gen_hash, get_latest_folder_path,
-                           get_valid_list, json_to_dict, jsonl_to_csv, jsonl_to_list, yaml_to_dict)
+    from .io_utils import (
+        OutputsStructure,
+        csv_to_jsonl,
+        csv_to_list,
+        dict_to_yaml,
+        gen_hash,
+        get_latest_folder_path,
+        get_valid_list,
+        json_to_dict,
+        jsonl_to_csv,
+        jsonl_to_list,
+        safe_filename,
+        yaml_to_dict,
+    )
     from .logger import configure_logging, get_logger
     from .model_utils import EvalBackend, dict_torch_dtype_to_str, fix_do_sample_warning, get_device, seed_everything
 
@@ -31,6 +44,10 @@ else:
             'is_module_installed',
             'get_module_path',
         ],
+        'function_utils': [
+            'thread_safe',
+            'run_once',
+        ],
         'io_utils': [
             'OutputsStructure',
             'csv_to_list',
@@ -44,6 +61,8 @@ else:
             'jsonl_to_list',
             'gen_hash',
             'get_valid_list',
+            'safe_filename',
+            'thread_safe',
         ],
         'deprecation_utils': [
             'deprecated',
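Since the new helpers are registered in the lazy-import table above, they resolve through the package root on first access. A small usage sketch (assuming the names are exported exactly as listed):

    from evalscope.utils import safe_filename, thread_safe   # resolved lazily via _LazyModule

    @thread_safe
    def save_report(title: str) -> str:
        # sanitize an arbitrary title into a filesystem-safe name
        return safe_filename(title) + '.json'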
evalscope/utils/chat_service.py CHANGED
@@ -1,6 +1,5 @@
 import os
 import time
-import torch
 from contextlib import contextmanager
 from functools import partial
 from pydantic import BaseModel, Field
@@ -95,6 +94,7 @@ class TextCompletionResponse(BaseModel):
 class ChatService:
 
     def __init__(self, model_path, attn_implementation):
+        import torch
         from modelscope import AutoModelForCausalLM, AutoTokenizer
         from transformers import TextIteratorStreamer
 
@@ -204,7 +204,8 @@ class ChatService:
 
     def _prepare_chat_inputs(self, request: ChatCompletionRequest):
         formatted_prompt = self.tokenizer.apply_chat_template(
-            request.messages, tokenize=False, add_generation_prompt=True)
+            request.messages, tokenize=False, add_generation_prompt=True
+        )
         inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=False).to(self.device)
         prompt_tokens = len(inputs['input_ids'][0])
         return formatted_prompt, inputs, prompt_tokens
evalscope/utils/deprecation_utils.py CHANGED
@@ -1,5 +1,6 @@
 import functools
 import inspect
+import os
 from typing import Callable, Optional
 
 from .logger import get_logger
@@ -22,7 +23,7 @@ def deprecated(since: str, remove_in: Optional[str] = None, alternative: Optiona
         @functools.wraps(func)
         def wrapper(*args, **kwargs):
             # Get the file name where the function is defined
-            file_name = inspect.getfile(func)
+            file_name = os.path.basename(inspect.getfile(func))
 
             # Construct the warning message
             warning_parts = [
@@ -40,3 +41,13 @@ def deprecated(since: str, remove_in: Optional[str] = None, alternative: Optiona
         return wrapper
 
     return decorator
+
+
+def deprecated_warning(logger, message: str):
+    """
+    Log a deprecation warning.
+
+    :param logger: Logger instance to log the warning
+    :param message: Warning message to log
+    """
+    logger.warning(f'Deprecated: {message}')
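A hedged usage sketch of the two helpers above; the keyword names for @deprecated follow the (truncated) signature shown in the hunk headers, and the example strings are purely illustrative:

    from evalscope.utils.deprecation_utils import deprecated, deprecated_warning
    from evalscope.utils.logger import get_logger

    logger = get_logger()
    deprecated_warning(logger, 'old_helper is deprecated; use new_helper instead')  # logs 'Deprecated: ...'

    @deprecated(since='1.0.0', remove_in='1.2.0', alternative='new_helper')  # assumed keyword usage
    def old_helper():
        pass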
evalscope/utils/function_utils.py ADDED
@@ -0,0 +1,29 @@
+import threading
+from functools import wraps
+
+
+def run_once(func):
+    """Decorator to ensure a function is only run once."""
+    has_run = False
+    result = None
+
+    def wrapper(*args, **kwargs):
+        nonlocal has_run, result
+        if not has_run:
+            result = func(*args, **kwargs)
+            has_run = True
+        return result
+
+    return wrapper
+
+
+def thread_safe(func):
+    """Thread-safe decorator for functions that need to be executed in a thread-safe manner."""
+    lock = threading.RLock()
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        with lock:
+            return func(*args, **kwargs)
+
+    return wrapper
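Both decorators are self-contained, so their behaviour follows directly from the code above. A minimal usage sketch:

    from evalscope.utils.function_utils import run_once, thread_safe

    @run_once
    def load_config() -> dict:
        print('loading...')               # printed only on the first call
        return {'seed': 42}

    @thread_safe
    def append_result(results: list, item) -> None:
        results.append(item)              # all calls serialized by a shared re-entrant lock

    assert load_config() is load_config() # the second call returns the cached result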
evalscope/utils/import_utils.py CHANGED
@@ -5,13 +5,35 @@ import importlib
 import os
 from itertools import chain
 from types import ModuleType
-from typing import Any
+from typing import Any, Optional, Union
 
 from .logger import get_logger
 
 logger = get_logger()  # pylint: disable=invalid-name
 
 
+def check_import(module_name: str, package: Optional[str] = None, raise_error: bool = False) -> bool:
+    """Check if a module can be imported.
+
+    Args:
+        module_name (str): The name of the module to check.
+        package (str, optional): The package to install if the module is not found. Defaults to None.
+        raise_error (bool, optional): Whether to raise an error if the module is not found. Defaults to False.
+    """
+    try:
+        importlib.import_module(module_name)
+        return True
+    except ImportError:
+        error_msg = f'`{module_name}` not found.'
+        if package:
+            error_msg += f' Please run `pip install {package}` to use this feature.'
+        logger.warning(error_msg)
+
+        if raise_error:
+            raise ImportError(error_msg)
+        return False
+
+
 class _LazyModule(ModuleType):
     """
     Module class that surfaces all objects but only performs associated imports when the objects are requested.
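check_import is what the reworked run.py uses to guard the optional torch cleanup. A small sketch of the soft and hard failure modes (the flash_attn / flash-attn names are illustrative placeholders only):

    from evalscope.utils.import_utils import check_import

    if check_import('torch'):       # soft check: returns False and logs a warning if torch is missing
        import torch

    # hard check: logs the warning and then raises ImportError with the pip hint
    check_import('flash_attn', package='flash-attn', raise_error=True)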
evalscope/utils/io_utils.py CHANGED
@@ -1,10 +1,13 @@
 import base64
 import csv
 import hashlib
+import io
 import json
 import jsonlines as jsonl
 import os
 import re
+import string
+import unicodedata
 import yaml
 from io import BytesIO
 from PIL import Image
@@ -33,7 +36,7 @@ class OutputsStructure:
         'configs_dir': None
     }
 
-    def _get_dir(self, attr_name, dir_name):
+    def _get_dir(self, attr_name, dir_name) -> str:
         if self._dirs[attr_name] is None:
             dir_path = os.path.join(self.outputs_dir, dir_name)
             if self.is_make:
@@ -72,10 +75,20 @@ def jsonl_to_list(jsonl_file):
     Returns:
         list: list of lines. Each line is a dict.
     """
-    res_list = []
-    with jsonl.open(jsonl_file, mode='r') as reader:
-        for line in reader.iter(type=dict, allow_none=True, skip_invalid=False):
-            res_list.append(line)
+    try:
+        res_list = []
+        with jsonl.open(jsonl_file, mode='r') as reader:
+            for line in reader.iter(type=dict, allow_none=True, skip_invalid=False):
+                res_list.append(line)
+    except Exception:
+        # Fallback to reading line by line
+        res_list = []
+        with open(jsonl_file, 'r', encoding='utf-8') as f:
+            for line in f:
+                if line.strip():  # Skip empty lines
+                    res_list.append(json.loads(line.strip()))
+    if not res_list:
+        logger.warning(f'No data found in {jsonl_file}.')
     return res_list
 
 
@@ -271,8 +284,131 @@ def get_valid_list(input_list, candidate_list):
         [i for i in input_list if i not in candidate_list]
 
 
-def PIL_to_base64(image: Image.Image, format: str = 'JPEG') -> str:
+def PIL_to_base64(image: Image.Image, format: str = 'JPEG', add_header: bool = False) -> str:
+    """
+    Convert a PIL Image to a base64 encoded string.
+
+    Args:
+        image (Image.Image): The PIL Image to convert.
+        format (str): The format to save the image in. Default is 'JPEG'.
+        add_header (bool): Whether to add the base64 header. Default is False.
+
+    Returns:
+        str: Base64 encoded string of the image.
+    """
     buffered = BytesIO()
     image.save(buffered, format=format)
     img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
+    if add_header:
+        img_str = f'data:image/{format.lower()};base64,{img_str}'
+    return img_str
+
+
+def bytes_to_base64(bytes_data: bytes, format: str = 'png', add_header: bool = False) -> str:
+    """Convert image bytes to a base64 encoded string.
+
+    Args:
+        bytes_data (bytes): The bytes to convert.
+        add_header (bool): Whether to add the base64 header. Default is False.
+
+    Returns:
+        str: Base64 encoded string of the bytes.
+    """
+    img_str = base64.b64encode(bytes_data).decode('utf-8')
+    if add_header:
+        img_str = f'data:image/{format};base64,{img_str}'
     return img_str
+
+
+def base64_to_PIL(base64_str):
+    """Convert a base64 encoded string to a PIL Image.
+
+    Args:
+        base64_str (str): The base64 encoded string.
+
+    Returns:
+        Image.Image: The decoded PIL Image.
+    """
+    # remove header
+    if ',' in base64_str:
+        base64_str = base64_str.split(',', 1)[1]
+
+    # decode
+    img_data = base64.b64decode(base64_str)
+    img_file = io.BytesIO(img_data)
+    img = Image.open(img_file)
+    return img
+
+
+def safe_filename(s: str, max_length: int = 255) -> str:
+    """
+    Convert a string into a safe filename by removing or replacing unsafe characters.
+
+    Args:
+        s (str): The input string to convert
+        max_length (int): Maximum length of the resulting filename (default 255)
+
+    Returns:
+        str: A safe filename string
+
+    Examples:
+        >>> safe_filename("Hello/World?.txt")
+        'Hello_World.txt'
+    """
+    # normalize unicode characters
+    s = unicodedata.normalize('NFKD', s)
+    s = s.encode('ASCII', 'ignore').decode('ASCII')
+
+    # remove or replace unsafe characters
+    # Keep only alphanumeric characters, dots, dashes, and underscores
+    safe_chars = string.ascii_letters + string.digits + '.-_'
+    s = ''.join(c if c in safe_chars else '_' for c in s)
+
+    # remove consecutive underscores
+    s = re.sub(r'_+', '_', s)
+
+    # remove leading/trailing periods and underscores
+    s = s.strip('._')
+
+    # handle empty string case
+    if not s:
+        s = 'untitled'
+
+    # handle starting with a period (hidden files)
+    if s.startswith('.'):
+        s = '_' + s
+
+    # enforce length limit
+    if len(s) > max_length:
+        # If we need to truncate, preserve the file extension if present
+        name, ext = os.path.splitext(s)
+        ext_len = len(ext)
+        if ext_len > 0:
+            max_name_length = max_length - ext_len
+            s = name[:max_name_length] + ext
+        else:
+            s = s[:max_length]
+
+    return s
+
+
+def convert_numpy_types(obj):
+    """Recursively convert numpy types to native Python types for JSON serialization."""
+    import numpy as np
+
+    if isinstance(obj, np.bool_):
+        return bool(obj)
+    elif isinstance(obj, np.integer):
+        return int(obj)
+    elif isinstance(obj, np.floating):
+        return float(obj)
+    elif isinstance(obj, np.ndarray):
+        return obj.tolist()
+    elif isinstance(obj, dict):
+        return {key: convert_numpy_types(value) for key, value in obj.items()}
+    elif isinstance(obj, list):
+        return [convert_numpy_types(item) for item in obj]
+    elif isinstance(obj, tuple):
+        return tuple(convert_numpy_types(item) for item in obj)
+    else:
+        return obj
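A brief sketch exercising the new io_utils helpers above; behaviour follows from the code shown and the sample values are illustrative only:

    import numpy as np
    from PIL import Image
    from evalscope.utils.io_utils import PIL_to_base64, base64_to_PIL, convert_numpy_types, safe_filename

    safe_filename('results/run 01')                                 # -> 'results_run_01'

    img = Image.new('RGB', (4, 4), 'red')
    data_uri = PIL_to_base64(img, format='PNG', add_header=True)    # 'data:image/png;base64,...'
    restored = base64_to_PIL(data_uri)                              # header is stripped before decoding

    convert_numpy_types({'acc': np.float32(0.5), 'ids': np.array([1, 2])})   # -> {'acc': 0.5, 'ids': [1, 2]}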