evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0

evalscope/models/base_adapter.py
@@ -1,52 +0,0 @@
- import torch
- from abc import ABC, abstractmethod
- from typing import TYPE_CHECKING, Any, Optional, Union
-
- from evalscope.constants import EvalType
- from evalscope.models.custom import CustomModel
- from evalscope.models.local_model import LocalModel
-
- if TYPE_CHECKING:
-     from evalscope.config import TaskConfig
-
-
- class BaseModelAdapter(ABC):
-
-     def __init__(self, model: Optional[Union[LocalModel, CustomModel]], **kwargs):
-         if model is None:
-             self.model_cfg = kwargs.get('model_cfg', None)
-         elif isinstance(model, LocalModel):
-             self.model = model.model
-             self.model_id = model.model_id
-             self.model_revision = model.model_revision
-             self.device = model.device
-             self.tokenizer = model.tokenizer
-             self.model_cfg = model.model_cfg
-         elif isinstance(model, CustomModel):
-             self.model_cfg = model.config
-         else:
-             raise ValueError(f'Unsupported model type: {type(model)}')
-
-     @abstractmethod
-     @torch.no_grad()
-     def predict(self, *args, **kwargs) -> Any:
-         raise NotImplementedError
-
-
- def initialize_model_adapter(task_cfg: 'TaskConfig', model_adapter_cls: 'BaseModelAdapter', base_model: 'LocalModel'):
-     """Initialize the model adapter based on the task configuration."""
-     if task_cfg.dry_run:
-         from evalscope.models.model import DummyChatModel
-         return DummyChatModel(model_cfg=dict())
-     elif task_cfg.eval_type == EvalType.CUSTOM:
-         if not isinstance(task_cfg.model, CustomModel):
-             raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
-         from evalscope.models import CustomModelAdapter
-         return CustomModelAdapter(custom_model=task_cfg.model)
-     elif task_cfg.eval_type == EvalType.SERVICE:
-         from evalscope.models import ServerModelAdapter
-         return ServerModelAdapter(
-             api_url=task_cfg.api_url, model_id=task_cfg.model, api_key=task_cfg.api_key, seed=task_cfg.seed)
-     else:
-         return model_adapter_cls(
-             model=base_model, generation_config=task_cfg.generation_config, chat_template=task_cfg.chat_template)

evalscope/models/chat_adapter.py
@@ -1,138 +0,0 @@
- import os
- import time
- import torch
- from typing import Union
-
- from evalscope.models.base_adapter import BaseModelAdapter
- from evalscope.models.local_model import LocalModel
- from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
- from evalscope.utils.logger import get_logger
- from evalscope.utils.model_utils import fix_do_sample_warning
-
- logger = get_logger()
-
-
- class ChatGenerationModelAdapter(BaseModelAdapter):
-     """
-     Chat generation model adapter.
-     """
-
-     def __init__(self, model: LocalModel, **kwargs):
-         super().__init__(model)
-
-         self.generation_config = self._parse_generation_config(self.tokenizer, self.model)
-
-         custom_generation_config = kwargs.pop('generation_config', None)
-         custom_chat_template = kwargs.pop('chat_template', None)
-
-         if custom_generation_config:
-             logger.info('Updating generation config ...')
-             self.generation_config.update(**custom_generation_config)
-
-         if custom_chat_template:
-             self.tokenizer.chat_template = custom_chat_template
-             logger.info(f'Using custom chat template: {custom_chat_template}')
-
-     def _parse_generation_config(self, tokenizer, model):
-         from modelscope import GenerationConfig
-
-         generation_config = getattr(model, 'generation_config', GenerationConfig(do_sample=False))
-
-         try:
-             remote_config = GenerationConfig.from_pretrained(
-                 self.model_id, revision=self.model_revision, trust_remote_code=True)
-             generation_config.update(**remote_config.to_dict())
-         except Exception:
-             logger.warning(f'Failed to get generation config of {self.model_id} from model hub, use default.')
-
-         if isinstance(self.model_id, str) and os.path.exists(self.model_id):
-             logger.warning(f'Got local model dir: {self.model_id}')
-
-         if tokenizer.eos_token_id is not None:
-             generation_config.eos_token_id = tokenizer.eos_token_id
-         if tokenizer.pad_token_id is not None:
-             generation_config.pad_token_id = tokenizer.pad_token_id
-         if generation_config.max_new_tokens is None:
-             generation_config.max_new_tokens = 2048
-
-         return generation_config
-
-     def _model_generate(self, query: str, system_prompt: str = None, infer_cfg: dict = {}) -> str:
-         """
-         Args:
-             query: The input query.
-             system_prompt: The system prompt.
-             infer_cfg: The inference configuration.
-         Returns:
-             The prediction result.
-         """
-         # For chat model, use the chat template to format the input
-         if self.tokenizer.chat_template is not None:
-             messages = [ChatMessage(role='user', content=query)]
-             if system_prompt:
-                 messages = [ChatMessage(role='system', content=system_prompt)] + messages
-             formatted_prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-         else:
-             # For base model, use the query as the input
-             formatted_prompt = query
-
-         inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=True).to(self.device)
-         input_ids = inputs['input_ids']
-
-         # Process infer_cfg
-         if isinstance(infer_cfg.get('num_return_sequences'), int) and infer_cfg['num_return_sequences'] > 1:
-             infer_cfg['do_sample'] = True
-
-         # stop settings
-         stop = infer_cfg.get('stop', None)
-         eos_token_id = self.tokenizer.encode(stop, add_special_tokens=False)[0] \
-             if stop else self.tokenizer.eos_token_id
-
-         if eos_token_id is not None:
-             infer_cfg['eos_token_id'] = eos_token_id
-             infer_cfg['pad_token_id'] = eos_token_id # setting eos_token_id as pad token
-
-         self.generation_config.update(**infer_cfg)
-         fix_do_sample_warning(self.generation_config)
-
-         # Run inference
-         output_ids = self.model.generate(**inputs, generation_config=self.generation_config)
-
-         response = self.tokenizer.decode(output_ids[0, len(input_ids[0]):], skip_special_tokens=True)
-         return response
-
-     @torch.no_grad()
-     def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = {}) -> dict:
-         """
-         Args:
-             inputs: The input data.
-             infer_cfg: The inference configuration.
-         Returns:
-             The prediction result.
-         """
-
-         # Process inputs
-         if isinstance(inputs, str):
-             query = inputs
-             system_prompt = None
-         elif isinstance(inputs, dict):
-             query = inputs['data'][0]
-             system_prompt = inputs.get('system_prompt', None)
-         elif isinstance(inputs, list):
-             query = '\n'.join(inputs)
-             system_prompt = None
-         else:
-             raise TypeError(f'Unsupported inputs type: {type(inputs)}')
-
-         response = self._model_generate(query, system_prompt, infer_cfg)
-
-         choices_list = [
-             ChatCompletionResponseChoice(
-                 index=0, message=ChatMessage(content=response, role='assistant'), finish_reason='stop')
-         ]
-
-         res_d = ChatCompletionResponse(
-             model=self.model_id, choices=choices_list, object='chat.completion', created=int(time.time()),
-             usage=None).model_dump(exclude_unset=True)
-
-         return res_d

evalscope/models/choice_adapter.py
@@ -1,211 +0,0 @@
- import numpy as np
- import time
- import torch
- from typing import List
-
- from evalscope.models.base_adapter import BaseModelAdapter
- from evalscope.models.local_model import LocalModel
- from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
-
-
- class MultiChoiceModelAdapter(BaseModelAdapter):
-     """ The multi-choice model adapter. """
-
-     _DEFAULT_MAX_LENGTH = 2048
-
-     def __init__(self, model: LocalModel, **kwargs):
-         super().__init__(model)
-
-         self._max_length = kwargs.get('max_length')
-
-     @property
-     def max_length(self):
-         if self._max_length:
-             return self._max_length
-         seqlen_config_attrs = ('n_positions', 'max_position_embeddings', 'n_ctx')
-         for attr in seqlen_config_attrs:
-             if hasattr(self.model.config, attr):
-                 return getattr(self.model.config, attr)
-         if hasattr(self.tokenizer, 'model_max_length'):
-             if self.tokenizer.model_max_length == 1000000000000000019884624838656:
-                 return self._DEFAULT_MAX_LENGTH
-             return self.tokenizer.model_max_length
-         return self._DEFAULT_MAX_LENGTH
-
-     @torch.no_grad()
-     def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
-         """
-         Multi-choice model prediction func.
-
-         Args:
-             inputs (dict): The inputs for a doc. Format:
-                 {'data': [full_prompt], 'multi_choices': ['A', 'B', 'C', 'D']}
-
-             infer_cfg (dict): inference configuration.
-
-         Returns:
-             res (dict): The model prediction results. Format:
-             {
-               'choices': [
-                 {
-                   'index': 0,
-                   'message': {
-                     'content': [-14.9609, -13.6015, ...], # loglikelihood values for inputs context-continuation pairs.
-                     'role': 'assistant'
-                   }
-                 }
-               ],
-               'created': 1677664795,
-               # For models on the ModelScope or HuggingFace, concat model_id and revision with "-".
-               'model': 'gpt-3.5-turbo-0613',
-               'object': 'chat.completion',
-               'usage': {
-                 'completion_tokens': 17,
-                 'prompt_tokens': 57,
-                 'total_tokens': 74
-               }
-             }
-         """
-         infer_cfg = infer_cfg or {}
-         self.model.generation_config.update(**infer_cfg)
-
-         input_data = inputs['data']
-         multi_choices = inputs['multi_choices']
-
-         output, input_info = self._get_logits(self.tokenizer, self.model, input_data)
-         assert output.shape[0] == 1
-         logits = output.flatten()
-
-         choice_logits = [logits[self.tokenizer(ch)['input_ids'][-1:]] for ch in multi_choices]
-         softval = torch.nn.functional.softmax(torch.tensor(choice_logits).float(), dim=0)
-
-         if softval.dtype in {torch.bfloat16, torch.float16}:
-             softval = softval.to(dtype=torch.float32)
-         probs = softval.detach().cpu().numpy()
-         pred: str = multi_choices[int(np.argmax(probs))] # Format: A or B or C or D
-
-         res_d = ChatCompletionResponse(
-             model=self.model_id,
-             choices=[
-                 ChatCompletionResponseChoice(
-                     index=0, message=ChatMessage(content=pred, role='assistant'), finish_reason='stop')
-             ],
-             object='chat.completion',
-             created=int(time.time()),
-             usage=None).model_dump(exclude_unset=True)
-
-         return res_d
-
-     @staticmethod
-     def _get_logits(tokenizer, model, inputs: List[str]):
-         input_ids = tokenizer(inputs, padding=False)['input_ids']
-         input_ids = torch.tensor(input_ids, device=model.device)
-         tokens = {'input_ids': input_ids}
-
-         outputs = model(input_ids)['logits']
-         logits = outputs[:, -1, :]
-         log_probs = torch.nn.functional.softmax(logits, dim=-1)
-         return log_probs, {'tokens': tokens}
-
-
- class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
-     """
-     Continuation-logits model adapter.
-     """
-
-     def __init__(self, model: LocalModel, **kwargs):
-         super().__init__(model, **kwargs)
-
-     @torch.no_grad()
-     def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
-         """
-         Multi-choice model prediction func.
-         Args:
-             inputs (dict): The inputs for a doc. Format:
-                 {'data': [(context, continuation), ...]}
-             infer_cfg (dict): inference configuration.
-         Returns:
-             res (dict): The model prediction results. Format:
-             {
-               'choices': [
-                 {
-                   'index': 0,
-                   'message': {
-                     'content': [-14.9609, -13.6015, ...], # loglikelihood values for inputs context-continuation pairs.
-                     'role': 'assistant'
-                   }
-                 }
-               ],
-               'created': 1677664795,
-               # For models on the ModelScope or HuggingFace, concat model_id and revision with "-".
-               'model': 'gpt-3.5-turbo-0613',
-               'object': 'chat.completion',
-               'usage': {
-                 'completion_tokens': 17,
-                 'prompt_tokens': 57,
-                 'total_tokens': 74
-               }
-             }
-         """
-         infer_cfg = infer_cfg or {}
-
-         pred_list: list = self.loglikelihood(inputs=inputs['data'], infer_cfg=infer_cfg)
-
-         res_d = ChatCompletionResponse(
-             model=self.model_id,
-             choices=[{
-                 'index': 0,
-                 'message': {
-                     'content': pred_list,
-                     'role': 'assistant'
-                 }
-             }],
-             object='chat.completion',
-             created=int(time.time()),
-             usage=None).model_dump(exclude_unset=True)
-
-         return res_d
-
-     def loglikelihood(self, inputs: list, infer_cfg: dict = None) -> list:
-         self.model.generation_config.update(**infer_cfg)
-         # To predict one doc
-         doc_ele_pred = []
-         for ctx, continuation in inputs:
-
-             # ctx_enc shape: [context_tok_len] cont_enc shape: [continuation_tok_len]
-             ctx_enc, cont_enc = self._encode_pair(ctx, continuation)
-
-             inputs_tokens = torch.tensor(
-                 (ctx_enc.tolist() + cont_enc.tolist())[-(self.max_length + 1):][:-1],
-                 dtype=torch.long,
-                 device=self.model.device).unsqueeze(0)
-
-             logits = self.model(inputs_tokens)[0]
-             logits = torch.nn.functional.log_softmax(logits.float(), dim=-1)
-
-             logits = logits[:, -len(cont_enc):, :]
-             cont_enc = cont_enc.unsqueeze(0).unsqueeze(-1)
-             logits = torch.gather(logits.cpu(), 2, cont_enc.cpu()).squeeze(-1)
-
-             choice_score = float(logits.sum())
-             doc_ele_pred.append(choice_score)
-
-         # e.g. [-2.3, -9.2, -12.9, 1.1], length=len(choices)
-         return doc_ele_pred
-
-     def _encode_pair(self, context, continuation):
-         n_spaces = len(context) - len(context.rstrip())
-         if n_spaces > 0:
-             continuation = context[-n_spaces:] + continuation
-             context = context[:-n_spaces]
-
-         whole_enc = self.tokenizer(context + continuation, padding=False)['input_ids']
-         whole_enc = torch.tensor(whole_enc, device=self.device)
-
-         context_enc = self.tokenizer(context, padding=False)['input_ids']
-         context_enc = torch.tensor(context_enc, device=self.device)
-
-         context_enc_len = len(context_enc)
-         continuation_enc = whole_enc[context_enc_len:]
-
-         return context_enc, continuation_enc

evalscope/models/custom/__init__.py
@@ -1,3 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
-
- from evalscope.models.custom.custom_model import *

evalscope/models/custom/custom_model.py
@@ -1,53 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import torch
- from abc import ABC, abstractmethod
- from typing import Any, Dict, List, Union
-
-
- class CustomModel(ABC):
-
-     def __init__(self, config: dict, **kwargs):
-         self.config = config
-         self.kwargs = kwargs
-
-         if config.get('model_id', None) is None:
-             raise ValueError(f'**Error: model_id is required in config for CustomModel. Got config: {config}')
-
-     @abstractmethod
-     @torch.no_grad()
-     def predict(self, prompts: List[str], **kwargs) -> List[Dict[str, Any]]:
-         """
-         Model prediction function for batch inputs.
-
-         Args:
-             prompts (str): The input batch of prompts to predict.
-
-             **kwargs: kwargs
-
-         Returns:
-             res (dict): The model prediction results (batch). Format:
-             [
-               {
-                 'choices': [
-                   {
-                     'index': 0,
-                     'message': {
-                       'content': 'xxx',
-                       'role': 'assistant'
-                     }
-                   }
-                 ],
-                 'created': 1677664795,
-                 'model': 'gpt-3.5-turbo-0613', # should be model_id
-                 'object': 'chat.completion',
-                 'usage': {
-                   'completion_tokens': 17,
-                   'prompt_tokens': 57,
-                   'total_tokens': 74
-                 }
-               }
-               ,
-               ...
-             ]
-         """
-         raise NotImplementedError

evalscope/models/custom/dummy_model.py
@@ -1,63 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import os
- import time
-
- from evalscope.models.custom import CustomModel
- from evalscope.run import run_task
- from evalscope.utils.io_utils import yaml_to_dict
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
- """
- This script is used to rewrite the evaluation results without re-running the model predictions.
- """
-
-
- class DummyCustomModel(CustomModel):
-
-     def __init__(self, config: dict, **kwargs):
-         super(DummyCustomModel, self).__init__(config=config, **kwargs)
-
-     def predict(self, prompts: str, **kwargs):
-         # ONLY FOR DUMMY IMPLEMENTATION, DO NOT EDIT OR USE IN PRODUCTION.
-
-         response = 'The answer is C. NOTE: ONLY FOR TEST'
-
-         res_d: dict = {
-             'choices': [{
-                 'index': 0,
-                 'message': {
-                     # 'content': f'The answer is B. Raw prompt: {prompt}',
-                     'content': response,
-                     'role': 'assistant'
-                 }
-             }],
-             'created':
-             time.time(),
-             'model':
-             self.config.get('model_id'), # should be model_id
-             'object':
-             'chat.completion',
-             'usage': {
-                 'completion_tokens': 0,
-                 'prompt_tokens': 0,
-                 'total_tokens': 0
-             }
-         }
-
-         return [res_d for _ in prompts]
-
-
- if __name__ == '__main__':
-     # step1: If the outputs directory has been migrated, update the paths in configs/task_output_config.yaml under outputs/eval_xxx
-     # step2: Run this script; use_cache=True is used by default, so the eval results are refreshed without re-running inference
-
-     swift_model = DummyCustomModel(config={'model_id': 'swift-model-dummy'})
-
-     task_cfg_file = '/path/to/eval_your_model_results/configs/task_output_config.yaml'
-
-     task_cfg_d = yaml_to_dict(task_cfg_file)
-     task_cfg_d.update({'model': swift_model})
-
-     eval_results: dict = run_task(task_cfg=task_cfg_d)
-     print('** Evaluation results finished !\n')

evalscope/models/custom_adapter.py
@@ -1,67 +0,0 @@
- from typing import Any, Dict, List, Union
-
- from evalscope.models.base_adapter import BaseModelAdapter
- from evalscope.models.custom import CustomModel
-
-
- class CustomModelAdapter(BaseModelAdapter):
-
-     def __init__(self, custom_model: CustomModel, **kwargs):
-         """
-         Custom model adapter.
-
-         Args:
-             custom_model: The custom model instance.
-             **kwargs: Other args.
-         """
-         self.custom_model = custom_model
-         super(CustomModelAdapter, self).__init__(model=custom_model)
-
-     def predict(self, inputs: Union[str, dict, list], **kwargs) -> List[Dict[str, Any]]:
-         """
-         Model prediction func.
-
-         Args:
-             inputs (Union[str, dict, list]): The input data. Depending on the specific model.
-                 str: 'xxx'
-                 dict: {'data': [full_prompt]}
-                 list: ['xxx', 'yyy', 'zzz']
-             **kwargs: kwargs
-
-         Returns:
-             res (dict): The model prediction results. Format:
-             {
-               'choices': [
-                 {
-                   'index': 0,
-                   'message': {
-                     'content': 'xxx',
-                     'role': 'assistant'
-                   }
-                 }
-               ],
-               'created': 1677664795,
-               'model': 'gpt-3.5-turbo-0613', # should be model_id
-               'object': 'chat.completion',
-               'usage': {
-                 'completion_tokens': 17,
-                 'prompt_tokens': 57,
-                 'total_tokens': 74
-               }
-             }
-         """
-         in_prompts = []
-
-         # Note: here we assume the inputs are all prompts for the benchmark.
-         for input_prompt in inputs:
-             if isinstance(input_prompt, str):
-                 in_prompts.append(input_prompt)
-             elif isinstance(input_prompt, dict):
-                 # TODO: to be supported for continuation list like truthful_qa
-                 in_prompts.append(input_prompt['data'][0])
-             elif isinstance(input_prompt, list):
-                 in_prompts.append('\n'.join(input_prompt))
-             else:
-                 raise TypeError(f'Unsupported inputs type: {type(input_prompt)}')
-
-         return self.custom_model.predict(prompts=in_prompts, **kwargs)

evalscope/models/local_model.py
@@ -1,74 +0,0 @@
- import torch
- from typing import TYPE_CHECKING, Optional
-
- from evalscope.constants import DEFAULT_MODEL_CACHE_DIR, DEFAULT_MODEL_REVISION, EvalType
- from evalscope.utils.logger import get_logger
-
- if TYPE_CHECKING:
-     from evalscope.config import TaskConfig
-
- logger = get_logger()
-
-
- class LocalModel:
-
-     def __init__(self,
-                  model_id: str,
-                  model_revision: str = DEFAULT_MODEL_REVISION,
-                  device_map: str = 'auto',
-                  torch_dtype: str = 'auto',
-                  cache_dir: str = None,
-                  **kwargs):
-         from modelscope import AutoModelForCausalLM, AutoTokenizer
-
-         model_cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR
-
-         if isinstance(torch_dtype, str) and torch_dtype != 'auto':
-             torch_dtype = eval(torch_dtype)
-
-         self.model_id = model_id
-         self.model_revision = model_revision
-         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
-         self.tokenizer = AutoTokenizer.from_pretrained(
-             self.model_id,
-             revision=model_revision,
-             trust_remote_code=True,
-             cache_dir=model_cache_dir,
-         )
-
-         self.model = AutoModelForCausalLM.from_pretrained(
-             self.model_id,
-             revision=model_revision,
-             device_map=device_map,
-             trust_remote_code=True,
-             torch_dtype=torch_dtype,
-             cache_dir=model_cache_dir,
-         )
-
-         self.model_cfg = {
-             'model_id': model_id,
-             'device_map': device_map,
-             'torch_dtype': str(torch_dtype),
-         }
-
-
- def get_local_model(task_cfg: 'TaskConfig') -> Optional[LocalModel]:
-     """Get the base local model for the task. If the task is not checkpoint-based, return None.
-     Avoids loading model multiple times for different datasets.
-     """
-     if task_cfg.eval_type != EvalType.CHECKPOINT:
-         return None
-     else:
-         device_map = task_cfg.model_args.get('device_map', 'auto')
-         cache_dir = task_cfg.model_args.get('cache_dir', None)
-         model_precision = task_cfg.model_args.get('precision', 'torch.float16')
-         model_revision = task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION)
-
-         base_model = LocalModel(
-             model_id=task_cfg.model,
-             model_revision=model_revision,
-             device_map=device_map,
-             torch_dtype=model_precision,
-             cache_dir=cache_dir)
-         return base_model