evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/config.py CHANGED
@@ -1,85 +1,253 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
+# flake8: noqa: E501
 import copy
-import json
 import os
 from argparse import Namespace
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union
 
-from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType
-from evalscope.models.custom import CustomModel
-from evalscope.utils import gen_hash
-from evalscope.utils.io_utils import dict_to_yaml, json_to_dict, yaml_to_dict
+from evalscope.api.model import GenerateConfig, Model, ModelAPI
+from evalscope.constants import (
+    DEFAULT_DATASET_CACHE_DIR,
+    DEFAULT_WORK_DIR,
+    EvalBackend,
+    EvalType,
+    HubType,
+    JudgeStrategy,
+    ModelTask,
+)
+from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
+from evalscope.utils.deprecation_utils import deprecated_warning
+from evalscope.utils.import_utils import check_import
+from evalscope.utils.io_utils import dict_to_yaml, gen_hash, safe_filename
 from evalscope.utils.logger import get_logger
+from evalscope.version import __version__ as evalscope_version
 
 logger = get_logger()
 
-cur_path = os.path.dirname(os.path.abspath(__file__))
-
-DEFAULT_MODEL_ARGS = {'revision': 'master', 'precision': 'torch.float16', 'device': 'auto'}
-DEFAULT_GENERATION_CONFIG = {
-    'max_length': 2048,
-    'max_new_tokens': 512,
-    'do_sample': False,
-    'top_k': 50,
-    'top_p': 1.0,
-    'temperature': 1.0,
-}
-
 
 @dataclass
-class TaskConfig:
+class TaskConfig(BaseArgument):
     # Model-related arguments
-    model: Union[str, 'CustomModel', None] = None
+    model: Optional[Union[str, Model, ModelAPI]] = None
+    """The model to be evaluated. Can be a string path, Model object, or ModelAPI object."""
+
     model_id: Optional[str] = None
-    model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})
+    """Unique identifier for the model. Auto-generated from model name if not provided."""
+
+    model_args: Dict = field(default_factory=dict)
+    """Additional arguments to pass to the model during initialization."""
+
+    model_task: str = ModelTask.TEXT_GENERATION
+    """The type of task the model performs (e.g., text generation, image generation)."""
 
     # Template-related arguments
-    template_type: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
     chat_template: Optional[str] = None
+    """Chat template to use for formatting conversations with the model."""
 
     # Dataset-related arguments
     datasets: List[str] = field(default_factory=list)
+    """List of dataset names to evaluate the model on."""
+
     dataset_args: Dict = field(default_factory=dict)
+    """Additional arguments to pass to datasets during loading."""
+
     dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
+    """Directory where datasets are cached locally."""
+
     dataset_hub: str = HubType.MODELSCOPE
+    """Hub platform to download datasets from (e.g., ModelScope, HuggingFace)."""
+
+    repeats: int = 1
+    """Number of times to repeat the dataset items for k-metrics evaluation."""
 
     # Generation configuration arguments
-    generation_config: Optional[Dict] = field(default_factory=lambda: DEFAULT_GENERATION_CONFIG | {})
+    generation_config: Union[Dict, GenerateConfig] = field(default_factory=dict)
+    """Configuration parameters for text/image generation."""
 
     # Evaluation-related arguments
     eval_type: str = EvalType.CHECKPOINT
+    """Type of evaluation: checkpoint, service, or mock."""
+
     eval_backend: str = EvalBackend.NATIVE
+    """Backend framework to use for evaluation."""
+
     eval_config: Union[str, Dict, None] = None
-    stage: str = EvalStage.ALL
-    limit: Optional[int] = None
+    """Additional evaluation configuration parameters."""
+
+    limit: Optional[Union[int, float]] = None
+    """Maximum number of samples to evaluate. Can be int (count) or float (fraction)."""
+
+    eval_batch_size: int = 1
+    """Batch size for evaluation processing."""
 
     # Cache and working directory arguments
-    mem_cache: bool = False  # Deprecated, will be removed in v1.0.0.
     use_cache: Optional[str] = None
+    """Whether to use cached results and which cache strategy to apply."""
+
+    rerun_review: bool = False
+    """Whether to rerun the review process even if results exist."""
+
     work_dir: str = DEFAULT_WORK_DIR
-    outputs: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
+    """Working directory for storing evaluation results and temporary files."""
 
     # Debug and runtime mode arguments
+    ignore_errors: bool = False
+    """Whether to continue evaluation when encountering errors."""
+
     debug: bool = False
-    dry_run: bool = False
+    """Enable debug mode for detailed logging and error reporting."""
+
     seed: Optional[int] = 42
-    api_url: Optional[str] = None  # Only used for server model
-    api_key: Optional[str] = 'EMPTY'  # Only used for server model
+    """Random seed for reproducible results."""
+
+    api_url: Optional[str] = None
+    """API endpoint URL for server-based model evaluation."""
+
+    api_key: Optional[str] = 'EMPTY'
+    """API key for authenticating with server-based models."""
+
+    timeout: Optional[float] = None
+    """Request timeout in seconds for server-based models."""
+
+    stream: Optional[bool] = None
+    """Whether to use streaming responses for server-based models."""
+
+    # LLMJudge arguments
+    judge_strategy: str = JudgeStrategy.AUTO
+    """Strategy for LLM-based judgment (auto, single, pairwise)."""
+
+    judge_worker_num: int = 1
+    """Number of worker processes for parallel LLM judging."""
+
+    judge_model_args: Optional[Dict] = field(default_factory=dict)
+    """Additional arguments for the judge model configuration."""
+
+    analysis_report: bool = False
+    """Whether to generate detailed analysis reports after evaluation."""
+
+    # Sandbox configuration arguments
+    use_sandbox: bool = False
+    """Whether to execute code in a sandboxed environment."""
+
+    sandbox_type: Optional[str] = 'docker'
+    """Type of sandbox environment for code execution (e.g., docker). Default is 'docker'."""
+
+    sandbox_manager_config: Optional[Dict] = field(default_factory=dict)
+    """Configuration for the sandbox manager. Default is local manager. If url is provided, it will use remote manager."""
+
+    sandbox_config: Optional[Dict] = field(default_factory=dict)
+    """Configuration for sandboxed code execution environments."""
+
+    evalscope_version: Optional[str] = evalscope_version
+    """EvalScope version used for the evaluation."""
 
     def __post_init__(self):
-        if (not self.model_id) and self.model:
-            if isinstance(self.model, CustomModel):
-                self.model_id = type(self.model).__name__
+        self.__init_model_and_id()
+
+        self.__init_eval_data_config()
+
+        # Set default generation_config and model_args
+        self.__init_default_generation_config()
+        self.__init_default_model_args()
+        self.__init_default_sandbox_config()
+
+    def __init_model_and_id(self):
+        # Set model to DummyCustomModel if not provided
+        if self.model is None:
+            self.model = self.model_task
+            self.eval_type = EvalType.MOCK_LLM
+
+        # Set model_id if not provided
+        if not self.model_id:
+            if isinstance(self.model, str):
+                self.model_id = safe_filename(os.path.basename(self.model))
+            elif isinstance(self.model, Model):
+                self.model_id = safe_filename(self.model.name)
+            elif isinstance(self.model, ModelAPI):
+                self.model_id = safe_filename(self.model.model_name)
             else:
-                self.model_id = os.path.basename(self.model).rstrip(os.sep)
-
-    def to_dict(self):
-        return self.__dict__
+                self.model_id = 'dummy_model'
+
+    def __init_eval_data_config(self):
+        # Post process limit
+        if self.limit is not None:
+            self.limit = parse_int_or_float(self.limit)
+
+    def __init_default_generation_config(self):
+        if not self.generation_config:
+            if self.model_task == ModelTask.IMAGE_GENERATION:
+                self.generation_config = {
+                    'height': 1024,
+                    'width': 1024,
+                    'num_inference_steps': 50,
+                    'guidance_scale': 9.0,
+                }
+                if self.eval_batch_size != 1:
+                    logger.warning(
+                        'For image generation task, we only support eval_batch_size=1 for now, changed to 1.'
+                    )
+                    self.eval_batch_size = 1
+            elif self.model_task == ModelTask.TEXT_GENERATION:
+                if self.eval_type == EvalType.CHECKPOINT:
+                    self.generation_config = {
+                        'max_tokens': 2048,
+                        'do_sample': False,
+                        'top_k': 50,
+                        'top_p': 1.0,
+                        'temperature': 1.0,
+                        'n': 1,
+                    }
+                elif self.eval_type == EvalType.SERVICE:
+                    self.generation_config = {
+                        'temperature': 0.0,
+                    }
+        if isinstance(self.generation_config, dict):
+            self.generation_config = GenerateConfig.model_validate(self.generation_config)
+
+        # Set eval_batch_size to generation_config.batch_size
+        self.generation_config.batch_size = self.eval_batch_size
+
+        # Set default values for generation_config
+        if self.timeout is not None:
+            deprecated_warning(
+                logger,
+                'The `timeout` parameter is deprecated and will be removed in v2.0.0. Use `generation_config.timeout` instead.'
+            )
+            self.generation_config.timeout = self.timeout
+
+        if self.stream is not None:
+            deprecated_warning(
+                logger,
+                'The `stream` parameter is deprecated and will be removed in v2.0.0. Use `generation_config.stream` instead.'
+            )
+            self.generation_config.stream = self.stream
+
+        if self.generation_config.n is not None and self.generation_config.n > 1:
+            self.repeats = self.generation_config.n
+            self.generation_config.n = 1
+            deprecated_warning(
+                logger,
+                'The `n` parameter in generation_config is deprecated and will be removed in v2.0.0. Use `TaskConfig.repeats` instead.'
+            )
+
+    def __init_default_model_args(self):
+        if self.model_args:
+            return
+        if self.model_task == ModelTask.TEXT_GENERATION:
+            if self.eval_type == EvalType.CHECKPOINT:
+                self.model_args = {
+                    'revision': 'master',
+                    'precision': 'torch.float16',
+                }
+
+    def __init_default_sandbox_config(self):
+        if not self.use_sandbox:
+            return
+        check_import('ms_enclave', 'ms_enclave[docker]', raise_error=True)
 
-    def __str__(self):
-        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
+        if not self.sandbox_type:
+            self.sandbox_type = 'docker'
 
     def update(self, other: Union['TaskConfig', dict]):
         if isinstance(other, TaskConfig):
@@ -95,91 +263,16 @@ class TaskConfig:
         except Exception as e:
            logger.warning(f'Failed to dump overall task config: {e}')
 
-    @staticmethod
-    def list():
-        return list(registry_tasks.keys())
-
-    @staticmethod
-    def from_yaml(yaml_file: str):
-        return TaskConfig.from_dict(yaml_to_dict(yaml_file))
-
-    @staticmethod
-    def from_dict(d: dict):
-        return TaskConfig(**d)
-
-    @staticmethod
-    def from_json(json_file: str):
-        return TaskConfig.from_dict(json_to_dict(json_file))
-
-    @staticmethod
-    def from_args(args: Namespace):
-        # Convert Namespace to a dictionary and filter out None values
-        args_dict = {k: v for k, v in vars(args).items() if v is not None}
-
-        if 'func' in args_dict:
-            del args_dict['func']  # Note: compat CLI arguments
-
-        return TaskConfig.from_dict(args_dict)
-
-    @staticmethod
-    def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']:
-        res_list = []
-        for task_name in tasks:
-            task = registry_tasks.get(task_name, None)
-            if task is None:
-                logger.error(f'No task found in tasks: {list(registry_tasks.keys())}, got task_name: {task_name}')
-                continue
-
-            task.model = custom_model
-            task.model_args = custom_model.config
-            task.model_id = type(custom_model).__name__
-            res_list.append(task)
-
-        return res_list
-
-    @staticmethod
-    def registry(name: str, data_pattern: str, dataset_dir: str = None, subset_list: list = None) -> None:
-        """
-        Register a new task (dataset) for evaluation.
-
-        Args:
-            name: str, the dataset name.
-            data_pattern: str, the data pattern for the task.
-                e.g. `mmlu`, `ceval`, `gsm8k`, ...
-                refer to task_config.list() for all available datasets.
-            dataset_dir: str, the directory to store multiple datasets files. e.g. /path/to/data,
-                then your specific custom dataset directory will be /path/to/data/{name}
-            subset_list: list, the subset list for the dataset.
-                e.g. ['middle_school_politics', 'operating_system']
-                refer to the mmlu for example. https://github.com/hendrycks/test/blob/master/categories.py
-        """
-        available_datasets = list(registry_tasks.keys())
-        if data_pattern not in available_datasets:
-            logger.error(
-                f'No dataset found in available datasets: {available_datasets}, got data_pattern: {data_pattern}')
-            return
-
-        # Reuse the existing task config and update the datasets
-        pattern_config = registry_tasks[data_pattern]
-
-        custom_config = copy.deepcopy(pattern_config)
-        custom_config.datasets = [data_pattern]
-        custom_config.dataset_args = {data_pattern: {}}
-        custom_config.eval_type = EvalType.CHECKPOINT
-
-        if dataset_dir is not None:
-            custom_config.dataset_args[data_pattern].update({'local_path': dataset_dir})
-
-        if subset_list is not None:
-            custom_config.dataset_args[data_pattern].update({'subset_list': subset_list})
-
-        registry_tasks.update({name: custom_config})
-        logger.info(f'** Registered task: {name} with data pattern: {data_pattern}')
-
+    def to_dict(self):
+        result = copy.copy(self.__dict__)
+        del result['api_key']  # Do not expose api_key in the config
 
-tasks = ['arc', 'gsm8k', 'mmlu', 'cmmlu', 'ceval', 'bbh', 'general_qa']
+        if isinstance(self.model, (Model, ModelAPI)):
+            result['model'] = self.model.__class__.__name__
 
-registry_tasks = {task: TaskConfig.from_yaml(os.path.join(cur_path, f'registry/tasks/{task}.yaml')) for task in tasks}
+        if isinstance(self.generation_config, GenerateConfig):
+            result['generation_config'] = self.generation_config.model_dump(exclude_unset=True)
+        return result
 
 
 def parse_task_config(task_cfg) -> TaskConfig:
@@ -193,36 +286,14 @@ def parse_task_config(task_cfg) -> TaskConfig:
         logger.info('Args: Task config is provided with CommandLine type.')
         task_cfg = TaskConfig.from_args(task_cfg)
     elif isinstance(task_cfg, str):
-        extension = task_cfg.split('.')[-1]
+        extension = os.path.splitext(task_cfg)[-1]
         logger.info(f'Args: Task config is provided with {extension} file type.')
-        if extension in ['yaml', 'yml']:
+        if extension in ['.yaml', '.yml']:
             task_cfg = TaskConfig.from_yaml(task_cfg)
-        elif extension == 'json':
+        elif extension == '.json':
            task_cfg = TaskConfig.from_json(task_cfg)
        else:
            raise ValueError('Args: Unsupported file extension.')
    else:
        raise ValueError('Args: Please provide a valid task config.')
    return task_cfg
-
-
-class TempModel(CustomModel):
-
-    def __init__(self, config: dict):
-        super().__init__(config=config)
-
-    def predict(self, prompts: str, **kwargs):
-        return [item + ': response' for item in prompts]
-
-
-if __name__ == '__main__':
-    model = TempModel(config={'model_id': 'test-swift-dummy-model'})
-    task_config = TaskConfig()
-
-    # Register a new task
-    TaskConfig.registry(name='arc_swift', data_pattern='arc', dataset_dir='/path/to/swift_custom_work')
-
-    swift_eval_task: List[TaskConfig] = TaskConfig.load(custom_model=model, tasks=['gsm8k', 'arc', 'arc_swift'])
-    for item in swift_eval_task:
-        print(item)
-        print()
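
For orientation, here is a minimal sketch of driving the reworked `TaskConfig` above. It uses only fields visible in this diff; the model name and API endpoint are placeholders, not values shipped with the package.

```python
# Hypothetical usage sketch for the 1.2.0 TaskConfig; the model name and
# endpoint below are placeholders, not part of this release.
from evalscope.config import TaskConfig
from evalscope.constants import EvalType

cfg = TaskConfig(
    model='qwen2.5-7b-instruct',             # placeholder served-model name
    eval_type=EvalType.SERVICE,              # now the string 'openai_api'
    api_url='http://localhost:8000/v1',      # placeholder endpoint
    datasets=['gsm8k'],
    limit=0.1,                               # floats are parsed as a dataset fraction (parse_int_or_float)
    generation_config={'temperature': 0.0},  # plain dicts are validated into GenerateConfig
)
print(cfg.to_dict())  # to_dict() strips api_key from the dump
```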
evalscope/constants.py CHANGED
@@ -1,12 +1,22 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+# flake8: noqa
+import os
+
+os.environ['MODELSCOPE_LOG_LEVEL'] = '40'  # Set default log level to ERROR
+
 from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
 from modelscope.utils.file_utils import get_dataset_cache_root, get_model_cache_root
 
 DEFAULT_WORK_DIR = './outputs'
 DEFAULT_MODEL_REVISION = DEFAULT_REPOSITORY_REVISION  # master
-DEFAULT_MODEL_CACHE_DIR = get_model_cache_root()  # ~/.cache/modelscope/hub
-DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root()  # ~/.cache/modelscope/datasets
+DEFAULT_MODEL_CACHE_DIR = get_model_cache_root()  # ~/.cache/modelscope/hub/models
+DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root()  # ~/.cache/modelscope/hub/datasets
 DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR  # compatible with old version
+DEFAULT_EVALSCOPE_CACHE_DIR = os.path.expanduser(
+    os.getenv('EVALSCOPE_CACHE', '~/.cache/evalscope')
+)  # ~/.cache/evalscope
+IS_BUILD_DOC = os.getenv('BUILD_DOC', '0') == '1'  # To avoid some heavy dependencies when building doc
+HEARTBEAT_INTERVAL_SEC = 60  # 60 seconds
 
 
 class HubType:
@@ -36,47 +46,17 @@ class MetricsConstant:
     ]
 
 
-class MetricMembers:
-
-    # Math accuracy metric
-    MATH_ACCURACY = 'math_accuracy'
-
-    # Code pass@k metric
-    CODE_PASS_K = 'code_pass_k'
-
-    # Code rouge metric
-    ROUGE = 'rouge'
-
-    # ELO rating system for pairwise comparison
-    ELO = 'elo'
-
-    # Pairwise comparison win/lose and tie(optional)
-    PAIRWISE = 'pairwise'
-
-    # Rating score for single model
-    SCORE = 'score'
-
-
 class ArenaWinner:
 
     MODEL_A = 'model_a'
-
     MODEL_B = 'model_b'
-
     TIE = 'tie'
-
     TIE_BOTH_BAD = 'tie_both_bad'
-
     UNKNOWN = 'unknown'
 
 
-class ArenaMode:
-    SINGLE = 'single'
-    PAIRWISE = 'pairwise'
-    PAIRWISE_BASELINE = 'pairwise_baseline'
-
-
 class AnswerKeys:
+    INDEX = 'index'
     ANSWER_ID = 'answer_id'
     RAW_INPUT = 'raw_input'
     ORIGIN_PROMPT = 'origin_prompt'
@@ -85,58 +65,22 @@ class AnswerKeys:
     CHOICES = 'choices'
 
 
-class ReviewKeys:
-    REVIEW_ID = 'review_id'
-    REVIEWED = 'reviewed'
-    REVIEWER_SPEC = 'reviewer_spec'
-    REVIEW_TIME = 'review_time'
-    MESSAGE = 'message'
-    CONTENT = 'content'
-    GOLD = 'gold'
-    PRED = 'pred'
-    RESULT = 'result'
-    REVIEW = 'review'
-
-
-class EvalConfigKeys:
-    CLASS_REF = 'ref'
-    CLASS_ARGS = 'args'
-    ENABLE = 'enable'
-    POSITION_BIAS_MITIGATION = 'position_bias_mitigation'
-    RANDOM_SEED = 'random_seed'
-    FN_COMPLETION_PARSER = 'fn_completion_parser'
-    COMPLETION_PARSER_KWARGS = 'completion_parser_kwargs'
-    OUTPUT_FILE = 'output_file'
-    MODEL_ID_OR_PATH = 'model_id_or_path'
-    MODEL_REVISION = 'revision'
-    GENERATION_CONFIG = 'generation_config'
-    PRECISION = 'precision'
-    TEMPLATE_TYPE = 'template_type'
-
-
-class FnCompletionParser:
-    LMSYS_PARSER: str = 'lmsys_parser'
-    RANKING_PARSER: str = 'ranking_parser'
-
-
-class PositionBiasMitigation:
-    NONE = 'none'
-    RANDOMIZE_ORDER = 'randomize_order'
-    SWAP_POSITION = 'swap_position'
-
-
-class EvalStage:
-    # Enums: `all`, `infer`, `review`
-    ALL = 'all'
-    INFER = 'infer'
-    REVIEW = 'review'
-
-
 class EvalType:
 
     CUSTOM = 'custom'
-    CHECKPOINT = 'checkpoint'  # native model checkpoint
-    SERVICE = 'service'  # model service
+    MOCK_LLM = 'mock_llm'
+    CHECKPOINT = 'llm_ckpt'  # native model checkpoint
+    SERVICE = 'openai_api'  # model service
+    TEXT2IMAGE = 'text2image'  # image generation service
+    IMAGE_EDITING = 'image_editing'  # image editing service
+
+
+class OutputType:
+    LOGITS = 'logits'  # for logits output tasks
+    GENERATION = 'generation'  # for text generation tasks and general tasks
+    MULTIPLE_CHOICE = 'multiple_choice_logits'  # for multiple choice tasks
+    CONTINUOUS = 'continuous_logits'  # for continuous tasks
+    IMAGE_GENERATION = 'image_generation'  # for image generation tasks
 
 
 class EvalBackend:
@@ -149,3 +93,55 @@ class EvalBackend:
 
 class DataCollection:
     NAME = 'data_collection'
+    INFO = 'collection_info'
+
+
+class JudgeStrategy:
+    AUTO = 'auto'
+    RULE = 'rule'
+    LLM = 'llm'
+    LLM_RECALL = 'llm_recall'
+
+
+class JudgeScoreType:
+    NUMERIC = 'numeric'  # numeric score
+    PATTERN = 'pattern'  # pattern matching score
+
+
+class ModelTask:
+    TEXT_GENERATION = 'text_generation'
+    IMAGE_GENERATION = 'image_generation'
+
+
+class Tags:
+    KNOWLEDGE = 'Knowledge'
+    MULTIPLE_CHOICE = 'MCQ'
+    MATH = 'Math'
+    REASONING = 'Reasoning'
+    CODING = 'Coding'
+    CHINESE = 'Chinese'
+    COMMONSENSE = 'Commonsense'
+    QA = 'QA'
+    NER = 'NER'
+    READING_COMPREHENSION = 'ReadingComprehension'
+    CUSTOM = 'Custom'
+    INSTRUCTION_FOLLOWING = 'InstructionFollowing'
+    ARENA = 'Arena'
+    LONG_CONTEXT = 'LongContext'
+    RETRIEVAL = 'Retrieval'
+    FUNCTION_CALLING = 'FunctionCalling'
+    TEXT_TO_IMAGE = 'TextToImage'
+    IMAGE_EDITING = 'ImageEditing'
+    MULTI_MODAL = 'MultiModal'
+    MULTI_LINGUAL = 'MultiLingual'
+    MULTI_TURN = 'MultiTurn'
+    YES_NO = 'Yes/No'
+    HALLUCINATION = 'Hallucination'
+    MEDICAL = 'Medical'
+    AGENT = 'Agent'
+    MT = 'MachineTranslation'
+
+
+class FileConstants:
+    IMAGE_PATH = 'image_path'
+    ID = 'id'
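
Note from the constants diff that the `EvalType` string values themselves changed, so 0.10.0-era configs or scripts that compare against the raw strings (rather than the class attributes) will silently mismatch. A small sketch of the behavioral difference:

```python
from evalscope.constants import EvalType

# The attribute names survive, but their string values changed in 1.2.0:
assert EvalType.CHECKPOINT == 'llm_ckpt'  # was 'checkpoint' in 0.10.0
assert EvalType.SERVICE == 'openai_api'   # was 'service' in 0.10.0
assert EvalType.MOCK_LLM == 'mock_llm'    # new in 1.2.0; EvalStage was removed entirely
```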
evalscope/evaluator/__init__.py CHANGED
@@ -1,3 +1,3 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from evalscope.evaluator.evaluator import Evaluator
+from .evaluator import DefaultEvaluator
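
This last hunk renames the public evaluator class, so downstream imports need a one-line migration (sketch):

```python
# 0.10.0:
# from evalscope.evaluator import Evaluator
# 1.2.0:
from evalscope.evaluator import DefaultEvaluator
```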