evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/wmt/wmt24_adapter.py ADDED
@@ -0,0 +1,294 @@
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessageUser, ContentText
+ from evalscope.api.metric import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.import_utils import check_import
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ PROMPT_TEMPLATE = """
+ Translate the following {source_language} sentence into {target_language}:
+
+ {source_language}: {source_text}
+ {target_language}:
+ """.strip()
+
+ LANGUAGE_PAIRS = [
+     'en-ar_eg',
+     'en-ar_sa',
+     'en-bg_bg',
+     'en-bn_in',
+     'en-ca_es',
+     'en-cs_cz',
+     'en-da_dk',
+     'en-de_de',
+     'en-el_gr',
+     'en-es_mx',
+     'en-et_ee',
+     'en-fa_ir',
+     'en-fi_fi',
+     'en-fil_ph',
+     'en-fr_ca',
+     'en-fr_fr',
+     'en-gu_in',
+     'en-he_il',
+     'en-hi_in',
+     'en-hr_hr',
+     'en-hu_hu',
+     'en-id_id',
+     'en-is_is',
+     'en-it_it',
+     'en-ja_jp',
+     'en-kn_in',
+     'en-ko_kr',
+     'en-lt_lt',
+     'en-lv_lv',
+     'en-ml_in',
+     'en-mr_in',
+     'en-nl_nl',
+     'en-no_no',
+     'en-pa_in',
+     'en-pl_pl',
+     'en-pt_br',
+     'en-pt_pt',
+     'en-ro_ro',
+     'en-ru_ru',
+     'en-sk_sk',
+     'en-sl_si',
+     'en-sr_rs',
+     'en-sv_se',
+     'en-sw_ke',
+     'en-sw_tz',
+     'en-ta_in',
+     'en-te_in',
+     'en-th_th',
+     'en-tr_tr',
+     'en-uk_ua',
+     'en-ur_pk',
+     'en-vi_vn',
+     'en-zh_cn',
+     'en-zh_tw',
+     'en-zu_za',
+ ]
+
+ LANGUAGE_BY_CODE = {
+     'ar_eg': 'arabic',
+     'ar_sa': 'arabic',
+     'bg_bg': 'bulgarian',
+     'bn_bd': 'bengali',
+     'bn_in': 'bengali',
+     'ca_es': 'catalan',
+     'cs_cz': 'czech',
+     'da_dk': 'danish',
+     'de_de': 'german',
+     'el_gr': 'greek',
+     'es_mx': 'spanish',
+     'et_ee': 'estonian',
+     'fa_ir': 'farsi',
+     'fi_fi': 'finnish',
+     'fil_ph': 'filipino',
+     'fr_ca': 'french',
+     'fr_fr': 'french',
+     'gu_in': 'gujarati',
+     'he_il': 'hebrew',
+     'hi_in': 'hindi',
+     'hr_hr': 'croatian',
+     'hu_hu': 'hungarian',
+     'id_id': 'indonesian',
+     'is_is': 'icelandic',
+     'it_it': 'italian',
+     'ja_jp': 'japanese',
+     'kn_in': 'kannada',
+     'ko_kr': 'korean',
+     'lt_lt': 'lithuanian',
+     'lv_lv': 'latvian',
+     'ml_in': 'malayalam',
+     'mr_in': 'marathi',
+     'nl_nl': 'dutch',
+     'no_no': 'norwegian',
+     'pa_in': 'punjabi',
+     'pl_pl': 'polish',
+     'pt_br': 'portuguese',
+     'pt_pt': 'portuguese',
+     'ro_ro': 'romanian',
+     'ru_ru': 'russian',
+     'sk_sk': 'slovak',
+     'sl_si': 'slovenian',
+     'sr_rs': 'serbian',
+     'sv_se': 'swedish',
+     'sw_ke': 'swahili',
+     'sw_tz': 'swahili',
+     'ta_in': 'tamil',
+     'te_in': 'telugu',
+     'th_th': 'thai',
+     'tr_tr': 'turkish',
+     'uk_ua': 'ukrainian',
+     'ur_pk': 'urdu',
+     'vi_vn': 'vietnamese',
+     'zh_cn': 'mandarin',
+     'zh_tw': 'mandarin',
+     'zu_za': 'zulu',
+     'en': 'english',
+ }
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='wmt24pp',
+         pretty_name='WMT2024++',
+         dataset_id='extraordinarylab/wmt24pp',
+         tags=[Tags.MULTI_LINGUAL, Tags.MT],
+         description=(
+             'WMT2024 news translation benchmark supporting multiple language pairs. '
+             'Each subset represents a specific translation direction'
+         ),
+         subset_list=LANGUAGE_PAIRS,
+         eval_split='test',
+         metric_list={
+             'bleu': {},
+             'bert_score': {
+                 'model_id_or_path': 'AI-ModelScope/xlm-roberta-large',
+                 'model_type': 'xlm-roberta-large'
+             },
+             'comet': {
+                 'model_id_or_path': 'evalscope/wmt22-comet-da',
+             }
+         },
+         few_shot_num=0,
+         prompt_template=PROMPT_TEMPLATE,
+     )
+ )
+ class WMT24PPAdapter(DefaultDataAdapter):
+
+     def __init__(self, **kwargs: Any) -> None:
+         """Initialize adapter and configure dataset subsets."""
+         super().__init__(**kwargs)
+         self.reformat_subset = True
+         self.use_batch_scoring = True  # Enable batch scoring
+
+         if 'comet' in self.metric_list:
+             check_import('comet', 'unbabel-comet', raise_error=True, feature_name='COMETScore Metric')
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """
+         Convert a data record to a Sample object.
+         """
+         source_text = str(record['source'])
+         target_text = str(record['target'])
+         language_pair = str(record['language_pair'])
+         source_language, target_language = language_pair.split('-')
+
+         # Format the generation prompt with the text
+         input_prompt = self.prompt_template.format(
+             source_text=source_text,
+             source_language=LANGUAGE_BY_CODE[source_language],
+             target_language=LANGUAGE_BY_CODE[target_language],
+         )
+
+         # Create content list for the input
+         content_list = [ContentText(text=input_prompt)]
+
+         return Sample(
+             input=[ChatMessageUser(content=content_list)],
+             target=target_text,
+             subset_key=language_pair,
+             metadata={
+                 'source_text': source_text,
+                 'target_text': target_text,
+                 'source_language': source_language,
+                 'target_language': target_language,
+             },
+         )
+
+     def match_score(
+         self,
+         original_prediction: str,
+         filtered_prediction: str,
+         reference: str,
+         task_state: TaskState,
+     ) -> Score:
+         """Compute per-sample translation metrics."""
+         # Create a Score object for the current sample
+         score = Score(
+             prediction=original_prediction,
+             extracted_prediction=filtered_prediction,
+             value={},
+         )
+
+         # ---- BLEU ----
+         if 'bleu' in self.metric_list:
+             try:
+                 from evalscope.metrics import bleu_ngram_one_sample
+
+                 bleu_results = bleu_ngram_one_sample(filtered_prediction, reference)
+                 score.value.update(bleu_results)
+             except Exception as e:
+                 logger.warning(f'[WMT24PPAdapter] BLEU single-sample calculation failed: {e}')
+         return score
+
+     def batch_match_score(
+         self,
+         original_predictions: List[str],
+         filtered_predictions: List[str],
+         references: List[str],
+         task_states: List[TaskState],
+     ) -> List[Score]:
+         """Compute batched translation metrics (BLEU, BERTScore, COMET)."""
+         scores: List[Score] = []
+         for i in range(len(original_predictions)):
+             score = Score(
+                 extracted_prediction=filtered_predictions[i],
+                 prediction=original_predictions[i],
+                 value={},
+             )
+             scores.append(score)
+
+         # ---- BLEU (per-sample within batch) ----
+         if 'bleu' in self.metric_list:
+             try:
+                 from evalscope.metrics import bleu_ngram_one_sample
+
+                 for i in range(len(scores)):
+                     bleu_results = bleu_ngram_one_sample(filtered_predictions[i], references[i])
+                     scores[i].value.update(bleu_results)
+             except Exception as e:
+                 logger.warning(f'[WMT24PPAdapter] BLEU batch calculation failed: {e}')
+
+         # ---- BERTScore ----
+         if 'bert_score' in self.metric_list:
+             try:
+                 from evalscope.metrics.metric import BertScore
+
+                 score_args = self.metric_list.get('bert_score', {})
+                 bert_scorer = BertScore(**score_args)
+                 bert_score_f1 = bert_scorer.apply(filtered_predictions, references)
+                 for i in range(len(scores)):
+                     scores[i].value.update({'bert_score': bert_score_f1[i]})
+             except Exception as e:
+                 logger.warning(f'[WMT24PPAdapter] BERTScore batch calculation failed: {e}')
+
+         # ---- COMET ----
+         if 'comet' in self.metric_list:
+             try:
+                 from evalscope.metrics.metric import COMETScore
+
+                 score_args = self.metric_list.get('comet', {})
+                 comet_scorer = COMETScore(**score_args)
+                 data = [{
+                     'src': st.metadata.get('source_text'),
+                     'mt': pred,
+                     'ref': ref
+                 } for pred, ref, st in zip(filtered_predictions, references, task_states)]
+                 comet_scores = comet_scorer.apply(data)
+                 for i in range(len(scores)):
+                     scores[i].value.update({'comet': comet_scores[i]})
+             except Exception as e:
+                 logger.warning(f'[WMT24PPAdapter] COMET batch calculation failed: {e}')
+
+         return scores
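Note: for orientation, a minimal usage sketch of the new benchmark (not part of the diff). It assumes evalscope's public TaskConfig/run_task entry points and an OpenAI-compatible endpoint; the model name and URL below are placeholders, while the dataset name 'wmt24pp' and the subset codes come from the BenchmarkMeta above.

from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='qwen2.5-7b-instruct',         # placeholder served model name
    api_url='http://127.0.0.1:8801/v1',  # placeholder OpenAI-compatible endpoint
    eval_type='openai_api',              # EvalType.SERVICE (see constants.py below)
    datasets=['wmt24pp'],                # benchmark name registered above
    dataset_args={'wmt24pp': {'subset_list': ['en-de_de', 'en-zh_cn']}},
)
run_task(task)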
evalscope/benchmarks/zerobench/__init__.py ADDED (empty file, no content to show)
evalscope/benchmarks/zerobench/zerobench_adapter.py ADDED
@@ -0,0 +1,64 @@
+ # flake8: noqa: E501
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64, compress_image_to_limit
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ # Define the prompt template
+ PROMPT_TEMPLATE = """{question}
+ \n\n\nLet's think step by step and give the final answer in curly braces,
+ like this: {{final answer}}"
+ """
+
+ SUBSET_LIST = ['default']
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='zerobench',
+         pretty_name='ZeroBench',
+         dataset_id='evalscope/zerobench',
+         tags=[Tags.KNOWLEDGE, Tags.QA, Tags.MULTI_MODAL],
+         description=
+         'ZeroBench is a challenging visual reasoning benchmark for Large Multimodal Models (LMMs). It consists of a main set of 100 high-quality, manually curated questions covering numerous domains, reasoning types and image type. Questions in ZeroBench have been designed and calibrated to be beyond the capabilities of current frontier models. As such, none of the evaluated models achieves a non-zero pass@1 (with greedy decoding) or 5/5 reliability score.',
+         subset_list=SUBSET_LIST,
+         metric_list=['acc'],
+         eval_split='zerobench',
+         train_split='zerobench_subquestions',
+         prompt_template=PROMPT_TEMPLATE,
+     )
+ )
+ class ZeroBenchAdapter(VisionLanguageAdapter):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+         self._use_llm_judge = True
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         question = record['question_text']
+         content_list: List[Content] = [ContentText(text=self.prompt_template.format(question=question))]
+         image = record['question_images_decoded']
+         if len(image) > 0:
+             for img in image:
+                 # Ensure image is under OpenAI's 10MB data-URI limit by compressing if needed
+                 processed_bytes, fmt = compress_image_to_limit(img['bytes'], 10_000_000)
+                 image_base64 = bytes_to_base64(processed_bytes, format=fmt, add_header=True)
+                 content_list.append(ContentImage(image=image_base64))
+
+         metadata = {
+             'question_id': record['question_id'],
+             'question_images': record['question_images'],
+             'image_attribution': record['image_attribution']
+         }
+
+         return Sample(
+             input=[ChatMessageUser(content=content_list)], target=record['question_answer'], metadata=metadata
+         )
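Note: the adapter sets self._use_llm_judge = True, so grading relies on a judge model. A hedged sketch of wiring that up through TaskConfig (the judge_model_args keys are assumptions not confirmed by this diff; model names and URLs are placeholders):

from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='qwen2.5-vl-7b-instruct',           # placeholder multimodal endpoint
    api_url='http://127.0.0.1:8801/v1',
    eval_type='openai_api',
    datasets=['zerobench'],
    judge_model_args={                        # keys are assumptions
        'model_id': 'qwen2.5-72b-instruct',   # placeholder judge model
        'api_url': 'http://127.0.0.1:8802/v1',
        'api_key': 'EMPTY',
    },
)
run_task(task)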
evalscope/cli/start_app.py CHANGED
@@ -28,6 +28,12 @@ class StartAppCMD(CLICommand):
          parser.set_defaults(func=subparser_func)

      def execute(self):
-         from evalscope.app import create_app
+         try:
+             from evalscope.app import create_app
+         except ImportError as e:
+             raise ImportError(
+                 f'Failed to import create_app from evalscope.app, due to {e}. '
+                 "Please run `pip install 'evalscope[app]'`."
+             )

          create_app(self.args)
evalscope/cli/start_perf.py CHANGED
@@ -28,6 +28,12 @@ class PerfBenchCMD(CLICommand):
          parser.set_defaults(func=subparser_func)

      def execute(self):
-         from evalscope.perf.main import run_perf_benchmark
+         try:
+             from evalscope.perf.main import run_perf_benchmark
+         except ImportError as e:
+             raise ImportError(
+                 f'Failed to import run_perf_benchmark from evalscope.perf.main, due to {e}. '
+                 "Please run `pip install 'evalscope[perf]'`."
+             )

          run_perf_benchmark(self.args)
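Note: the same guarded import is useful when driving the perf benchmark programmatically. A sketch under stated assumptions: run_perf_benchmark is the entry point imported above, and Arguments lives in evalscope/perf/arguments.py (changed in this release), but the specific fields shown are assumptions and the endpoint/model are placeholders.

try:
    from evalscope.perf.arguments import Arguments
    from evalscope.perf.main import run_perf_benchmark
except ImportError as e:
    raise ImportError(f"perf extras missing ({e}); run `pip install 'evalscope[perf]'`.")

args = Arguments(
    model='qwen2.5-7b-instruct',                      # placeholder served model
    url='http://127.0.0.1:8801/v1/chat/completions',  # placeholder endpoint
    api='openai',
    dataset='openqa',
    parallel=4,                                       # field names are assumptions
    number=20,
)
run_perf_benchmark(args)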
evalscope/config.py CHANGED
@@ -6,7 +6,7 @@ from argparse import Namespace
  from dataclasses import dataclass, field
  from typing import Dict, List, Optional, Union

- from evalscope.api.model import GenerateConfig
+ from evalscope.api.model import GenerateConfig, Model, ModelAPI
  from evalscope.constants import (
      DEFAULT_DATASET_CACHE_DIR,
      DEFAULT_WORK_DIR,
@@ -15,12 +15,13 @@ from evalscope.constants import (
      HubType,
      JudgeStrategy,
      ModelTask,
-     OutputType,
  )
  from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
  from evalscope.utils.deprecation_utils import deprecated_warning
+ from evalscope.utils.import_utils import check_import
  from evalscope.utils.io_utils import dict_to_yaml, gen_hash, safe_filename
  from evalscope.utils.logger import get_logger
+ from evalscope.version import __version__ as evalscope_version

  logger = get_logger()

@@ -28,51 +29,118 @@ logger = get_logger()
  @dataclass
  class TaskConfig(BaseArgument):
      # Model-related arguments
-     model: Optional[str] = None
+     model: Optional[Union[str, Model, ModelAPI]] = None
+     """The model to be evaluated. Can be a string path, Model object, or ModelAPI object."""
+
      model_id: Optional[str] = None
+     """Unique identifier for the model. Auto-generated from model name if not provided."""
+
      model_args: Dict = field(default_factory=dict)
+     """Additional arguments to pass to the model during initialization."""
+
      model_task: str = ModelTask.TEXT_GENERATION
+     """The type of task the model performs (e.g., text generation, image generation)."""

      # Template-related arguments
      chat_template: Optional[str] = None
+     """Chat template to use for formatting conversations with the model."""

      # Dataset-related arguments
      datasets: List[str] = field(default_factory=list)
+     """List of dataset names to evaluate the model on."""
+
      dataset_args: Dict = field(default_factory=dict)
+     """Additional arguments to pass to datasets during loading."""
+
      dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
+     """Directory where datasets are cached locally."""
+
      dataset_hub: str = HubType.MODELSCOPE
-     repeats: int = 1  # Number of times to repeat the dataset items for k-metrics
+     """Hub platform to download datasets from (e.g., ModelScope, HuggingFace)."""
+
+     repeats: int = 1
+     """Number of times to repeat the dataset items for k-metrics evaluation."""

      # Generation configuration arguments
      generation_config: Union[Dict, GenerateConfig] = field(default_factory=dict)
+     """Configuration parameters for text/image generation."""

      # Evaluation-related arguments
      eval_type: str = EvalType.CHECKPOINT
+     """Type of evaluation: checkpoint, service, or mock."""
+
      eval_backend: str = EvalBackend.NATIVE
+     """Backend framework to use for evaluation."""
+
      eval_config: Union[str, Dict, None] = None
+     """Additional evaluation configuration parameters."""
+
      limit: Optional[Union[int, float]] = None
+     """Maximum number of samples to evaluate. Can be int (count) or float (fraction)."""
+
      eval_batch_size: int = 1
+     """Batch size for evaluation processing."""

      # Cache and working directory arguments
      use_cache: Optional[str] = None
+     """Whether to use cached results and which cache strategy to apply."""
+
      rerun_review: bool = False
+     """Whether to rerun the review process even if results exist."""
+
      work_dir: str = DEFAULT_WORK_DIR
+     """Working directory for storing evaluation results and temporary files."""

      # Debug and runtime mode arguments
      ignore_errors: bool = False
+     """Whether to continue evaluation when encountering errors."""
+
      debug: bool = False
-     dry_run: bool = False
+     """Enable debug mode for detailed logging and error reporting."""
+
      seed: Optional[int] = 42
-     api_url: Optional[str] = None  # Only used for server model
-     api_key: Optional[str] = 'EMPTY'  # Only used for server model
-     timeout: Optional[float] = None  # Only used for server model
-     stream: Optional[bool] = None  # Only used for server model
+     """Random seed for reproducible results."""
+
+     api_url: Optional[str] = None
+     """API endpoint URL for server-based model evaluation."""
+
+     api_key: Optional[str] = 'EMPTY'
+     """API key for authenticating with server-based models."""
+
+     timeout: Optional[float] = None
+     """Request timeout in seconds for server-based models."""
+
+     stream: Optional[bool] = None
+     """Whether to use streaming responses for server-based models."""

      # LLMJudge arguments
      judge_strategy: str = JudgeStrategy.AUTO
+     """Strategy for LLM-based judgment (auto, single, pairwise)."""
+
      judge_worker_num: int = 1
+     """Number of worker processes for parallel LLM judging."""
+
      judge_model_args: Optional[Dict] = field(default_factory=dict)
+     """Additional arguments for the judge model configuration."""
+
      analysis_report: bool = False
+     """Whether to generate detailed analysis reports after evaluation."""
+
+     # Sandbox configuration arguments
+     use_sandbox: bool = False
+     """Whether to execute code in a sandboxed environment."""
+
+     sandbox_type: Optional[str] = 'docker'
+     """Type of sandbox environment for code execution (e.g., docker). Default is 'docker'."""
+
+     sandbox_manager_config: Optional[Dict] = field(default_factory=dict)
+     """Configuration for the sandbox manager. Default is local manager. If url is provided, it will use remote manager."""
+
+     sandbox_config: Optional[Dict] = field(default_factory=dict)
+     """Configuration for sandboxed code execution environments."""
+
+     evalscope_version: Optional[str] = evalscope_version
+     """EvalScope version used for the evaluation."""

      def __post_init__(self):
          self.__init_model_and_id()
@@ -82,20 +150,22 @@ class TaskConfig(BaseArgument):
          # Set default generation_config and model_args
          self.__init_default_generation_config()
          self.__init_default_model_args()
+         self.__init_default_sandbox_config()

      def __init_model_and_id(self):
          # Set model to DummyCustomModel if not provided
          if self.model is None:
              self.model = self.model_task
              self.eval_type = EvalType.MOCK_LLM
-         else:
-             if self.model_task == ModelTask.IMAGE_GENERATION:
-                 self.eval_type = EvalType.TEXT2IMAGE

          # Set model_id if not provided
          if not self.model_id:
-             if self.model:
+             if isinstance(self.model, str):
                  self.model_id = safe_filename(os.path.basename(self.model))
+             elif isinstance(self.model, Model):
+                 self.model_id = safe_filename(self.model.name)
+             elif isinstance(self.model, ModelAPI):
+                 self.model_id = safe_filename(self.model.model_name)
              else:
                  self.model_id = 'dummy_model'
@@ -113,6 +183,11 @@ class TaskConfig(BaseArgument):
                  'num_inference_steps': 50,
                  'guidance_scale': 9.0,
              }
+             if self.eval_batch_size != 1:
+                 logger.warning(
+                     'For image generation task, we only support eval_batch_size=1 for now, changed to 1.'
+                 )
+                 self.eval_batch_size = 1
          elif self.model_task == ModelTask.TEXT_GENERATION:
              if self.eval_type == EvalType.CHECKPOINT:
                  self.generation_config = {
@@ -125,7 +200,6 @@ class TaskConfig(BaseArgument):
                  }
              elif self.eval_type == EvalType.SERVICE:
                  self.generation_config = {
-                     'max_tokens': 2048,
                      'temperature': 0.0,
                  }
          if isinstance(self.generation_config, dict):
@@ -138,14 +212,14 @@ class TaskConfig(BaseArgument):
          if self.timeout is not None:
              deprecated_warning(
                  logger,
-                 'The `timeout` parameter is deprecated and will be removed in v1.1.0. Use `generation_config.timeout` instead.'
+                 'The `timeout` parameter is deprecated and will be removed in v2.0.0. Use `generation_config.timeout` instead.'
              )
              self.generation_config.timeout = self.timeout

          if self.stream is not None:
              deprecated_warning(
                  logger,
-                 'The `stream` parameter is deprecated and will be removed in v1.1.0. Use `generation_config.stream` instead.'
+                 'The `stream` parameter is deprecated and will be removed in v2.0.0. Use `generation_config.stream` instead.'
              )
              self.generation_config.stream = self.stream

@@ -154,7 +228,7 @@ class TaskConfig(BaseArgument):
              self.generation_config.n = 1
              deprecated_warning(
                  logger,
-                 'The `n` parameter in generation_config is deprecated and will be removed in v1.1.0. Use `TaskConfig.repeats` instead.'
+                 'The `n` parameter in generation_config is deprecated and will be removed in v2.0.0. Use `TaskConfig.repeats` instead.'
              )

      def __init_default_model_args(self):
@@ -167,6 +241,14 @@ class TaskConfig(BaseArgument):
                  'precision': 'torch.float16',
              }

+     def __init_default_sandbox_config(self):
+         if not self.use_sandbox:
+             return
+         check_import('ms_enclave', 'ms_enclave[docker]', raise_error=True)
+
+         if not self.sandbox_type:
+             self.sandbox_type = 'docker'
+
      def update(self, other: Union['TaskConfig', dict]):
          if isinstance(other, TaskConfig):
              other = other.to_dict()
@@ -182,9 +264,12 @@ class TaskConfig(BaseArgument):
              logger.warning(f'Failed to dump overall task config: {e}')

      def to_dict(self):
-         result = copy.deepcopy(self.__dict__)
+         result = copy.copy(self.__dict__)
          del result['api_key']  # Do not expose api_key in the config

+         if isinstance(self.model, (Model, ModelAPI)):
+             result['model'] = self.model.__class__.__name__
+
          if isinstance(self.generation_config, GenerateConfig):
              result['generation_config'] = self.generation_config.model_dump(exclude_unset=True)
          return result
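Note: a hedged sketch of the new sandbox-related fields in use. Setting use_sandbox=True triggers the ms_enclave[docker] check added in __init_default_sandbox_config; the dataset name, endpoint, and model below are placeholders, and an empty sandbox_manager_config selects the local manager per the field's docstring.

from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='qwen2.5-coder-7b-instruct',   # placeholder served model
    api_url='http://127.0.0.1:8801/v1',
    eval_type='openai_api',
    datasets=['live_code_bench'],        # assumed registered name of the code benchmark
    use_sandbox=True,                    # requires `pip install 'ms_enclave[docker]'`
    sandbox_type='docker',               # default sandbox type
    sandbox_manager_config={},           # empty -> local manager; a 'url' key selects a remote manager
)
run_task(task)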
evalscope/constants.py CHANGED
@@ -15,6 +15,8 @@ DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR # compatible with old versio
  DEFAULT_EVALSCOPE_CACHE_DIR = os.path.expanduser(
      os.getenv('EVALSCOPE_CACHE', '~/.cache/evalscope')
  )  # ~/.cache/evalscope
+ IS_BUILD_DOC = os.getenv('BUILD_DOC', '0') == '1'  # To avoid some heavy dependencies when building doc
+ HEARTBEAT_INTERVAL_SEC = 60  # 60 seconds


  class HubType:
@@ -70,6 +72,7 @@ class EvalType:
      CHECKPOINT = 'llm_ckpt'  # native model checkpoint
      SERVICE = 'openai_api'  # model service
      TEXT2IMAGE = 'text2image'  # image generation service
+     IMAGE_EDITING = 'image_editing'  # image editing service


  class OutputType:
@@ -119,6 +122,7 @@ class Tags:
      CHINESE = 'Chinese'
      COMMONSENSE = 'Commonsense'
      QA = 'QA'
+     NER = 'NER'
      READING_COMPREHENSION = 'ReadingComprehension'
      CUSTOM = 'Custom'
      INSTRUCTION_FOLLOWING = 'InstructionFollowing'
@@ -127,3 +131,17 @@ class Tags:
      RETRIEVAL = 'Retrieval'
      FUNCTION_CALLING = 'FunctionCalling'
      TEXT_TO_IMAGE = 'TextToImage'
+     IMAGE_EDITING = 'ImageEditing'
+     MULTI_MODAL = 'MultiModal'
+     MULTI_LINGUAL = 'MultiLingual'
+     MULTI_TURN = 'MultiTurn'
+     YES_NO = 'Yes/No'
+     HALLUCINATION = 'Hallucination'
+     MEDICAL = 'Medical'
+     AGENT = 'Agent'
+     MT = 'MachineTranslation'
+
+
+ class FileConstants:
+     IMAGE_PATH = 'image_path'
+     ID = 'id'