evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
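
Note: the file list above includes many newly added benchmark adapters (for example chartqa, docvqa, healthbench). For orientation only, the sketch below shows how such a benchmark might be run through the TaskConfig/run_task API that also appears in the removed tests further down. It is a minimal sketch, not code from the package: the dataset key 'chartqa' is assumed to match the new adapter's registration name, and the endpoint and model names are placeholders.

# Hedged sketch (not from the package): run one of the benchmarks added in 1.2.0
# against an OpenAI-compatible endpoint. Assumes the new chartqa adapter registers
# under the dataset key 'chartqa'; endpoint and model names are placeholders.
import os

from evalscope import TaskConfig, run_task
from evalscope.constants import EvalType

task_cfg = TaskConfig(
    model='qwen-plus',  # placeholder model served behind an OpenAI-compatible API
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key=os.getenv('DASHSCOPE_API_KEY'),
    eval_type=EvalType.SERVICE,
    datasets=['chartqa'],  # assumed registration name of the new adapter
    eval_batch_size=5,
    limit=10,  # small smoke-test run
    generation_config={'temperature': 0.0, 'max_tokens': 4096},
)
run_task(task_cfg=task_cfg)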
tests/cli/test_collection.py DELETED
@@ -1,96 +0,0 @@
- from dotenv import dotenv_values
-
- env = dotenv_values('.env')
- import json
- import os
- import unittest
-
- from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler
- from evalscope.constants import EvalType, JudgeStrategy
- from evalscope.utils.io_utils import dump_jsonl_data
- from tests.utils import test_level_list
-
-
- class TestCollection(unittest.TestCase):
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_create_collection(self):
-         schema = CollectionSchema(name='math&reasoning', datasets=[
-             CollectionSchema(name='math', datasets=[
-                 CollectionSchema(name='generation', datasets=[
-                     DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']),
-                 ]),
-                 CollectionSchema(name='multiple_choice', datasets=[
-                     DatasetInfo(name='cmmlu', weight=2, task_type='math', tags=['zh', 'math'], args={'subset_list': ['college_mathematics', 'high_school_mathematics']}),
-                     DatasetInfo(name='ceval', weight=3, task_type='math', tags=['zh', 'math'], args={'subset_list': ['advanced_mathematics', 'high_school_mathematics', 'discrete_mathematics', 'middle_school_mathematics']}),
-                 ]),
-             ]),
-             CollectionSchema(name='reasoning', datasets=[
-                 DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
-                 DatasetInfo(name='ceval', weight=1, task_type='reasoning', tags=['zh', 'reasoning'], args={'subset_list': ['logic']}),
-                 DatasetInfo(name='race', weight=1, task_type='reasoning', tags=['en', 'reasoning']),
-             ]),
-         ])
-         print(schema.to_dict())
-         print(schema.flatten())
-         schema.dump_json('outputs/schema_test.json')
-
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_generate_data(self):
-         schema = CollectionSchema.from_dict(json.load(open('outputs/schema_test.json', 'r')))
-         print(schema.to_dict())
-         mixed_data = WeightedSampler(schema).sample(100)
-         dump_jsonl_data(mixed_data, 'outputs/mixed_data_test.jsonl')
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_evaluate_collection(self):
-         from evalscope import TaskConfig, run_task
-
-         task_cfg = TaskConfig(
-             model='qwen-plus',
-             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-             api_key=env.get('DASHSCOPE_API_KEY'),
-             eval_type=EvalType.SERVICE,
-             datasets=['data_collection'],
-             dataset_args={'data_collection': {
-                 'local_path': 'outputs/mixed_data_test.jsonl'
-                 # 'local_path': 'outputs/weighted_mixed_data.jsonl'
-             }},
-             eval_batch_size=5,
-             generation_config = {
-                 'max_tokens': 10000,
-                 'temperature': 0.0,
-             },
-             limit=50,
-             # use_cache='outputs/20250822_161804'
-         )
-         run_task(task_cfg=task_cfg)
-
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_evaluate_collection_with_judge(self):
-         from evalscope import TaskConfig, run_task
-
-         task_cfg = TaskConfig(
-             model='qwen2.5-7b-instruct',
-             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-             api_key= os.getenv('DASHSCOPE_API_KEY'),
-             eval_type=EvalType.SERVICE,
-             datasets=['data_collection'],
-             dataset_args={'data_collection': {
-                 'local_path': 'outputs/mixed_data_test.jsonl'
-                 # 'local_path': 'outputs/weighted_mixed_data.jsonl'
-             }},
-             limit=5,
-             judge_strategy=JudgeStrategy.AUTO,
-             judge_model_args={
-                 'model_id': 'qwen2.5-72b-instruct',
-                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                 'api_key': os.getenv('DASHSCOPE_API_KEY'),
-             },
-             analysis_report=True,
-             ignore_errors=True,
-             # use_cache='outputs/20250522_204520'
-         )
-         res = run_task(task_cfg=task_cfg)
-         print(res)
tests/cli/test_custom.py DELETED
@@ -1,268 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- from dotenv import dotenv_values
-
- from tests.utils import test_level_list
-
- env = dotenv_values('.env')
-
- import os
- import subprocess
- import unittest
-
- from evalscope.config import TaskConfig
- from evalscope.constants import EvalType, JudgeStrategy, OutputType
- from evalscope.run import run_task
- from evalscope.utils.import_utils import is_module_installed
- from evalscope.utils.logger import get_logger
-
- os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
-
- logger = get_logger()
-
-
- class TestRunCustom(unittest.TestCase):
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_custom_task(self):
-         from evalscope.config import TaskConfig
-
-         task_cfg = TaskConfig(
-             model='Qwen/Qwen3-0.6B',
-             datasets=[
-                 'general_mcq',
-                 'general_qa'
-             ],
-             dataset_args={
-                 'general_mcq': {
-                     'local_path': 'custom_eval/text/mcq', # path to the custom dataset
-                     'subset_list': [
-                         'example' # evaluation dataset name, i.e. the * in the *_dev.csv files above
-                     ],
-                     'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}' # question template
-                 },
-                 'general_qa': {
-                     'local_path': 'custom_eval/text/qa', # path to the custom dataset
-                     'subset_list': [
-                         'example' # evaluation dataset name, i.e. the * in the *_dev.csv files above
-                     ]
-                 }
-             },
-         )
-         res = run_task(task_cfg=task_cfg)
-         print(res)
-
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_local_dataset(self):
-         from evalscope.config import TaskConfig
-
-         task_cfg = TaskConfig(
-             model='qwen-plus',
-             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-             api_key= env.get('DASHSCOPE_API_KEY'),
-             eval_type=EvalType.SERVICE,
-             datasets=[
-                 # 'mmlu',
-                 # 'race',
-                 'trivia_qa',
-                 # 'cmmlu',
-                 # 'humaneval',
-                 # 'gsm8k',
-                 # 'bbh',
-                 # 'competition_math',
-                 # 'arc',
-                 # 'ceval',
-             ],
-             dataset_args={
-                 'mmlu': {
-                     'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
-                     'few_shot_num': 0,
-                     'dataset_id': 'data/data/mmlu',
-                 },
-                 'ceval': {
-                     'subset_list': [
-                         'computer_network', 'operating_system', 'computer_architecture'
-                     ],
-                     'few_shot_num': 0,
-                     'dataset_id': 'data/data/ceval',
-                 },
-                 'cmmlu': {
-                     'subset_list': ['elementary_chinese'],
-                     'dataset_id': 'data/data/cmmlu',
-                     'few_shot_num': 0
-                 },
-                 'bbh': {
-                     'subset_list': ['word_sorting', 'movie_recommendation'],
-                 },
-                 'humaneval': {
-                     'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
-                 },
-                 'trivia_qa': {
-                     'dataset_id': 'data/data/trivia_qa',
-                 },
-             },
-             eval_batch_size=10,
-             limit=5,
-             debug=True,
-             stream=True,
-             generation_config={
-                 'temperature': 0,
-                 'n': 1,
-                 'max_tokens': 4096,
-             },
-             ignore_errors=False,
-         )
-
-         run_task(task_cfg=task_cfg)
-
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_general_no_answer(self):
-         from evalscope.config import TaskConfig
-
-         task_cfg = TaskConfig(
-             model='qwen2.5-7b-instruct',
-             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-             api_key= env.get('DASHSCOPE_API_KEY'),
-             eval_type=EvalType.SERVICE,
-             datasets=[
-                 'general_qa',
-             ],
-             dataset_args={
-                 'general_qa': {
-                     'dataset_id': 'custom_eval/text/qa',
-                     'subset_list': [
-                         'arena',
-                         # 'example'
-                     ],
-                 }
-             },
-             eval_batch_size=10,
-             limit=10,
-             debug=True,
-             stream=True,
-             generation_config={
-                 'temperature': 0,
-                 'n': 1,
-                 'max_tokens': 4096,
-             },
-             ignore_errors=False,
-             judge_model_args={
-                 'model_id': 'qwen2.5-7b-instruct',
-                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                 'api_key': env.get('DASHSCOPE_API_KEY'),
-                 'generation_config': {
-                     'temperature': 0.0,
-                     'max_tokens': 4096
-                 },
-                 'score_type': 'numeric',
-                 'prompt_template': """Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response.
- Begin your evaluation by providing a short explanation. Be as objective as possible.
- After providing your explanation, you must rate the response on a scale of 0 (worst) to 100 (best) by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\"
-
- [Question]
- {question}
-
- [Response]
- {pred}
- """
-             },
-             judge_worker_num=5,
-             judge_strategy=JudgeStrategy.LLM,
-         )
-
-         run_task(task_cfg=task_cfg)
-
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_general_with_answer(self):
-         from evalscope.config import TaskConfig
-
-         task_cfg = TaskConfig(
-             model='qwen-plus',
-             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-             api_key= env.get('DASHSCOPE_API_KEY'),
-             eval_type=EvalType.SERVICE,
-             datasets=[
-                 'general_qa',
-             ],
-             dataset_args={
-                 'general_qa': {
-                     'dataset_id': 'custom_eval/text/qa',
-                     'subset_list': [
-                         'example'
-                     ],
-                 }
-             },
-             eval_batch_size=10,
-             limit=10,
-             debug=True,
-             stream=True,
-             generation_config={
-                 'temperature': 0,
-                 'n': 1,
-                 'max_tokens': 4096,
-             },
-             ignore_errors=False,
-             judge_model_args={
-                 'model_id': 'qwen2.5-72b-instruct',
-                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                 'api_key': env.get('DASHSCOPE_API_KEY'),
-                 'generation_config': {
-                     'temperature': 0.0,
-                     'max_tokens': 4096
-                 },
-                 'score_type': 'pattern',
-             },
-             judge_worker_num=1,
-             judge_strategy=JudgeStrategy.LLM_RECALL,
-             use_cache='outputs/20250818_170420'
-         )
-
-         run_task(task_cfg=task_cfg)
-
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_general_arena(self):
-         from evalscope.config import TaskConfig
-
-         task_cfg = TaskConfig(
-             model_id='Arena',
-             datasets=[
-                 'general_arena',
-             ],
-             dataset_args={
-                 'general_arena': {
-                     'extra_params':{
-                         'models':[
-                             {
-                                 'name': 'qwen2.5-7b',
-                                 'report_path': 'outputs/20250819_165034/reports/qwen2.5-7b-instruct'
-                             },
-                             {
-                                 'name': 'qwen2.5-72b',
-                                 'report_path': 'outputs/20250819_164926/reports/qwen2.5-72b-instruct'
-                             }
-                         ],
-                         'baseline': 'qwen2.5-72b'
-                     }
-                 }
-             },
-             eval_batch_size=10,
-             limit=10,
-             debug=True,
-             stream=True,
-             ignore_errors=False,
-             judge_model_args={
-                 'model_id': 'qwen-plus',
-                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                 'api_key': env.get('DASHSCOPE_API_KEY'),
-                 'generation_config': {
-                     'temperature': 0.0,
-                     'max_tokens': 8000
-                 },
-             },
-             judge_worker_num=5,
-             # use_cache='outputs/20250819_173546'
-         )
-
-         run_task(task_cfg=task_cfg)
tests/perf/__init__.py DELETED
@@ -1 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
tests/perf/test_perf.py DELETED
@@ -1,176 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import os
- from dotenv import dotenv_values
-
- env = dotenv_values('.env')
- os.environ['CUDA_VISIBLE_DEVICES'] = '0'
- import unittest
-
- from evalscope.perf.main import run_perf_benchmark
- from tests.utils import test_level_list
-
-
- class TestPerf(unittest.TestCase):
-
-     def setUp(self) -> None:
-         pass
-
-     def tearDown(self) -> None:
-         pass
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_perf(self):
-         task_cfg = {
-             'url': 'http://127.0.0.1:8001/v1/chat/completions',
-             'parallel': 1,
-             'model': 'qwen2.5',
-             'number': 15,
-             'api': 'openai',
-             'dataset': 'openqa',
-             # 'stream': True,
-             'debug': True,
-         }
-         run_perf_benchmark(task_cfg)
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_perf_stream(self):
-         task_cfg = {
-             'url': 'http://127.0.0.1:8801/v1/chat/completions',
-             'parallel': 1,
-             'model': 'Qwen2.5-0.5B-Instruct',
-             'number': 15,
-             'api': 'openai',
-             'dataset': 'openqa',
-             'stream': True,
-             'debug': True,
-         }
-         run_perf_benchmark(task_cfg)
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_perf_speed_benchmark(self):
-         task_cfg = {
-             'url': 'http://127.0.0.1:8001/v1/completions',
-             'parallel': 1,
-             'model': 'qwen2.5',
-             'api': 'openai',
-             'dataset': 'speed_benchmark',
-             'min_tokens': 2048,
-             'max_tokens': 2048,
-             'debug': True,
-         }
-         run_perf_benchmark(task_cfg)
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_perf_local(self):
-         task_cfg = {
-             'parallel': 1,
-             'model': 'Qwen/Qwen2.5-0.5B-Instruct',
-             'number': 5,
-             'api': 'local',
-             'dataset': 'openqa',
-             'debug': True,
-         }
-         run_perf_benchmark(task_cfg)
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_perf_local_stream(self):
-         task_cfg = {
-             'parallel': 1,
-             'model': 'Qwen/Qwen2.5-0.5B-Instruct',
-             'number': 5,
-             'api': 'local',
-             'dataset': 'openqa',
-             'stream': True,
-             'debug': True,
-         }
-         run_perf_benchmark(task_cfg)
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_perf_local_speed_benchmark(self):
-         task_cfg = {
-             'parallel': 1,
-             'model': 'Qwen/Qwen2.5-0.5B-Instruct',
-             'api': 'local_vllm',
-             'dataset': 'speed_benchmark',
-             'min_tokens': 2048,
-             'max_tokens': 2048,
-             'debug': True,
-         }
-         run_perf_benchmark(task_cfg)
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_perf_local_random(self):
-         from evalscope.perf.arguments import Arguments
-         task_cfg = Arguments(
-             parallel=20,
-             model='Qwen3-1.7B',
-             url='http://127.0.0.1:8801/v1/completions',
-             api='openai',
-             dataset='random',
-             min_tokens=1024,
-             max_tokens=1024,
-             prefix_length=0,
-             min_prompt_length=1024,
-             max_prompt_length=1024,
-             number=20,
-             tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
-             seed=None,
-             extra_args={'ignore_eos': True}
-         )
-         metrics_result, percentile_result = run_perf_benchmark(task_cfg)
-         print(metrics_result)
-         print(percentile_result)
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_perf_multi_parallel(self):
-         from evalscope.perf.arguments import Arguments
-         task_cfg = Arguments(
-             parallel=[1, 2],
-             number=[2, 4],
-             model='qwen2.5-7b-instruct',
-             url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
-             api_key=env.get('DASHSCOPE_API_KEY'),
-             api='openai',
-             dataset='random',
-             min_tokens=100,
-             max_tokens=100,
-             prefix_length=0,
-             min_prompt_length=1024,
-             max_prompt_length=1024,
-             tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
-             seed=None,
-             extra_args={'ignore_eos': True}
-         )
-         metrics_result, percentile_result = run_perf_benchmark(task_cfg)
-         print(metrics_result)
-         print(percentile_result)
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_perf_random_vl(self):
-         from evalscope.perf.arguments import Arguments
-         task_cfg = Arguments(
-             parallel=[1, 2],
-             number=[2, 4],
-             model='qwen-vl-max',
-             url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
-             api_key=env.get('DASHSCOPE_API_KEY'),
-             api='openai',
-             dataset='kontext_bench',
-             min_tokens=100,
-             max_tokens=100,
-             prefix_length=0,
-             min_prompt_length=100,
-             max_prompt_length=100,
-             image_height=512,
-             image_width=512,
-             image_num=2,
-             tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
-             seed=None,
-             extra_args={'ignore_eos': True}
-         )
-         metrics_result, percentile_result = run_perf_benchmark(task_cfg)
-         print(metrics_result)
-         print(percentile_result)
-
- if __name__ == '__main__':
-     unittest.main(buffer=False)
tests/rag/test_clip_benchmark.py DELETED
@@ -1,90 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
-
- import os
-
- # os.environ["CUDA_VISIBLE_DEVICES"] = "1"
- import subprocess
- import unittest
-
- from evalscope.run import run_task
- from evalscope.utils.import_utils import is_module_installed
- from evalscope.utils.logger import get_logger
- from tests.utils import test_level_list
-
- logger = get_logger()
-
-
- class TestCLIPBenchmark(unittest.TestCase):
-
-     def setUp(self) -> None:
-         self._check_env('webdataset')
-
-     def tearDown(self) -> None:
-         pass
-
-     @staticmethod
-     def _check_env(module_name: str):
-         if is_module_installed(module_name):
-             logger.info(f'{module_name} is installed.')
-         else:
-             raise ModuleNotFoundError(f'run: pip install {module_name}')
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_task(self):
-         task_cfg = {
-             'eval_backend': 'RAGEval',
-             'eval_config': {
-                 'tool': 'clip_benchmark',
-                 'eval': {
-                     'models': [
-                         {
-                             'model_name': 'AI-ModelScope/chinese-clip-vit-large-patch14-336px',
-                         }
-                     ],
-                     'dataset_name': [
-                         'muge',
-                         'mnist',
-                         'flickr8k'
-                     ],
-                     'split': 'test',
-                     'batch_size': 128,
-                     'num_workers': 1,
-                     'verbose': True,
-                     'skip_existing': False,
-                     'cache_dir': 'cache',
-                     'limit': 1000,
-                 },
-             },
-         }
-
-         run_task(task_cfg)
-
-     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_custom(self):
-         task_cfg = {
-             'eval_backend': 'RAGEval',
-             'eval_config': {
-                 'tool': 'clip_benchmark',
-                 'eval': {
-                     'models': [
-                         {
-                             'model_name': 'AI-ModelScope/chinese-clip-vit-large-patch14-336px',
-                         }
-                     ],
-                     'dataset_name': ['custom'],
-                     'data_dir': 'custom_eval/multimodal/text-image-retrieval',
-                     'split': 'test',
-                     'batch_size': 128,
-                     'num_workers': 1,
-                     'verbose': True,
-                     'skip_existing': False,
-                     'limit': 1000,
-                 },
-             },
-         }
-
-         run_task(task_cfg)
-
-
- if __name__ == '__main__':
-     unittest.main(buffer=False)