evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/arguments.py CHANGED
@@ -1,7 +1,8 @@
+ # flake8: noqa: E501
  import argparse
  import json

- from evalscope.constants import EvalBackend, EvalStage, EvalType
+ from evalscope.constants import EvalBackend, EvalType, JudgeStrategy, ModelTask


  class ParseStrArgsAction(argparse.Action):
@@ -9,6 +10,15 @@ class ParseStrArgsAction(argparse.Action):
      def __call__(self, parser, namespace, values, option_string=None):
          assert isinstance(values, str), 'args should be a string.'

+         # try json load first
+         try:
+             arg_dict = json.loads(values)
+             setattr(namespace, self.dest, arg_dict)
+             return
+         except (json.JSONDecodeError, ValueError):
+             pass
+
+         # If JSON load fails, fall back to parsing as key=value pairs
          arg_dict = {}
          for arg in values.strip().split(','):
              key, value = map(str.strip, arg.split('=', 1)) # Use maxsplit=1 to handle multiple '='
@@ -35,9 +45,9 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--model', type=str, required=False, help='The model id on modelscope, or local model dir.')
      parser.add_argument('--model-id', type=str, required=False, help='The model id for model name in report.')
      parser.add_argument('--model-args', type=str, action=ParseStrArgsAction, help='The model args, should be a string.')
+     parser.add_argument('--model-task', type=str, default=ModelTask.TEXT_GENERATION, choices=[ModelTask.TEXT_GENERATION, ModelTask.IMAGE_GENERATION], help='The model task for model id.') # noqa: E501

      # Template-related arguments
-     parser.add_argument('--template-type', type=str, required=False, help='Deprecated, will be removed in v1.0.0.')
      parser.add_argument('--chat-template', type=str, required=False, help='The custom jinja template for chat generation.') # noqa: E501

      # Dataset-related arguments
@@ -50,26 +60,39 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.') # noqa: E501

      # Evaluation-related arguments
-     parser.add_argument('--eval-type', type=str, help='The type for evaluating.',
-                         choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
+     parser.add_argument('--eval-type', type=str, help='The type for evaluating.')
      parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
                          choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL]) # noqa: E501
      parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.') # noqa: E501
-     parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
-                         choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
-     parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')
+     parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')
+     parser.add_argument('--limit', type=float, default=None, help='Max evaluation samples num for each subset.')
+     parser.add_argument('--repeats', type=int, default=1, help='Number of times to repeat the dataset items for k-metrics.') # noqa: E501

      # Cache and working directory arguments
-     parser.add_argument('--mem-cache', action='store_true', default=False, help='Deprecated, will be removed in v1.0.0.') # noqa: E501
      parser.add_argument('--use-cache', type=str, help='Path to reuse the cached results.')
+     parser.add_argument('--rerun-review', action='store_true', default=False, help='Rerun the review process when use_cache.')
      parser.add_argument('--work-dir', type=str, help='The root cache dir.')

      # Debug and runtime mode arguments
+     parser.add_argument('--ignore-errors', action='store_true', default=False, help='Ignore errors during evaluation.')
      parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.') # noqa: E501
-     parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
      parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
      parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
      parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
+     parser.add_argument('--timeout', type=float, default=None, help='The timeout for the remote API model.')
+     parser.add_argument('--stream', action='store_true', default=None, help='Stream mode.') # noqa: E501
+
+     # LLMJudge arguments
+     parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
+     parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.') # noqa: E501
+     parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
+     parser.add_argument('--analysis-report', action='store_true', default=False, help='Generate analysis report for the evaluation results using judge model.') # noqa: E501
+
+     # Sandbox-related arguments
+     parser.add_argument('--use-sandbox', action='store_true', default=False, help='Whether to use sandbox for model evaluation.') # noqa: E501
+     parser.add_argument('--sandbox-type', type=str, default='docker', help='The sandbox type to use.') # noqa: E501
+     parser.add_argument('--sandbox-config', type=json.loads, default='{}', help='The sandbox config, should be a json string.') # noqa: E501
+     parser.add_argument('--sandbox-manager-config', type=json.loads, default='{}', help='The sandbox manager config, should be a json string.') # noqa: E501
      # yapf: enable

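Note (not part of the diff): the JSON-first branch added to ParseStrArgsAction means string-valued options such as --model-args and --generation-config now accept either a JSON object or the older comma-separated key=value form. A minimal, self-contained sketch of that parsing behavior, with the key=value fallback simplified to keep all values as strings:

import argparse
import json


class ParseStrArgsAction(argparse.Action):
    """Condensed sketch of the 1.2.0 behavior shown in the hunk above."""

    def __call__(self, parser, namespace, values, option_string=None):
        # try json load first
        try:
            setattr(namespace, self.dest, json.loads(values))
            return
        except (json.JSONDecodeError, ValueError):
            pass
        # fall back to comma-separated key=value pairs (values kept as strings here)
        arg_dict = {}
        for arg in values.strip().split(','):
            key, value = map(str.strip, arg.split('=', 1))
            arg_dict[key] = value
        setattr(namespace, self.dest, arg_dict)


parser = argparse.ArgumentParser()
parser.add_argument('--model-args', type=str, action=ParseStrArgsAction)

as_json = parser.parse_args(['--model-args', '{"revision": "master", "precision": "torch.float16"}'])
as_pairs = parser.parse_args(['--model-args', 'revision=master,precision=torch.float16'])
print(as_json.model_args)   # {'revision': 'master', 'precision': 'torch.float16'}
print(as_pairs.model_args)  # {'revision': 'master', 'precision': 'torch.float16'}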
evalscope/backend/opencompass/api_meta_template.py CHANGED
@@ -49,7 +49,8 @@ register_template(
          reserved_roles=[
              dict(role='SYSTEM', api_role='SYSTEM'),
          ],
-     ))
+     )
+ )

  if __name__ == '__main__':
      res = MetaTemplateType.get_template_name_list()
evalscope/backend/opencompass/backend_manager.py CHANGED
@@ -1,4 +1,5 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os
  import subprocess
  import tempfile
  from dataclasses import asdict
@@ -7,7 +8,8 @@ from typing import Optional, Union

  from evalscope.backend.base import BackendManager
  from evalscope.backend.opencompass.api_meta_template import get_template
- from evalscope.utils import get_module_path, get_valid_list, is_module_installed
+ from evalscope.utils.import_utils import get_module_path, is_module_installed
+ from evalscope.utils.io_utils import get_valid_list
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -45,7 +47,6 @@ class OpenCompassBackendManager(BackendManager):
              datasets: list, the datasets.
              models: list, the models.
              work_dir (Optional): str, the working directory. Default to None, which means the current directory.
-             dry_run (Optional): bool, the dry-run flag. Default to False.
              debug (Optional): bool, the debug flag. Default to False.
              reuse (Optional): str, reuse previous outputs & results. Default to None.
              generation_kwargs (Optional): dict, the generation config. Default to {}.
@@ -138,7 +139,6 @@
              cmd_str = f'python -m run_oc ' \
                        f'--models {" ".join(self.args.models)} ' \
                        f'--datasets {" ".join(self.args.datasets)} ' \
-                       f'{self.get_restore_arg("dry-run", self.args.dry_run)} ' \
                        f'{self.get_arg_with_default("work-dir", self.args.work_dir)}'

          elif cmd_mode == CmdMode.SCRIPT:
@@ -180,8 +180,10 @@
          else:
              valid_dataset_names, invalid_dataset_names = get_valid_list(dataset_names, dataset_names_all)
              if len(invalid_dataset_names) > 0:
-                 logger.error(f'Invalid datasets: {invalid_dataset_names}, '
-                              f'refer to the following list to get proper dataset name: {dataset_names_all}')
+                 logger.error(
+                     f'Invalid datasets: {invalid_dataset_names}, '
+                     f'refer to the following list to get proper dataset name: {dataset_names_all}'
+                 )
              assert len(valid_dataset_names) > 0, f'No valid datasets. ' \
                  f'To get the valid datasets, please refer to {dataset_names_all}'

@@ -204,7 +206,7 @@
                  model_d['meta_template'] = get_template(model_d['meta_template'])

              # set the 'abbr' as the 'path' if 'abbr' is not specified
-             model_d['abbr'] = model_d['path']
+             model_d['abbr'] = os.path.basename(model_d['path'])

              model_config = ApiModelConfig(**model_d)
              models.append(asdict(model_config))
@@ -250,7 +252,8 @@ if __name__ == '__main__':
              'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'
          }],
          'limit': 5
-     })
+     }
+ )
  all_datasets = OpenCompassBackendManager.list_datasets()
  print(f'all_datasets: {all_datasets}')
  oc_backend_manager.run()
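Note (illustration only): the 'abbr' used in OpenCompass model configs is now the basename of the model path, so report entries show a short model name rather than a full local path or org/name id. For hypothetical paths:

import os

for path in ['/data/models/Qwen2.5-7B-Instruct', 'qwen/Qwen2.5-7B-Instruct']:
    print(os.path.basename(path))  # 'Qwen2.5-7B-Instruct' in both cases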
evalscope/backend/rag_eval/__init__.py CHANGED
@@ -1,4 +1,4 @@
- from evalscope.backend.rag_eval.backend_manager import RAGEvalBackendManager
+ from evalscope.backend.rag_eval.backend_manager import RAGEvalBackendManager, Tools
  from evalscope.backend.rag_eval.utils.clip import VisionModel
  from evalscope.backend.rag_eval.utils.embedding import EmbeddingModel
  from evalscope.backend.rag_eval.utils.llm import LLM, ChatOpenAI, LocalLLM
evalscope/backend/rag_eval/backend_manager.py CHANGED
@@ -2,12 +2,19 @@ import os
  from typing import Optional, Union

  from evalscope.backend.base import BackendManager
- from evalscope.utils import get_valid_list, is_module_installed
+ from evalscope.utils.import_utils import is_module_installed
+ from evalscope.utils.io_utils import get_valid_list
  from evalscope.utils.logger import get_logger

  logger = get_logger()


+ class Tools:
+     MTEB = 'mteb'
+     RAGAS = 'ragas'
+     CLIP_BENCHMARK = 'clip_benchmark'
+
+
  class RAGEvalBackendManager(BackendManager):

      def __init__(self, config: Union[str, dict], **kwargs):
@@ -47,9 +54,19 @@ class RAGEvalBackendManager(BackendManager):
          from evalscope.backend.rag_eval.ragas.tasks import generate_testset

          if testset_args is not None:
-             generate_testset(TestsetGenerationArguments(**testset_args))
+             if isinstance(testset_args, dict):
+                 generate_testset(TestsetGenerationArguments(**testset_args))
+             elif isinstance(testset_args, TestsetGenerationArguments):
+                 generate_testset(testset_args)
+             else:
+                 raise ValueError('Please provide the testset generation arguments.')
          if eval_args is not None:
-             rag_eval(EvaluationArguments(**eval_args))
+             if isinstance(eval_args, dict):
+                 rag_eval(EvaluationArguments(**eval_args))
+             elif isinstance(eval_args, EvaluationArguments):
+                 rag_eval(eval_args)
+             else:
+                 raise ValueError('Please provide the evaluation arguments.')

      @staticmethod
      def run_clip_benchmark(args):
@@ -59,17 +76,17 @@

      def run(self, *args, **kwargs):
          tool = self.config_d.pop('tool')
-         if tool.lower() == 'mteb':
+         if tool.lower() == Tools.MTEB:
              self._check_env('mteb')
              model_args = self.config_d['model']
              eval_args = self.config_d['eval']
              self.run_mteb(model_args, eval_args)
-         elif tool.lower() == 'ragas':
+         elif tool.lower() == Tools.RAGAS:
              self._check_env('ragas')
              testset_args = self.config_d.get('testset_generation', None)
              eval_args = self.config_d.get('eval', None)
              self.run_ragas(testset_args, eval_args)
-         elif tool.lower() == 'clip_benchmark':
+         elif tool.lower() == Tools.CLIP_BENCHMARK:
              self._check_env('webdataset')
              self.run_clip_benchmark(self.config_d['eval'])
          else:
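Note (rough sketch, not the exact schema): run() now dispatches on the Tools constants, and the RAGAS helpers accept either plain dicts or the corresponding argument dataclasses. A config of roughly the following shape would select the MTEB path; the 'model' and 'eval' sections are placeholders rather than a complete, validated configuration:

from evalscope.backend.rag_eval import RAGEvalBackendManager, Tools

config = {
    'tool': Tools.MTEB,  # instead of the bare string 'mteb'
    'model': [{'model_name_or_path': 'AI-ModelScope/bge-large-zh'}],  # placeholder model args
    'eval': {'tasks': ['TNews'], 'verbosity': 1},                     # placeholder eval args
}

RAGEvalBackendManager(config=config).run()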
evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py CHANGED
@@ -1,4 +1,5 @@
  import os
+ import posixpath  # For URL path handling
  import torch
  from torch.utils.data import DataLoader
  from torch.utils.data import Dataset as TorchDataset
@@ -99,16 +100,16 @@ class DatasetWrapper(TorchDataset):

  def get_dataset_default_task(dataset):
      if dataset in (
-             'custom',
-             'muge',
-             'flickr30k',
-             'flickr8k',
-             'mscoco_captions',
-             'mscoco_captions2017',
-             'multilingual_mscoco_captions',
-             'flickr30k-200',
-             'crossmodal3600',
-             'xtd200',
+         'custom',
+         'muge',
+         'flickr30k',
+         'flickr8k',
+         'mscoco_captions',
+         'mscoco_captions2017',
+         'multilingual_mscoco_captions',
+         'flickr30k-200',
+         'crossmodal3600',
+         'xtd200',
      ):
          return 'zeroshot_retrieval'
      else:
@@ -186,42 +187,53 @@ def build_wds_dataset(dataset_name, transform, split='test', data_dir='root', ca

      Set `cache_dir` to a path to cache the dataset, otherwise, no caching will occur.
      """
+     import requests
      import webdataset as wds

      def read_txt(fname):
-         if '://' in fname:
-             stream = os.popen("curl -L -s --fail '%s'" % fname, 'r')
-             value = stream.read()
-             if stream.close():
-                 raise FileNotFoundError('Failed to retreive data')
+         if fname.startswith(('http://', 'https://')):
+             try:
+                 response = requests.get(fname)
+                 response.raise_for_status()  # Ensure the HTTP request was successful
+                 return response.text
+             except requests.exceptions.RequestException as e:
+                 raise FileNotFoundError(f'Failed to read {fname}: {e}')
          else:
              with open(fname, 'r') as file:
-                 value = file.read()
-         return value
+                 return file.read()
+
+     def url_path_join(*parts):
+         """Join URL path parts with forward slashes regardless of platform"""
+         return posixpath.join(*parts)

      if not data_dir:
          data_dir = f'https://modelscope.cn/datasets/clip-benchmark/wds_{dataset_name}/resolve/master'

      # Git LFS files have a different file path to access the raw data than other files
-     if data_dir.startswith('https://modelscope.cn/datasets'):
+     is_url = data_dir.startswith(('http://', 'https://'))
+     if is_url and data_dir.startswith('https://modelscope.cn/datasets'):
          *split_url_head, _, url_path = data_dir.split('/', 7)
          url_head = '/'.join(split_url_head)
          metadata_dir = '/'.join([url_head, 'resolve', url_path])
          tardata_dir = '/'.join([url_head, 'resolve', url_path])
      else:
          metadata_dir = tardata_dir = data_dir
+
+     # Use appropriate path joining function based on whether we're dealing with a URL
+     path_join = url_path_join if is_url else os.path.join
+
      # Get number of shards
-     nshards_fname = os.path.join(metadata_dir, split, 'nshards.txt')
+     nshards_fname = path_join(metadata_dir, split, 'nshards.txt')
      nshards = int(read_txt(nshards_fname))  # Do not catch FileNotFound, nshards.txt should be mandatory

      # Get dataset type (classification or retrieval)
-     type_fname = os.path.join(metadata_dir, 'dataset_type.txt')
+     type_fname = path_join(metadata_dir, 'dataset_type.txt')
      try:
          dataset_type = read_txt(type_fname).strip().lower()
      except FileNotFoundError:
          dataset_type = 'classification'

-     filepattern = os.path.join(tardata_dir, split, '{0..%d}.tar' % (nshards - 1))
+     filepattern = path_join(tardata_dir, split, '{0..%d}.tar' % (nshards - 1))
      # Load webdataset (support WEBP, PNG, and JPG for now)
      if not cache_dir or not isinstance(cache_dir, str):
          cache_dir = None
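Note (illustration only): the posixpath-based helper matters when metadata_dir is a URL, because os.path.join would splice backslashes into it on Windows. With a sample shard URL:

import ntpath      # behaves like os.path on Windows
import posixpath   # always joins with forward slashes

base = 'https://modelscope.cn/datasets/clip-benchmark/wds_flickr8k/resolve/master'
print(ntpath.join(base, 'test', 'nshards.txt'))     # ...master\test\nshards.txt (broken URL)
print(posixpath.join(base, 'test', 'nshards.txt'))  # ...master/test/nshards.txt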
evalscope/backend/rag_eval/clip_benchmark/task_template.py CHANGED
@@ -4,8 +4,11 @@ import torch
  from itertools import product

  from evalscope.backend.rag_eval.clip_benchmark.arguments import Arguments
- from evalscope.backend.rag_eval.clip_benchmark.dataset_builder import (build_dataset, get_dataloader,
-                                                                        get_dataset_default_task)
+ from evalscope.backend.rag_eval.clip_benchmark.dataset_builder import (
+     build_dataset,
+     get_dataloader,
+     get_dataset_default_task,
+ )
  from evalscope.backend.rag_eval.clip_benchmark.tasks import image_caption, zeroshot_classification, zeroshot_retrieval
  from evalscope.backend.rag_eval.utils.clip import VisionModel
  from evalscope.utils.logger import get_logger
@@ -66,8 +69,9 @@ def evaluate(args: Arguments):
          if verbose:
              logger.info(f'Zero-shot templates: {zeroshot_templates}')
          classnames = dataset.classes if hasattr(dataset, 'classes') else None
-         assert (zeroshot_templates is not None
-                 and classnames is not None), 'Dataset does not support classification'
+         assert (
+             zeroshot_templates is not None and classnames is not None
+         ), 'Dataset does not support classification'
          metrics = zeroshot_classification.evaluate(
              model,
              dataloader,
@@ -11,7 +11,9 @@ class ModelArguments:
11
11
  pooling_mode: Optional[str] = None
12
12
  max_seq_length: int = 512 # max sequence length
13
13
  # prompt for llm based model
14
- prompt: str = ''
14
+ prompt: Optional[str] = None
15
+ # prompts dictionary for different tasks, if prompt is not set
16
+ prompts: Optional[Dict[str, str]] = None
15
17
  # model kwargs
16
18
  model_kwargs: dict = field(default_factory=dict)
17
19
  # config kwargs
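The new `prompts` field reads as a per-task fallback used when `prompt` is unset. How the lookup key is chosen is not shown in this hunk, so the helper below is a hypothetical sketch of that resolution (task name and prompt text are made up):

```python
from typing import Dict, Optional


def resolve_prompt(prompt: Optional[str], prompts: Optional[Dict[str, str]], task_name: str) -> Optional[str]:
    """Prefer the explicit prompt; otherwise fall back to a per-task entry, if present."""
    if prompt is not None:
        return prompt
    return (prompts or {}).get(task_name)


# e.g. resolve_prompt(None, {'T2Retrieval': 'Given a query, retrieve relevant passages.'}, 'T2Retrieval')
```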
@@ -20,6 +22,12 @@ class ModelArguments:
20
22
  encode_kwargs: dict = field(default_factory=lambda: {'show_progress_bar': True, 'batch_size': 32})
21
23
  hub: str = 'modelscope' # modelscope or huggingface
22
24
 
25
+ # for API embedding model
26
+ model_name: Optional[str] = None
27
+ api_base: Optional[str] = None
28
+ api_key: Optional[str] = None
29
+ dimensions: Optional[int] = None
30
+
23
31
  def to_dict(self) -> Dict[str, Any]:
24
32
  return {
25
33
  'model_name_or_path': self.model_name_or_path,
@@ -27,10 +35,15 @@ class ModelArguments:
27
35
  'pooling_mode': self.pooling_mode,
28
36
  'max_seq_length': self.max_seq_length,
29
37
  'prompt': self.prompt,
38
+ 'prompts': self.prompts,
30
39
  'model_kwargs': self.model_kwargs,
31
40
  'config_kwargs': self.config_kwargs,
32
41
  'encode_kwargs': self.encode_kwargs,
33
42
  'hub': self.hub,
43
+ 'model_name': self.model_name,
44
+ 'api_base': self.api_base,
45
+ 'api_key': self.api_key,
46
+ 'dimensions': self.dimensions,
34
47
  }
35
48
 
36
49
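The four added fields point at support for API-served embedding models. How the backend consumes them is not shown in this diff; the sketch below only illustrates a plausible configuration, with placeholder endpoint, key, and model id:

```python
import os

# Placeholder values; keys mirror the fields added to ModelArguments above.
api_embedding_config = {
    'model_name': 'text-embedding-v3',          # remote model id (placeholder)
    'api_base': 'https://api.example.com/v1',   # OpenAI-compatible endpoint (placeholder)
    'api_key': os.environ.get('EMBEDDING_API_KEY', ''),
    'dimensions': 1024,                         # requested embedding size, if the API supports it
}
```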
 
@@ -1,6 +1,6 @@
1
1
  import mteb
2
2
  import os
3
- from mteb.task_selection import results_to_dataframe
3
+ from tabulate import tabulate
4
4
 
5
5
  from evalscope.backend.rag_eval import EmbeddingModel, cmteb
6
6
  from evalscope.utils.logger import get_logger
@@ -12,14 +12,27 @@ def show_results(output_folder, model, results):
12
12
  model_name = model.mteb_model_meta.model_name_as_path()
13
13
  revision = model.mteb_model_meta.revision
14
14
 
15
- results_df = results_to_dataframe({model_name: {revision: results}})
15
+ data = []
16
+ for model_res in results:
17
+ main_res = model_res.only_main_score()
18
+ for split, score in main_res.scores.items():
19
+ for sub_score in score:
20
+ data.append({
21
+ 'Model': model_name.replace('eval__', ''),
22
+ 'Revision': revision,
23
+ 'Task Type': main_res.task_type,
24
+ 'Task': main_res.task_name,
25
+ 'Split': split,
26
+ 'Subset': sub_score['hf_subset'],
27
+ 'Main Score': sub_score['main_score'],
28
+ })
16
29
 
17
30
  save_path = os.path.join(
18
31
  output_folder,
19
32
  model_name,
20
33
  revision,
21
34
  )
22
- logger.info(f'Evaluation results:\n{results_df.to_markdown()}')
35
+ logger.info(f'Evaluation results:\n{tabulate(data, headers="keys", tablefmt="grid")}')
23
36
  logger.info(f'Evaluation results saved in {os.path.abspath(save_path)}')
24
37
 
25
38
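show_results now flattens per-split main scores into row dicts and prints them with tabulate instead of mteb's removed results_to_dataframe helper. A self-contained rendering sketch with made-up scores:

```python
from tabulate import tabulate

rows = [
    {'Model': 'my-embedding-model', 'Task Type': 'Retrieval', 'Task': 'T2Retrieval',
     'Split': 'dev', 'Subset': 'default', 'Main Score': 0.701},
    {'Model': 'my-embedding-model', 'Task Type': 'Classification', 'Task': 'TNews',
     'Split': 'validation', 'Subset': 'default', 'Main Score': 0.512},
]
# headers='keys' takes the column names from the dict keys, as in the hunk above.
print(tabulate(rows, headers='keys', tablefmt='grid'))
```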
 
@@ -34,6 +47,7 @@ def one_stage_eval(
34
47
  tasks = cmteb.TaskBase.get_tasks(task_names=eval_args['tasks'], dataset_path=custom_dataset_path)
35
48
  evaluation = mteb.MTEB(tasks=tasks)
36
49
 
50
+ eval_args['encode_kwargs'] = model_args.get('encode_kwargs', {})
37
51
  # run evaluation
38
52
  results = evaluation.run(model, **eval_args)
39
53
 
@@ -66,6 +80,7 @@ def two_stage_eval(
66
80
  overwrite_results=True,
67
81
  hub=eval_args['hub'],
68
82
  limits=eval_args['limits'],
83
+ encode_kwargs=model1_args.get('encode_kwargs', {}),
69
84
  )
70
85
  # stage 2: run cross encoder
71
86
  results = evaluation.run(
@@ -77,6 +92,7 @@ def two_stage_eval(
77
92
  overwrite_results=True,
78
93
  hub=eval_args['hub'],
79
94
  limits=eval_args['limits'],
95
+ encode_kwargs=model2_args.get('encode_kwargs', {}),
80
96
  )
81
97
 
82
98
  # save and log results
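Both retrieval stages now forward encode_kwargs from the corresponding model arguments. A minimal sketch of the pass-through, reusing the default shown in the ModelArguments dataclass:

```python
model1_args = {'encode_kwargs': {'show_progress_bar': True, 'batch_size': 32}}

# Falls back to an empty dict when a model config does not set encode_kwargs.
encode_kwargs = model1_args.get('encode_kwargs', {})
# evaluation.run(model, encode_kwargs=encode_kwargs, ...)  # as in the hunks above
```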
@@ -9,7 +9,6 @@ class CustomRetrieval(AbsTaskRetrieval):
9
9
  ignore_identical_ids: bool = True
10
10
 
11
11
  def __init__(self, dataset_path: Optional[str] = 'custom_eval/text/retrieval', **kwargs):
12
- super().__init__(**kwargs)
13
12
  self.metadata = TaskMetadata(
14
13
  name='CustomRetrieval',
15
14
  description='CustomRetrieval Task',
@@ -34,6 +33,7 @@ class CustomRetrieval(AbsTaskRetrieval):
34
33
  bibtex_citation='',
35
34
  descriptive_stats={},
36
35
  )
36
+ super().__init__(**kwargs)
37
37
 
38
38
  def load_data(self, **kwargs):
39
39
  if self.data_loaded:
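Deferring super().__init__(**kwargs) until after self.metadata is assigned only matters if the base initializer reads the metadata, which is an assumption about AbsTaskRetrieval here. A generic illustration of the ordering issue:

```python
class Base:
    def __init__(self):
        # Base initializer that derives state from an attribute the subclass provides.
        self.task_name = self.metadata['name']


class CustomTask(Base):
    def __init__(self):
        self.metadata = {'name': 'CustomRetrieval'}  # must exist before Base.__init__ runs
        super().__init__()


print(CustomTask().task_name)  # -> CustomRetrieval
```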
@@ -21,7 +21,6 @@ class TestsetGenerationArguments:
21
21
  """
22
22
  generator_llm: Dict = field(default_factory=dict)
23
23
  embeddings: Dict = field(default_factory=dict)
24
- distribution: str = field(default_factory=lambda: {'simple': 0.5, 'multi_context': 0.4, 'reasoning': 0.1})
25
24
  # For LLM based evaluation
26
25
  # available: ['english', 'hindi', 'marathi', 'chinese', 'spanish', 'amharic', 'arabic',
27
26
  # 'armenian', 'bulgarian', 'urdu', 'russian', 'polish', 'persian', 'dutch', 'danish',
@@ -34,7 +34,8 @@ def rag_eval(args: EvaluationArguments, ) -> None:
34
34
  target_lang=args.language,
35
35
  llm=LangchainLLMWrapper(llm),
36
36
  adapt_instruction=True,
37
- ))
37
+ )
38
+ )
38
39
  # load dataset
39
40
  dataset = Dataset.from_json(args.testset_file)
40
41
 
@@ -27,7 +27,8 @@ def default_query_distribution(llm: BaseRagasLLM, kg: KnowledgeGraph, language:
27
27
  target_lang=language,
28
28
  llm=llm,
29
29
  adapt_instruction=True,
30
- ))
30
+ )
31
+ )
31
32
 
32
33
  default_queries = [
33
34
  single_hop,
@@ -44,8 +44,9 @@ def default_transforms(
44
44
  return bins
45
45
 
46
46
  def filter_doc_with_num_tokens(node, min_num_tokens=500):
47
- return (node.type == NodeType.DOCUMENT
48
- and num_tokens_from_string(node.properties['page_content']) > min_num_tokens)
47
+ return (
48
+ node.type == NodeType.DOCUMENT and num_tokens_from_string(node.properties['page_content']) > min_num_tokens
49
+ )
49
50
 
50
51
  def filter_docs(node):
51
52
  return node.type == NodeType.DOCUMENT
@@ -90,7 +91,8 @@ def default_transforms(
90
91
  target_lang=language,
91
92
  llm=llm,
92
93
  adapt_instruction=True,
93
- ))
94
+ )
95
+ )
94
96
 
95
97
  transforms = [
96
98
  headline_extractor,
@@ -121,7 +123,8 @@ def default_transforms(
121
123
  target_lang=language,
122
124
  llm=llm,
123
125
  adapt_instruction=True,
124
- ))
126
+ )
127
+ )
125
128
 
126
129
  transforms = [
127
130
  summary_extractor,
@@ -67,9 +67,14 @@ def get_persona(llm, kg, language):
67
67
 
68
68
 
69
69
  def load_data(file_path):
70
- from langchain_community.document_loaders import UnstructuredFileLoader
70
+ import nltk
71
+ from langchain_unstructured import UnstructuredLoader
71
72
 
72
- loader = UnstructuredFileLoader(file_path, mode='single')
73
+ if nltk.data.find('taggers/averaged_perceptron_tagger_eng') is False:
74
+ # need to download nltk data for the first time
75
+ nltk.download('averaged_perceptron_tagger_eng')
76
+
77
+ loader = UnstructuredLoader(file_path)
73
78
  data = loader.load()
74
79
  return data
75
80
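One caveat with the guard above: nltk.data.find raises LookupError when a resource is missing rather than returning False, so the `is False` comparison never triggers the download. A try/except form is the more reliable first-run check:

```python
import nltk


def ensure_perceptron_tagger():
    """Download the English averaged perceptron tagger if it is not installed yet."""
    try:
        nltk.data.find('taggers/averaged_perceptron_tagger_eng')
    except LookupError:
        nltk.download('averaged_perceptron_tagger_eng')
```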
 
@@ -108,7 +113,8 @@ def generate_testset(args: TestsetGenerationArguments) -> None:
108
113
 
109
114
  # generate testset
110
115
  generator = TestsetGenerator(
111
- llm=wrapped_llm, embedding_model=wrapped_embeddings, knowledge_graph=knowledge_graph, persona_list=persona_list)
116
+ llm=wrapped_llm, embedding_model=wrapped_embeddings, knowledge_graph=knowledge_graph, persona_list=persona_list
117
+ )
112
118
 
113
119
  testset = generator.generate(
114
120
  testset_size=args.test_size,
@@ -2,7 +2,6 @@ import asyncio
2
2
  import os
3
3
  from ragas.llms import BaseRagasLLM
4
4
  from ragas.prompt import PromptMixin, PydanticPrompt
5
- from ragas.utils import RAGAS_SUPPORTED_LANGUAGE_CODES
6
5
  from typing import List
7
6
 
8
7
  from evalscope.utils.logger import get_logger
@@ -16,10 +15,6 @@ async def translate_prompt(
16
15
  llm: BaseRagasLLM,
17
16
  adapt_instruction: bool = False,
18
17
  ):
19
- if target_lang not in RAGAS_SUPPORTED_LANGUAGE_CODES:
20
- logger.warning(f'{target_lang} is not in supported language: {list(RAGAS_SUPPORTED_LANGUAGE_CODES)}')
21
- return
22
-
23
18
  if not issubclass(type(prompt_user), PromptMixin):
24
19
  logger.info(f"{prompt_user} is not a PromptMixin, don't translate it")
25
20
  return
@@ -39,7 +34,8 @@ async def translate_prompt(
39
34
 
40
35
  logger.info(f'Translating prompts to {target_lang}')
41
36
  adapted_prompts = await prompt_user.adapt_prompts(
42
- language=target_lang, llm=llm, adapt_instruction=adapt_instruction)
37
+ language=target_lang, llm=llm, adapt_instruction=adapt_instruction
38
+ )
43
39
  prompt_user.set_prompts(**adapted_prompts)
44
40
  try:
45
41
  prompt_user.save_prompts(prompt_dir)
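The adapt/set/save calls above are the heart of the prompt-translation step. The condensed helper below is a hypothetical restructuring of that flow; the function name and argument order are mine, while the ragas calls mirror the hunk:

```python
from ragas.llms import BaseRagasLLM
from ragas.prompt import PromptMixin


async def adapt_and_save(prompt_user, target_lang: str, llm: BaseRagasLLM, prompt_dir: str,
                         adapt_instruction: bool = False) -> None:
    # Objects without adaptable prompts are skipped, as in the original function.
    if not isinstance(prompt_user, PromptMixin):
        return
    adapted = await prompt_user.adapt_prompts(
        language=target_lang, llm=llm, adapt_instruction=adapt_instruction
    )
    prompt_user.set_prompts(**adapted)
    try:
        # Persisting the translated prompts is best-effort.
        prompt_user.save_prompts(prompt_dir)
    except Exception:
        pass
```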