evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (606)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +11 -0
  3. evalscope/api/benchmark/adapters/__init__.py +7 -0
  4. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
  6. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  7. evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
  8. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  9. evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
  10. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  11. evalscope/api/benchmark/benchmark.py +404 -0
  12. evalscope/api/benchmark/meta.py +124 -0
  13. evalscope/api/dataset/__init__.py +2 -0
  14. evalscope/api/dataset/dataset.py +370 -0
  15. evalscope/api/dataset/loader.py +266 -0
  16. evalscope/api/dataset/utils.py +143 -0
  17. evalscope/api/evaluator/__init__.py +3 -0
  18. evalscope/api/evaluator/cache.py +382 -0
  19. evalscope/api/evaluator/evaluator.py +61 -0
  20. evalscope/api/evaluator/state.py +280 -0
  21. evalscope/api/filter/__init__.py +1 -0
  22. evalscope/api/filter/filter.py +72 -0
  23. evalscope/api/messages/__init__.py +12 -0
  24. evalscope/api/messages/chat_message.py +248 -0
  25. evalscope/api/messages/content.py +102 -0
  26. evalscope/api/messages/utils.py +35 -0
  27. evalscope/api/metric/__init__.py +2 -0
  28. evalscope/api/metric/metric.py +60 -0
  29. evalscope/api/metric/scorer.py +113 -0
  30. evalscope/api/mixin/__init__.py +2 -0
  31. evalscope/api/mixin/llm_judge_mixin.py +170 -0
  32. evalscope/api/mixin/sandbox_mixin.py +182 -0
  33. evalscope/api/model/__init__.py +12 -0
  34. evalscope/api/model/generate_config.py +161 -0
  35. evalscope/api/model/model.py +386 -0
  36. evalscope/api/model/model_output.py +285 -0
  37. evalscope/api/registry.py +182 -0
  38. evalscope/api/tool/__init__.py +3 -0
  39. evalscope/api/tool/tool_call.py +101 -0
  40. evalscope/api/tool/tool_info.py +173 -0
  41. evalscope/api/tool/utils.py +64 -0
  42. evalscope/app/__init__.py +28 -0
  43. evalscope/app/app.py +38 -0
  44. evalscope/app/arguments.py +11 -0
  45. evalscope/app/constants.py +22 -0
  46. evalscope/app/ui/__init__.py +20 -0
  47. evalscope/app/ui/app_ui.py +53 -0
  48. evalscope/app/ui/multi_model.py +353 -0
  49. evalscope/app/ui/sidebar.py +42 -0
  50. evalscope/app/ui/single_model.py +220 -0
  51. evalscope/app/ui/visualization.py +36 -0
  52. evalscope/app/utils/data_utils.py +195 -0
  53. evalscope/app/utils/env_utils.py +12 -0
  54. evalscope/app/utils/localization.py +221 -0
  55. evalscope/app/utils/text_utils.py +119 -0
  56. evalscope/app/utils/visualization.py +96 -0
  57. evalscope/arguments.py +32 -9
  58. evalscope/backend/opencompass/api_meta_template.py +2 -1
  59. evalscope/backend/opencompass/backend_manager.py +10 -7
  60. evalscope/backend/rag_eval/__init__.py +1 -1
  61. evalscope/backend/rag_eval/backend_manager.py +23 -6
  62. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
  63. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  64. evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
  65. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  66. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  67. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  68. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  69. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  70. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  71. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
  72. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
  73. evalscope/backend/rag_eval/utils/embedding.py +125 -32
  74. evalscope/backend/rag_eval/utils/llm.py +16 -16
  75. evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
  76. evalscope/benchmarks/__init__.py +17 -5
  77. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  78. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  79. evalscope/benchmarks/ai2d/__init__.py +0 -0
  80. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  81. evalscope/benchmarks/aime/__init__.py +0 -0
  82. evalscope/benchmarks/aime/aime24_adapter.py +55 -0
  83. evalscope/benchmarks/aime/aime25_adapter.py +181 -0
  84. evalscope/benchmarks/aime/grader.py +307 -0
  85. evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
  86. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  87. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  88. evalscope/benchmarks/amc/__init__.py +0 -0
  89. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  90. evalscope/benchmarks/arc/arc_adapter.py +34 -149
  91. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  92. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
  93. evalscope/benchmarks/arena_hard/utils.py +186 -0
  94. evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
  95. evalscope/benchmarks/bfcl/__init__.py +0 -0
  96. evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
  97. evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
  98. evalscope/benchmarks/bfcl/v3/generation.py +222 -0
  99. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  100. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  101. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  102. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  103. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  104. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  105. evalscope/benchmarks/blink/__init__.py +0 -0
  106. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  107. evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
  108. evalscope/benchmarks/chartqa/__init__.py +0 -0
  109. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  110. evalscope/benchmarks/chartqa/utils.py +38 -0
  111. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  112. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
  113. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
  114. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  115. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  116. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  117. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  118. evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
  119. evalscope/benchmarks/data_collection/__init__.py +0 -0
  120. evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
  121. evalscope/benchmarks/docmath/__init__.py +0 -0
  122. evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  123. evalscope/benchmarks/docmath/utils.py +219 -0
  124. evalscope/benchmarks/docvqa/__init__.py +0 -0
  125. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  126. evalscope/benchmarks/drivelology/__init__.py +0 -0
  127. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  128. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  129. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  130. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  131. evalscope/benchmarks/drop/__init__.py +0 -0
  132. evalscope/benchmarks/drop/drop_adapter.py +155 -0
  133. evalscope/benchmarks/drop/utils.py +156 -0
  134. evalscope/benchmarks/frames/__init__.py +0 -0
  135. evalscope/benchmarks/frames/frames_adapter.py +175 -0
  136. evalscope/benchmarks/frames/utils.py +37 -0
  137. evalscope/benchmarks/general_arena/__init__.py +0 -0
  138. evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
  139. evalscope/benchmarks/general_arena/utils.py +223 -0
  140. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  141. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  142. evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
  143. evalscope/benchmarks/gpqa/__init__.py +0 -0
  144. evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  145. evalscope/benchmarks/gpqa/prompt.py +88 -0
  146. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
  147. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  148. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  149. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  150. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  151. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  152. evalscope/benchmarks/healthbench/__init__.py +0 -0
  153. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  154. evalscope/benchmarks/healthbench/utils.py +102 -0
  155. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
  156. evalscope/benchmarks/hle/__init__.py +0 -0
  157. evalscope/benchmarks/hle/hle_adapter.py +153 -0
  158. evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
  159. evalscope/benchmarks/humaneval/utils.py +235 -0
  160. evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
  161. evalscope/benchmarks/ifeval/instructions.py +112 -68
  162. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  163. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  164. evalscope/benchmarks/ifeval/utils.py +6 -7
  165. evalscope/benchmarks/image_edit/__init__.py +0 -0
  166. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  167. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  168. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  169. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  170. evalscope/benchmarks/infovqa/__init__.py +0 -0
  171. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  172. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
  173. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  174. evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
  175. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  176. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
  177. evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
  178. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  179. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  180. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  181. evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
  182. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  183. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  184. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  185. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  186. evalscope/benchmarks/math_500/__init__.py +0 -0
  187. evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
  188. evalscope/benchmarks/math_qa/__init__.py +0 -0
  189. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  190. evalscope/benchmarks/math_verse/__init__.py +0 -0
  191. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  192. evalscope/benchmarks/math_vision/__init__.py +0 -0
  193. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  194. evalscope/benchmarks/math_vista/__init__.py +0 -0
  195. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  196. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  197. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  198. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  199. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  200. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  201. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  202. evalscope/benchmarks/mm_star/__init__.py +0 -0
  203. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  204. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
  205. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
  206. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  207. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  208. evalscope/benchmarks/mmmu/__init__.py +0 -0
  209. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  210. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  211. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  212. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  213. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  214. evalscope/benchmarks/multi_if/__init__.py +0 -0
  215. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  216. evalscope/benchmarks/multi_if/metrics.py +120 -0
  217. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  218. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  219. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  220. evalscope/benchmarks/musr/__init__.py +0 -0
  221. evalscope/benchmarks/musr/musr_adapter.py +43 -0
  222. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  223. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
  224. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  225. evalscope/benchmarks/ner/__init__.py +0 -0
  226. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  227. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  228. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  229. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  230. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  231. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  232. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  233. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  234. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  235. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  236. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  237. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  238. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  239. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  240. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  241. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  242. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  243. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  244. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  245. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  246. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  247. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  248. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  249. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  250. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  251. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  252. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  253. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  254. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  255. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  256. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  257. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  258. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  259. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  260. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  261. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  262. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  263. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  264. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  265. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  266. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  267. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  268. evalscope/benchmarks/piqa/__init__.py +0 -0
  269. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  270. evalscope/benchmarks/poly_math/__init__.py +0 -0
  271. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  272. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  273. evalscope/benchmarks/pope/__init__.py +0 -0
  274. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  275. evalscope/benchmarks/process_bench/__init__.py +0 -0
  276. evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
  277. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  278. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  279. evalscope/benchmarks/qasc/__init__.py +0 -0
  280. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  281. evalscope/benchmarks/race/race_adapter.py +33 -120
  282. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  283. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  284. evalscope/benchmarks/sciq/__init__.py +0 -0
  285. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  286. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  287. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  288. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  289. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
  290. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  291. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  292. evalscope/benchmarks/siqa/__init__.py +0 -0
  293. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  294. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  295. evalscope/benchmarks/super_gpqa/prompt.py +88 -0
  296. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  297. evalscope/benchmarks/super_gpqa/utils.py +86 -0
  298. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  299. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  300. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  301. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  302. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  303. evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
  304. evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
  305. evalscope/benchmarks/text2image/__init__.py +0 -0
  306. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  307. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  308. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  309. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  310. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  311. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  312. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  313. evalscope/benchmarks/tool_bench/utils.py +203 -0
  314. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
  315. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
  316. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  317. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  318. evalscope/benchmarks/winogrande/__init__.py +0 -0
  319. evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  320. evalscope/benchmarks/wmt/__init__.py +0 -0
  321. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  322. evalscope/benchmarks/zerobench/__init__.py +0 -0
  323. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  324. evalscope/cli/cli.py +2 -0
  325. evalscope/cli/start_app.py +12 -2
  326. evalscope/cli/start_eval.py +4 -3
  327. evalscope/cli/start_perf.py +10 -2
  328. evalscope/cli/start_server.py +6 -3
  329. evalscope/collections/__init__.py +27 -3
  330. evalscope/collections/sampler.py +12 -11
  331. evalscope/collections/schema.py +13 -12
  332. evalscope/config.py +218 -147
  333. evalscope/constants.py +78 -82
  334. evalscope/evaluator/__init__.py +1 -1
  335. evalscope/evaluator/evaluator.py +334 -318
  336. evalscope/filters/__init__.py +2 -0
  337. evalscope/filters/extraction.py +126 -0
  338. evalscope/filters/selection.py +57 -0
  339. evalscope/metrics/__init__.py +59 -3
  340. evalscope/metrics/bert_score/__init__.py +0 -0
  341. evalscope/metrics/bert_score/scorer.py +338 -0
  342. evalscope/metrics/bert_score/utils.py +697 -0
  343. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  344. evalscope/metrics/llm_judge.py +211 -0
  345. evalscope/metrics/math_parser.py +545 -0
  346. evalscope/metrics/metric.py +611 -0
  347. evalscope/metrics/metrics.py +112 -23
  348. evalscope/metrics/rouge_metric.py +11 -13
  349. evalscope/metrics/t2v_metrics/__init__.py +0 -0
  350. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  351. evalscope/metrics/t2v_metrics/constants.py +12 -0
  352. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  353. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  354. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  355. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  356. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  357. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
  358. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
  359. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
  360. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
  361. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
  362. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  363. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  364. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
  365. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
  366. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
  367. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  368. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
  369. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
  370. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  371. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  372. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  373. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  374. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  375. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
  376. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  377. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
  378. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  379. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
  380. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
  381. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  382. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  383. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  384. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
  385. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
  386. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
  387. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
  388. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  389. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  390. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
  391. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
  392. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  393. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  394. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  395. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  396. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  397. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  398. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  399. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  400. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  401. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  402. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  403. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  404. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  405. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  406. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  407. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  408. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  409. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  410. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  411. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  412. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  413. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  414. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  415. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
  416. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  417. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
  418. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  419. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  420. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  421. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
  422. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
  423. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
  424. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
  425. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
  426. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
  427. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
  428. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
  429. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  430. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  431. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  432. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
  433. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
  434. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
  435. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  436. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
  437. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
  438. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
  439. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
  440. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
  441. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
  442. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
  443. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  444. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  445. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
  446. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
  447. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
  448. evalscope/metrics/t2v_metrics/score.py +78 -0
  449. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  450. evalscope/models/__init__.py +23 -13
  451. evalscope/models/image_edit_model.py +125 -0
  452. evalscope/models/mockllm.py +65 -0
  453. evalscope/models/model_apis.py +69 -0
  454. evalscope/models/modelscope.py +455 -0
  455. evalscope/models/openai_compatible.py +144 -0
  456. evalscope/models/text2image_model.py +124 -0
  457. evalscope/models/utils/openai.py +708 -0
  458. evalscope/perf/__init__.py +0 -1
  459. evalscope/perf/arguments.py +103 -69
  460. evalscope/perf/benchmark.py +114 -163
  461. evalscope/perf/http_client.py +59 -89
  462. evalscope/perf/main.py +91 -18
  463. evalscope/perf/plugin/__init__.py +3 -2
  464. evalscope/perf/plugin/api/__init__.py +4 -3
  465. evalscope/perf/plugin/api/base.py +27 -7
  466. evalscope/perf/plugin/api/custom_api.py +170 -57
  467. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  468. evalscope/perf/plugin/api/default_api.py +214 -0
  469. evalscope/perf/plugin/api/openai_api.py +120 -41
  470. evalscope/perf/plugin/datasets/__init__.py +10 -6
  471. evalscope/perf/plugin/datasets/base.py +43 -1
  472. evalscope/perf/plugin/datasets/custom.py +22 -3
  473. evalscope/perf/plugin/datasets/flickr8k.py +5 -27
  474. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  475. evalscope/perf/plugin/datasets/line_by_line.py +7 -3
  476. evalscope/perf/plugin/datasets/longalpaca.py +7 -3
  477. evalscope/perf/plugin/datasets/openqa.py +13 -14
  478. evalscope/perf/plugin/datasets/random_dataset.py +67 -0
  479. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  480. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  481. evalscope/perf/plugin/registry.py +36 -16
  482. evalscope/perf/utils/analysis_result.py +24 -23
  483. evalscope/perf/utils/benchmark_util.py +95 -55
  484. evalscope/perf/utils/db_util.py +115 -78
  485. evalscope/perf/utils/local_server.py +12 -47
  486. evalscope/perf/utils/log_utils.py +63 -0
  487. evalscope/perf/utils/rich_display.py +192 -0
  488. evalscope/report/__init__.py +46 -3
  489. evalscope/report/combinator.py +143 -32
  490. evalscope/report/generator.py +74 -34
  491. evalscope/report/report.py +238 -0
  492. evalscope/run.py +71 -46
  493. evalscope/summarizer.py +5 -5
  494. evalscope/third_party/longbench_write/infer.py +1 -1
  495. evalscope/third_party/thinkbench/__init__.py +3 -0
  496. evalscope/third_party/thinkbench/eval.py +441 -0
  497. evalscope/third_party/thinkbench/infer.py +130 -0
  498. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  499. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  500. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  501. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  502. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  503. evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
  504. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  505. evalscope/utils/__init__.py +82 -2
  506. evalscope/utils/argument_utils.py +64 -0
  507. evalscope/utils/chat_service.py +8 -6
  508. evalscope/utils/deprecation_utils.py +53 -0
  509. evalscope/utils/function_utils.py +266 -0
  510. evalscope/utils/import_utils.py +154 -0
  511. evalscope/utils/io_utils.py +336 -8
  512. evalscope/utils/json_schema.py +231 -0
  513. evalscope/utils/logger.py +121 -31
  514. evalscope/utils/model_utils.py +57 -1
  515. evalscope/utils/multi_choices.py +303 -0
  516. evalscope/utils/ner.py +377 -0
  517. evalscope/utils/url_utils.py +65 -0
  518. evalscope/version.py +2 -2
  519. evalscope-1.2.0.dist-info/METADATA +553 -0
  520. evalscope-1.2.0.dist-info/RECORD +628 -0
  521. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  522. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  523. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  524. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  525. evalscope/benchmarks/benchmark.py +0 -76
  526. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  527. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  528. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  529. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  530. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  531. evalscope/benchmarks/data_adapter.py +0 -291
  532. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  533. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  534. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  535. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  536. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  537. evalscope/benchmarks/race/race.py +0 -104
  538. evalscope/benchmarks/race/samples.jsonl +0 -5
  539. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  540. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  541. evalscope/collections/evaluator.py +0 -198
  542. evalscope/evaluator/rating_eval.py +0 -157
  543. evalscope/evaluator/reviewer/__init__.py +0 -1
  544. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  545. evalscope/metrics/code_metric.py +0 -98
  546. evalscope/metrics/named_metrics.py +0 -17
  547. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  548. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  549. evalscope/models/base_adapter.py +0 -52
  550. evalscope/models/chat_adapter.py +0 -138
  551. evalscope/models/choice_adapter.py +0 -211
  552. evalscope/models/custom/__init__.py +0 -3
  553. evalscope/models/custom/custom_model.py +0 -53
  554. evalscope/models/custom/dummy_model.py +0 -63
  555. evalscope/models/custom_adapter.py +0 -67
  556. evalscope/models/local_model.py +0 -74
  557. evalscope/models/model.py +0 -229
  558. evalscope/models/server_adapter.py +0 -111
  559. evalscope/registry/__init__.py +0 -1
  560. evalscope/registry/config/cfg_arena.yaml +0 -77
  561. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  562. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  563. evalscope/registry/config/cfg_single.yaml +0 -78
  564. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  565. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  566. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  567. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  568. evalscope/registry/data/question.jsonl +0 -80
  569. evalscope/registry/tasks/arc.yaml +0 -28
  570. evalscope/registry/tasks/bbh.yaml +0 -26
  571. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  572. evalscope/registry/tasks/ceval.yaml +0 -27
  573. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  574. evalscope/registry/tasks/cmmlu.yaml +0 -27
  575. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  576. evalscope/registry/tasks/general_qa.yaml +0 -27
  577. evalscope/registry/tasks/gsm8k.yaml +0 -29
  578. evalscope/registry/tasks/mmlu.yaml +0 -29
  579. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  580. evalscope/report/app.py +0 -506
  581. evalscope/report/utils.py +0 -133
  582. evalscope/run_arena.py +0 -202
  583. evalscope/utils/arena_utils.py +0 -217
  584. evalscope/utils/completion_parsers.py +0 -82
  585. evalscope/utils/utils.py +0 -301
  586. evalscope-0.10.0.dist-info/METADATA +0 -565
  587. evalscope-0.10.0.dist-info/RECORD +0 -286
  588. tests/__init__.py +0 -1
  589. tests/cli/__init__.py +0 -1
  590. tests/cli/test_collection.py +0 -57
  591. tests/cli/test_run.py +0 -165
  592. tests/perf/__init__.py +0 -1
  593. tests/perf/test_perf.py +0 -101
  594. tests/rag/test_clip_benchmark.py +0 -85
  595. tests/rag/test_mteb.py +0 -138
  596. tests/rag/test_ragas.py +0 -120
  597. tests/swift/__init__.py +0 -1
  598. tests/swift/test_run_swift_eval.py +0 -145
  599. tests/swift/test_run_swift_vlm_eval.py +0 -127
  600. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
  601. tests/test_run_all.py +0 -12
  602. tests/vlm/__init__.py +0 -1
  603. tests/vlm/test_vlmeval.py +0 -60
  604. {tests/rag → evalscope/api}/__init__.py +0 -0
  605. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  606. {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/api/messages/content.py
@@ -0,0 +1,102 @@
+ from pydantic import BaseModel, Field, JsonValue
+ from typing import Dict, Literal, Optional, Sequence, Union
+
+
+ class ContentBase(BaseModel):
+     internal: Optional[JsonValue] = Field(default=None)
+     """Model provider specific payload - typically used to aid transformation back to model types."""
+
+
+ class ContentText(ContentBase):
+     """Text content."""
+
+     type: Literal['text'] = Field(default='text')
+     """Type."""
+
+     text: str
+     """Text content."""
+
+     refusal: Optional[bool] = Field(default=None)
+     """Was this a refusal message?"""
+
+
+ class ContentReasoning(ContentBase):
+     """Reasoning content.
+
+     See the specification for [thinking blocks](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks) for Claude models.
+     """  # noqa: E501
+
+     type: Literal['reasoning'] = Field(default='reasoning')
+     """Type."""
+
+     reasoning: str
+     """Reasoning content."""
+
+     signature: Optional[str] = Field(default=None)
+     """Signature for reasoning content (used by some models to ensure that reasoning content is not modified for replay)"""  # noqa: E501
+
+     redacted: bool = Field(default=False)
+     """Indicates that the explicit content of this reasoning block has been redacted."""
+
+
+ class ContentImage(ContentBase):
+     """Image content."""
+
+     type: Literal['image'] = Field(default='image')
+     """Type."""
+
+     image: str
+     """Either a URL of the image or the base64 encoded image data."""
+
+     detail: Literal['auto', 'low', 'high'] = Field(default='auto')
+     """Specifies the detail level of the image.
+
+     Currently only supported for OpenAI. Learn more in the [Vision guide](https://platform.openai.com/docs/guides/vision/low-or-high-fidelity-image-understanding).
+     """  # noqa: E501
+
+
+ class ContentAudio(ContentBase):
+     """Audio content."""
+
+     type: Literal['audio'] = Field(default='audio')
+     """Type."""
+
+     audio: str
+     """Audio file path or base64 encoded data URL."""
+
+     format: Literal['wav', 'mp3']
+     """Format of audio data ('mp3' or 'wav')"""
+
+
+ class ContentVideo(ContentBase):
+     """Video content."""
+
+     type: Literal['video'] = Field(default='video')
+     """Type."""
+
+     video: str
+     """Video file path or base64 encoded data URL."""
+
+     format: Literal['mp4', 'mpeg', 'mov']
+     """Format of video data ('mp4', 'mpeg', or 'mov')"""
+
+
+ class ContentData(ContentBase):
+     """Model internal."""
+
+     type: Literal['data'] = Field(default='data')
+     """Type."""
+
+     data: Dict[str, JsonValue]
+     """Model provider specific payload - required for internal content."""
+
+
+ Content = Union[
+     ContentText,
+     ContentReasoning,
+     ContentImage,
+     ContentAudio,
+     ContentVideo,
+     ContentData,
+ ]
+ """Content sent to or received from a model."""
evalscope/api/messages/utils.py
@@ -0,0 +1,35 @@
+ import re
+ from typing import Optional
+
+ from .content import ContentReasoning
+
+
+ def parse_content_with_reasoning(content: str) -> tuple[str, Optional[ContentReasoning]]:
+     """
+     Looks for and extracts <think/> tags into reasoning text.
+
+     Returns a tuple:
+     - The first element is the input content with the <think> tag and its contents fully removed.
+     - The second element is a ContentReasoning object (or None if no <think> tag is found).
+     """
+     # Match <think> tag with optional attributes anywhere in the string
+     pattern = (r'<think(?:\s+signature="([^"]*)")?(?:\s+redacted="(true)")?\s*>(.*?)</think>')
+     match = re.search(pattern, content, re.DOTALL)
+
+     if match:
+         signature = match.group(1)  # This will be None if not present
+         redacted_value = match.group(2)  # This will be "true" or None
+         reasoning = match.group(3).strip()
+         # Remove the matched <think>...</think> from the input
+         start, end = match.span()
+
+         return (
+             (content[:start] + content[end:]).strip(),
+             ContentReasoning(
+                 reasoning=reasoning,
+                 signature=signature,
+                 redacted=redacted_value == 'true',
+             ),
+         )
+     else:
+         return content, None
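
A quick usage sketch for the parser above (editorial aside; the expected outputs follow directly from the regex in the function):

    from evalscope.api.messages.utils import parse_content_with_reasoning

    text = '<think signature="abc">The answer must be 4.</think>2 + 2 = 4'
    answer, reasoning = parse_content_with_reasoning(text)
    print(answer)               # '2 + 2 = 4'
    print(reasoning.reasoning)  # 'The answer must be 4.'
    print(reasoning.signature)  # 'abc'
    print(reasoning.redacted)   # False (no redacted="true" attribute)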
evalscope/api/metric/__init__.py
@@ -0,0 +1,2 @@
+ from .metric import Metric, SingletonMetric, T2IMetric
+ from .scorer import Aggregator, AggScore, SampleScore, Score, Value
evalscope/api/metric/metric.py
@@ -0,0 +1,60 @@
+ from abc import ABC, abstractmethod
+ from typing import Callable, Iterable, List, Union
+
+ from evalscope.utils import get_logger
+ from evalscope.utils.function_utils import thread_safe
+
+ logger = get_logger()
+
+
+ class Metric(ABC):
+     """
+     Metric classes operate on a sample level.
+     """
+
+     def __init__(self, *args, **kwargs) -> None:
+         """
+         Can define custom behavior here, if an individual instantiation of a Metric class should have state.
+         """
+
+     @abstractmethod
+     def apply(self, predictions: List[str], references: List[str]) -> List[float]:
+         pass
+
+     def __call__(self, prediction: str, reference: str) -> float:
+         """
+         Allows the metric to be called like a function.
+         """
+         return self.apply([prediction], [reference])[0]
+
+
+ class SingletonMetric(Metric):
+     """Singleton base class for metrics."""
+     _instance = None
+
+     @thread_safe
+     def __new__(cls, *args, **kwargs):
+         if cls._instance is None:
+             cls._instance = super().__new__(cls)
+         return cls._instance
+
+     def __init__(self, *args, **kwargs):
+         cls = self.__class__
+         if hasattr(self, '_init_done'):
+             return
+         logger.info(f'Initializing {cls.__name__}...')
+         self._init_once(*args, **kwargs)
+         self._init_done = True
+
+     def _init_once(self, *args, **kwargs):
+         pass
+
+
+ class T2IMetric(SingletonMetric):
+     """Singleton base class for T2I metrics."""
+
+     def apply(self, images: List[str], texts: List[str], **kwargs) -> List[Union[float, dict]]:
+         pass
+
+     def __call__(self, image: str, text: str, **kwargs) -> Union[float, dict]:
+         return self.apply([image], [text], **kwargs)[0]
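
A minimal sketch of subclassing the `Metric` base class above (editorial aside; `ExactMatch` is a hypothetical metric, not one shipped by the package):

    from typing import List

    from evalscope.api.metric import Metric

    class ExactMatch(Metric):
        # Hypothetical example metric: 1.0 when prediction equals reference.
        def apply(self, predictions: List[str], references: List[str]) -> List[float]:
            return [float(p.strip() == r.strip()) for p, r in zip(predictions, references)]

    metric = ExactMatch()
    print(metric('Paris', 'Paris'))              # 1.0, via Metric.__call__
    print(metric.apply(['a', 'b'], ['a', 'c']))  # [1.0, 0.0]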
evalscope/api/metric/scorer.py
@@ -0,0 +1,113 @@
+ from pydantic import BaseModel, Field
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ Value = Dict[str, Union[int, float, bool]]
+
+
+ class Score(BaseModel):
+     """Score generated by a scorer."""
+
+     value: Value = Field(default_factory=dict)
+     """Score value as a dictionary. Key is the score name, value is the score value.
+     The first key is considered the main score by default."""
+
+     extracted_prediction: Optional[str] = Field(default=None)
+     """Answer extracted from model output (optional)"""
+
+     prediction: Optional[str] = Field(default=None)
+     """Original prediction text from the model (optional)"""
+
+     explanation: Optional[str] = Field(default=None)
+     """Explanation of score (optional)."""
+
+     metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)
+     """Additional metadata related to the score"""
+
+     main_score_name: Optional[str] = Field(default=None)
+     """Main score name, if applicable. This is used to indicate which score is the primary score in a multi-score scenario."""  # noqa: E501
+
+     @property
+     def main_value(self) -> Union[int, float, bool]:
+         """Main score value."""
+         if self.main_score_name and self.main_score_name in self.value:
+             return self.value[self.main_score_name]
+         elif self.value:
+             # If main_score_name is not set or not found, use the first value and update main_score_name
+             first_key = next(iter(self.value))
+             self.main_score_name = first_key
+             return self.value[first_key]
+         return None
+
+     @main_value.setter
+     def main_value(self, value: Union[int, float, bool]):
+         """Set the main score value."""
+         if self.main_score_name:
+             # If main_score_name is already set, use it
+             self.value[self.main_score_name] = value
+         elif self.value:
+             # If no main_score_name but value dict exists, use the first key
+             first_key = next(iter(self.value))
+             self.main_score_name = first_key
+             self.value[first_key] = value
+         else:
+             # If neither main_score_name nor value dict exists, initialize both
+             self.main_score_name = 'default'
+             self.value[self.main_score_name] = value
+
+
+ class SampleScore(BaseModel):
+     """Score for a Sample."""
+
+     score: Score
+     """A score"""
+
+     sample_id: Optional[Union[str, int]] = Field(default=None)
+     """A sample id"""
+
+     group_id: Optional[Union[str, int]] = Field(default=None)
+     """A group id for the sample, used for grouping k repeated samples."""
+
+     sample_metadata: Optional[Dict[str, Any]] = Field(default=None)
+     """Metadata from the sample"""
+
+
+ class AggScore(BaseModel):
+     """Output of an aggregation operation."""
+
+     score: float = Field(default=0.0)
+     """Aggregated value as a float."""
+
+     metric_name: str = Field(default='')
+     """Name of the metric being aggregated."""
+
+     aggregation_name: str = Field(default='')
+     """Name of the aggregation method."""
+
+     num: int = Field(default=0)
+     """Number of samples used in the aggregation."""
+
+     ids: Optional[List[Union[str, int]]] = Field(default=None)
+     """List of sample IDs used in the aggregation, if applicable."""
+
+     metadata: Optional[Dict[str, Any]] = Field(default=None)
+     """Additional metadata related to the aggregation."""
+
+
+ class Aggregator:
+
+     name = 'default'
+
+     def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
+         r"""Aggregate a metric on a list of scores.
+
+         Args:
+             scores: List of scores.
+
+         Returns:
+             List[AggScore]: List of aggregated outputs.
+         """
+         ...
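
A short sketch of the `Score` main-value convention above (editorial aside; behavior follows from the property and setter definitions):

    from evalscope.api.metric import SampleScore, Score

    # The first key in `value` becomes the main score unless main_score_name is set.
    score = Score(value={'acc': 1.0, 'f1': 0.8}, extracted_prediction='42')
    print(score.main_value)       # 1.0
    print(score.main_score_name)  # 'acc' (resolved lazily by the property)

    score.main_value = 0.5        # setter writes back to value['acc']
    sample = SampleScore(score=score, sample_id='sample-1')
    print(sample.model_dump(exclude_none=True))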
evalscope/api/mixin/__init__.py
@@ -0,0 +1,2 @@
+ from .llm_judge_mixin import LLMJudgeMixin
+ from .sandbox_mixin import SandboxMixin
evalscope/api/mixin/llm_judge_mixin.py
@@ -0,0 +1,170 @@
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.metric import Score
+ from evalscope.constants import JudgeStrategy
+ from evalscope.metrics import LLMJudge
+ from evalscope.utils.logger import get_logger
+
+ if TYPE_CHECKING:
+     from evalscope.config import TaskConfig
+
+ logger = get_logger()
+
+
+ class LLMJudgeMixin:
+     """
+     Mixin class for LLM Judge functionality.
+     """
+
+     def __init__(self, task_config: 'TaskConfig'):
+         self._task_config = task_config
+         self._use_llm_judge = False
+         """Whether to use LLM as a judge"""
+
+         self._llm_judge: Optional[LLMJudge] = None
+
+         super().__init__(task_config=task_config)
+
+     @property
+     def llm_judge(self) -> Optional[LLMJudge]:
+         """Get LLM judge instance with lazy initialization."""
+         if self._llm_judge is None and self.use_llm_judge:
+             self._llm_judge = self.init_llm_judge()
+         return self._llm_judge
+
+     @llm_judge.setter
+     def llm_judge(self, value: Optional[LLMJudge]):
+         """Set LLM judge instance."""
+         self._llm_judge = value
+
+     @property
+     def judge_strategy(self) -> str:
+         """Get the judge strategy from the task configuration."""
+         return self._task_config.judge_strategy
+
+     @property
+     def use_llm_judge(self) -> bool:
+         """Check if LLM judge is enabled."""
+         if self.judge_strategy == JudgeStrategy.RULE:
+             return False
+         elif self.judge_strategy == JudgeStrategy.LLM:
+             return True
+         elif self.judge_strategy == JudgeStrategy.LLM_RECALL:
+             return True
+         elif self.judge_strategy == JudgeStrategy.AUTO:
+             return self._use_llm_judge
+         else:
+             logger.warning(f'Unknown judge strategy: {self.judge_strategy}. Defaulting to False.')
+             return False
+
+     def init_llm_judge(self) -> Optional[LLMJudge]:
+         """
+         Initialize the LLM judge for the benchmark.
+
+         Returns:
+             Optional[LLMJudge]: The initialized LLM judge instance or None
+         """
+
+         if self.judge_strategy == JudgeStrategy.RULE:
+             return None
+         else:
+             return LLMJudge(**self._task_config.judge_model_args)
+
+     def maybe_llm_match_score(
+         self,
+         original_prediction: str,
+         filtered_prediction: str,
+         reference: str,
+         task_state: TaskState,
+         rule_based_score: Optional[Score] = None,
+     ) -> Score:
+         """
+         Compute the match score between the original and filtered predictions against the reference.
+
+         Args:
+             original_prediction: The original prediction output from the model.
+             filtered_prediction: The filtered prediction output from the model.
+             reference: The ground truth reference output.
+             task_state: The current task state.
+             rule_based_score: Optional rule-based score to be used for comparison.
+
+         Returns:
+             Score: The computed match score.
+         """
+         # If LLM judge is not used, return the rule-based score directly
+         if not self.use_llm_judge:
+             return rule_based_score
+
+         # For LLM_RECALL, if rule-based score is already perfect, skip LLM judge
+         if float(rule_based_score.main_value) > 0.99:
+             return rule_based_score
+
+         # Compute LLM judge score
+         llm_score = self.llm_match_score(
+             original_prediction=original_prediction,
+             filtered_prediction=filtered_prediction,
+             reference=reference,
+             task_state=task_state,
+         )
+
+         # For LLM_RECALL, merge the scores
+         return self._merge_scores(rule_based_score, llm_score)
+
+     def llm_match_score(
+         self,
+         original_prediction: str,
+         filtered_prediction: str,
+         reference: str,
+         task_state: TaskState,
+     ) -> Score:
+         """Compute the LLM match score.
+
+         Args:
+             original_prediction (str): The original prediction output from the model.
+             filtered_prediction (str): The filtered prediction output from the model.
+             reference (str): The ground truth reference output.
+             task_state (TaskState): The current task state.
+
+         Returns:
+             Score: The computed match score.
+         """
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         question = task_state.input_text
+
+         # Request judge and obtain score
+         prompt = self.llm_judge.build_prompt(pred=original_prediction, gold=reference, question=question)
+         judge_response = self.llm_judge.judge(prompt)
+         judge_score = self.llm_judge.get_score(judge_response)
+
+         score.value = {'acc': judge_score}
+         score.explanation = f'LLM judge: {judge_response}'
+         score.metadata = {
+             'source': 'llm_judge',
+             'judge_strategy': self.judge_strategy,
+             'model': self.llm_judge.model_id
+         }
+
+         return score
+
+     def _merge_scores(self, rule_based_score: Score, llm_score: Score) -> Score:
+         """
+         Merge rule-based score with LLM judge score for LLM_RECALL strategy.
+
+         Args:
+             rule_based_score: The original rule-based score
+             llm_score: The LLM judge score
+
+         Returns:
+             Score: The merged score
+         """
+         # Update the main value with LLM judge result
+         rule_based_score.main_value = llm_score.main_value
+         rule_based_score.explanation = llm_score.explanation
+         rule_based_score.metadata = llm_score.metadata
+
+         return rule_based_score
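
A sketch of how the strategy gate in `use_llm_judge` behaves when the mixin is composed into an adapter (editorial aside; `_Base`, `RuleOnlyAdapter`, and the stub config are hypothetical, introduced only to exercise the properties defined above):

    from types import SimpleNamespace

    from evalscope.api.mixin import LLMJudgeMixin
    from evalscope.constants import JudgeStrategy

    class _Base:
        # Absorbs the cooperative super().__init__(task_config=...) call made by the mixin.
        def __init__(self, task_config=None):
            pass

    class RuleOnlyAdapter(LLMJudgeMixin, _Base):
        pass

    # Stub config (hypothetical): only judge_strategy is consulted for RULE.
    cfg = SimpleNamespace(judge_strategy=JudgeStrategy.RULE, judge_model_args={})
    adapter = RuleOnlyAdapter(task_config=cfg)
    print(adapter.use_llm_judge)  # False: RULE strategy never invokes the judge
    print(adapter.llm_judge)      # None: lazy initialization is skipped when disabled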
evalscope/api/mixin/sandbox_mixin.py
@@ -0,0 +1,182 @@
+ from typing import TYPE_CHECKING, Any, Dict, Optional
+
+ from evalscope.utils.function_utils import AsyncioLoopRunner, thread_safe
+ from evalscope.utils.logger import get_logger
+
+ if TYPE_CHECKING:
+     from ms_enclave.sandbox.manager import SandboxManager
+
+     from evalscope.config import TaskConfig
+
+ logger = get_logger()
+
+
+ class SandboxMixin:
+     """Sandbox mixin for sandboxed code execution."""
+
+     def __init__(self, task_config: 'TaskConfig'):
+         self._task_config = task_config
+
+         self._manager: Optional['SandboxManager'] = None
+         """Sandbox manager instance."""
+
+         self._sandbox_id: Optional[str] = None
+         """Sandbox ID."""
+
+         # Lazy init state
+         self._initialized: bool = False
+
+         # NOTE: Initialization is deferred.
+         super().__init__()
+
+     async def _async_init(self):
+         """Async initialization helper."""
+         await self.init_sandbox_manager_async()
+         await self.init_sandbox_async()
+
+     @property
+     def use_sandbox(self) -> bool:
+         """
+         Return whether to use sandbox for the benchmark.
+         """
+         if not self._task_config:
+             return False
+         else:
+             return self._task_config.use_sandbox
+
+     @property
+     def sandbox_manager(self) -> Optional['SandboxManager']:
+         """Get the sandbox manager instance."""
+         return self._manager
+
+     @property
+     def sandbox_id(self) -> Optional[str]:
+         """Get the sandbox ID."""
+         return self._sandbox_id
+
+     @thread_safe
+     def ensure_sandbox_ready(self) -> bool:
+         """
+         Ensure the sandbox loop, manager, and sandbox instance are initialized.
+         This method is thread-safe and idempotent.
+         """
+         if not self.use_sandbox:
+             return False
+
+         if self._initialized and self._manager and self._sandbox_id:
+             return True
+
+         # Initialize manager and sandbox using the class-level runner
+         AsyncioLoopRunner.run(self.init_sandbox_manager_async())
+         AsyncioLoopRunner.run(self.init_sandbox_async())
+
+         self._initialized = True
+         return True
+
+     async def init_sandbox_manager_async(self) -> Optional['SandboxManager']:
+         """Initialize the sandbox manager asynchronously."""
+         if self._manager is not None:
+             return self._manager
+
+         if not self.use_sandbox:
+             return None
+
+         from ms_enclave.sandbox.manager import HttpSandboxManager, LocalSandboxManager
+
+         manager_config = self._task_config.sandbox_manager_config or {}
+         if manager_config.get('base_url'):
+             # Remote manager
+             self._manager = HttpSandboxManager(**manager_config)
+         else:
+             # Local manager
+             self._manager = LocalSandboxManager(**manager_config)
+
+         await self._manager.start()
+         logger.info('Sandbox manager initialized.')
+         return self._manager
+
+     def init_sandbox_manager(self) -> Optional['SandboxManager']:
+         """Initialize the sandbox manager."""
+         if self._manager is not None:
+             return self._manager
+
+         if not self.use_sandbox:
+             return None
+
+         return AsyncioLoopRunner.run(self.init_sandbox_manager_async())
+
+     async def init_sandbox_async(self) -> Optional[str]:
+         """Initialize the sandbox instance asynchronously."""
+         if self._sandbox_id is not None:
+             return self._sandbox_id
+
+         if not self.use_sandbox:
+             return None
+
+         from ms_enclave.sandbox.model import DockerSandboxConfig, SandboxType
+
+         sandbox_config = self._task_config.sandbox_config or DockerSandboxConfig(
+             image='python:3.11-slim', tools_config={
+                 'shell_executor': {},
+                 'python_executor': {}
+             }
+         )
+         sandbox_type = self._task_config.sandbox_type or SandboxType.DOCKER
+
+         self._sandbox_id = await self._manager.create_sandbox(sandbox_type=sandbox_type, config=sandbox_config)
+
+         sandbox_info = await self._manager.get_sandbox_info(self._sandbox_id)
+
+         logger.info(f'Sandbox of type {sandbox_type} initialized. Info: {sandbox_info.model_dump(exclude_none=True)}')
+         return self._sandbox_id
+
+     def init_sandbox(self) -> Optional[str]:
+         """Initialize the sandbox instance."""
+         if self._sandbox_id is not None:
+             return self._sandbox_id
+
+         if not self.use_sandbox:
+             return None
+
+         return AsyncioLoopRunner.run(self.init_sandbox_async())
+
+     def execute_code_in_sandbox(self, code: str, timeout: int = 60, language: str = 'python') -> Dict[str, Any]:
+         """Execute code in the sandbox."""
+         # Lazy, thread-safe initialization
+         if not self.ensure_sandbox_ready():
+             logger.warning('Sandbox is not initialized.')
+             return {'error': 'Sandbox is not initialized.'}
+
+         from ms_enclave.sandbox.model import ExecutionStatus, ToolResult
+
+         async def _execute_async():
+             if language.lower() == 'python':
+                 tool_name = 'python_executor'
+                 parameters = {'code': code, 'timeout': timeout}
+                 result = await self._manager.execute_tool(self._sandbox_id, tool_name, parameters)
+             elif language.lower() == 'shell':
+                 tool_name = 'shell_executor'
+                 parameters = {'command': code, 'timeout': timeout}
+                 result = await self._manager.execute_tool(self._sandbox_id, tool_name, parameters)
+             else:
+                 logger.warning(f"Unsupported language: {language}. Supported languages are 'python' and 'shell'.")
+                 result = ToolResult(
+                     status=ExecutionStatus.ERROR,
+                     tool_name='code_executor',
+                     output=f"Unsupported language: {language}. Supported languages are 'python' and 'shell'."
+                 )
+             return result
+
+         # Execute in background loop via class-level runner
+         result = AsyncioLoopRunner.run(_execute_async(), timeout=timeout + 10)
+         return result.model_dump(exclude_none=True)
+
+     def sandbox_finalize(self, *args, **kwargs):
+         """Finalize the sandbox manager."""
+         if self._manager:
+             try:
+                 # Stop the manager but keep the shared loop alive
+                 AsyncioLoopRunner.run(self._manager.stop(), timeout=30)
+                 logger.info('Sandbox manager finalized.')
+             except Exception as e:
+                 logger.warning(f'Error finalizing sandbox manager: {e}')
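
`execute_code_in_sandbox` drives the entire lazy-initialization chain above. A hedged usage sketch (editorial aside, not part of the package; assumes `ms_enclave` is installed and a local Docker daemon is available, and uses a hypothetical minimal config object in place of a real `TaskConfig`):

    from evalscope.api.mixin import SandboxMixin

    class CodeEvalAdapter(SandboxMixin):
        pass

    class _Cfg:
        # Hypothetical stub: no sandbox_manager_config -> LocalSandboxManager,
        # no sandbox_config/sandbox_type -> Docker with python:3.11-slim.
        use_sandbox = True
        sandbox_manager_config = None
        sandbox_config = None
        sandbox_type = None

    adapter = CodeEvalAdapter(task_config=_Cfg())
    result = adapter.execute_code_in_sandbox("print('hello from the sandbox')")
    print(result)  # dumped ToolResult, e.g. status and captured stdout
    adapter.sandbox_finalize()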
evalscope/api/model/__init__.py
@@ -0,0 +1,12 @@
+ from .generate_config import GenerateConfig
+ from .model import Model, ModelAPI, get_model, get_model_with_task_config
+ from .model_output import (
+     ChatCompletionChoice,
+     Logprob,
+     Logprobs,
+     ModelOutput,
+     ModelUsage,
+     StopReason,
+     TopLogprob,
+     as_stop_reason,
+ )