evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324) hide show
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,307 @@
1
+ """
2
+ Answer checker API that uses sympy to simplify expressions and check for equality.
3
+
4
+ Call grade_answer(given_answer: str, ground_truth: str).
5
+
6
+ This file is adapted from OpenAI's PRM800K repository:
7
+ https://github.com/openai/prm800k/blob/main/prm800k/grading/grader.py
8
+
9
+ Original License:
10
+ MIT License
11
+
12
+ Copyright (c) 2023 OpenAI
13
+
14
+ Permission is hereby granted, free of charge, to any person obtaining a copy
15
+ of this software and associated documentation files (the "Software"), to deal
16
+ in the Software without restriction, including without limitation the rights
17
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
18
+ copies of the Software, and to permit persons to whom the Software is
19
+ furnished to do so, subject to the following conditions:
20
+
21
+ The above copyright notice and this permission notice shall be included in all
22
+ copies or substantial portions of the Software.
23
+
24
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30
+ SOFTWARE.
31
+ """
32
+ # flake8: noqa
33
+ import re
34
+ import sympy
35
+ from pylatexenc import latex2text
36
+ from sympy.parsing import sympy_parser
37
+
38
+ from . import math_normalize
39
+
40
# sympy might hang -- we don't care about trying to be lenient in these cases
# Substrings that make should_allow_eval() refuse to hand an expression to sympy.
BAD_SUBSTRINGS = ['^{', '^(']
# Regex patterns with the same effect. Raw strings keep `\^` as a literal-caret
# regex escape instead of an invalid Python string escape (a SyntaxWarning on
# recent interpreters).
BAD_REGEXES = [r'\^[0-9]+\^', r'\^[0-9][0-9]+']
# Characters accepted as the opening/closing delimiter of a tuple or interval.
TUPLE_CHARS = '()[]'
44
+
45
+
46
def _sympy_parse(expr: str):
    """Parse ``expr`` into a sympy expression.

    Carets are rewritten to Python's ``**`` power operator before parsing.
    The parse uses sympy's standard transformations plus implicit
    multiplication/application.
    """
    transformations = sympy_parser.standard_transformations + (
        sympy_parser.implicit_multiplication_application,
    )
    return sympy_parser.parse_expr(expr.replace('^', '**'), transformations=transformations)
53
+
54
+
55
def _parse_latex(expr: str) -> str:
    """Convert a LaTeX snippet into plain text that sympy has a chance of reading."""
    # Order matters: fold the tfrac/dfrac variants into frac first, then pad
    # every frac with a leading space so mixed numbers keep a separator.
    for macro, replacement in (
        ('\\tfrac', '\\frac'),
        ('\\dfrac', '\\frac'),
        ('\\frac', ' \\frac'),
    ):
        expr = expr.replace(macro, replacement)

    text = latex2text.LatexNodes2Text().latex_to_text(expr)

    # Map the specific characters the latex-to-text converter emits to ASCII.
    for symbol, ascii_form in (
        ('√', 'sqrt'),
        ('π', 'pi'),
        ('∞', 'inf'),
        ('∪', 'U'),
        ('·', '*'),
        ('×', '*'),
    ):
        text = text.replace(symbol, ascii_form)

    return text.strip()
71
+
72
+
73
def _is_float(num: str) -> bool:
    """Return True when ``float(num)`` succeeds and False when it raises ValueError."""
    try:
        float(num)
    except ValueError:
        return False
    return True
79
+
80
+
81
def _is_int(x: float) -> bool:
    """Return True when ``x`` is within 1e-7 of its nearest integer.

    Values that cannot be rounded to an integer (NaN, infinities, non-numeric
    inputs) yield False instead of raising.
    """
    try:
        return abs(x - int(round(x))) <= 1e-7
    except (TypeError, ValueError, OverflowError):
        # round()/int() raise ValueError for NaN, OverflowError for infinities
        # and TypeError for non-numeric inputs. Catching only these keeps
        # KeyboardInterrupt/SystemExit propagating.
        return False
86
+
87
+
88
def _is_frac(expr: str) -> bool:
    """Return True when ``expr`` matches the simple ``numerator/denominator`` pattern.

    The denominator must contain a non-zero digit after any leading zeros.
    """
    fraction_pattern = r'^-?[0-9]+.?/0*[1-9][0-9]*.?$'
    return re.match(fraction_pattern, expr) is not None
90
+
91
+
92
def _str_is_int(x: str) -> bool:
    """Return True when the string ``x`` denotes a number within 1e-7 of an integer.

    Thousands separators are removed via ``_strip_properly_formatted_commas``
    before the conversion. Inputs that cannot be converted yield False.
    """
    try:
        value = float(_strip_properly_formatted_commas(x))
        return abs(value - int(round(value))) <= 1e-7
    except (TypeError, ValueError, OverflowError):
        # TypeError: non-string input; ValueError: not a number (or NaN);
        # OverflowError: infinity. Anything else (e.g. KeyboardInterrupt)
        # must not be swallowed.
        return False
99
+
100
+
101
def _str_to_int(x: str) -> int:
    """Convert a numeric string to ``int``, dropping every comma first.

    The value goes through ``float`` so strings such as ``'2.0'`` are accepted.
    The fractional part is truncated by ``int``.

    Raises:
        ValueError: if the comma-free string is not a valid float literal.
    """
    return int(float(x.replace(',', '')))
105
+
106
+
107
def _inject_implicit_mixed_number(step: str):
    """
    Turn a whitespace-separated mixed number into an explicit sum,
    e.g. 7 3/4 => 7+3/4
    """
    # A digit, one or more spaces, then another digit: join the two with '+'.
    return re.sub(r'([0-9]) +([0-9])', r'\1+\2', step)
115
+
116
+
117
def _strip_properly_formatted_commas(expr: str):
    """Remove thousands-separator commas from ``expr`` while keeping tuple commas.

    A comma is dropped only when it sits between a digit and exactly three
    digits that are followed by a non-digit or the end of the string. The
    substitution repeats until the string stops changing, because one pass
    cannot rewrite adjacent groups such as ``1,234,567``.
    """
    # We want to be careful because we don't want to strip tuple commas.
    # Raw string: `\d`/`\D` are regex escapes, not (invalid) Python string escapes.
    thousands_sep = re.compile(r'(\d)(,)(\d\d\d)($|\D)')
    while True:
        next_expr = thousands_sep.sub(r'\1\3\4', expr)
        if next_expr == expr:
            return next_expr
        expr = next_expr
126
+
127
+
128
def _normalize(expr: str) -> str:
    """Normalize answer expressions.

    Strips an enclosing LaTeX text wrapper, currency/percent signs, unit words
    and degree markers; rewrites 'million'/'billion'/'trillion' as powers of
    ten; converts remaining LaTeX to plain text on a best-effort basis; removes
    spaces and braces; lower-cases the result; and canonicalizes integer-valued
    strings. Returns None when ``expr`` is None.
    """
    if expr is None:
        return None

    # Remove enclosing `\text{}`.
    m = re.search(r'^\\text\{(?P<text>.+?)\}$', expr)
    if m is not None:
        expr = m.group('text')

    expr = expr.replace('\\%', '%')
    expr = expr.replace('\\$', '$')
    expr = expr.replace('$', '')
    expr = expr.replace('%', '')
    expr = expr.replace(' or ', ' , ')
    expr = expr.replace(' and ', ' , ')

    expr = expr.replace('million', '*10^6')
    expr = expr.replace('billion', '*10^9')
    expr = expr.replace('trillion', '*10^12')

    for unit in [
        'degree',
        'cm',
        'centimeter',
        'meter',
        'mile',
        'second',
        'minute',
        'hour',
        'day',
        'week',
        'month',
        'year',
        'foot',
        'feet',
        'inch',
        'yard',
    ]:
        # Drop the unit word, an optional plural suffix and an optional
        # integer exponent (e.g. "cm^2").
        expr = re.sub(rf'{unit}(es)?(s)? *(\^[0-9]+)?', '', expr)
    # Drop LaTeX degree markers ("^\circ").
    expr = re.sub(r'\^ *\\circ', '', expr)

    if len(expr) > 0 and expr[0] == '{' and expr[-1] == '}':
        expr = expr[1:-1]

    expr = re.sub(r',\\! *', '', expr)
    if _is_float(expr) and _is_int(float(expr)):
        expr = str(int(round(float(expr))))
    if '\\' in expr:
        try:
            expr = _parse_latex(expr)
        except Exception:
            # Best effort: keep the unparsed text if LaTeX conversion fails,
            # but let KeyboardInterrupt/SystemExit propagate.
            pass

    # edge case with mixed numbers and negative signs
    expr = re.sub('- *', '-', expr)

    expr = _inject_implicit_mixed_number(expr)
    expr = expr.replace(' ', '')

    # if we somehow still have latex braces here, just drop them
    expr = expr.replace('{', '')
    expr = expr.replace('}', '')

    # don't be case sensitive for text answers
    expr = expr.lower()

    if _str_is_int(expr):
        expr = str(_str_to_int(expr))

    return expr
199
+
200
+
201
def count_unknown_letters_in_expr(expr: str):
    """Count the distinct alphabetic characters in ``expr``.

    Occurrences of 'sqrt' and 'frac' are removed before counting.
    """
    remainder = expr.replace('sqrt', '').replace('frac', '')
    return len({ch for ch in remainder if ch.isalpha()})
206
+
207
+
208
def should_allow_eval(expr: str):
    """Return True when ``expr`` may be passed on for sympy evaluation.

    An expression is rejected when it has more than two distinct letters,
    contains any entry of BAD_SUBSTRINGS, or matches any pattern in BAD_REGEXES.
    """
    # we don't want to try parsing unknown text or functions of more than two variables
    if count_unknown_letters_in_expr(expr) > 2:
        return False

    if any(fragment in expr for fragment in BAD_SUBSTRINGS):
        return False

    return not any(re.search(pattern, expr) is not None for pattern in BAD_REGEXES)
222
+
223
+
224
def are_equal_under_sympy(ground_truth_normalized: str, given_normalized: str):
    """Return True when sympy simplifies the difference of the two expressions to 0.

    The difference is evaluated only if ``should_allow_eval`` accepts it. Any
    ordinary failure while parsing or simplifying counts as "not equal".
    """
    try:
        expr = f'({ground_truth_normalized})-({given_normalized})'
        if not should_allow_eval(expr):
            return False
        simplified = sympy.simplify(_sympy_parse(expr))
        return bool(simplified == 0)
    except Exception:
        # Parsing or simplification failed: treat the answers as different.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # through, so a hanging sympy call can still be interrupted.
        return False
236
+
237
+
238
def split_tuple(expr: str):
    """
    Break a tuple/interval into its elements, after removing well-formatted
    thousands commas from large numbers. Anything that is not a single flat
    bracketed group comes back as a one-element list; an empty string gives [].
    """
    expr = _strip_properly_formatted_commas(expr)
    if not expr:
        return []

    inner = expr[1:-1]
    is_flat_group = (
        len(expr) > 2
        and expr[0] in TUPLE_CHARS
        and expr[-1] in TUPLE_CHARS
        # No nested delimiter characters between the outer pair.
        and not any(ch in inner for ch in TUPLE_CHARS)
    )
    if not is_flat_group:
        return [expr]
    return [piece.strip() for piece in inner.split(',')]
253
+
254
+
255
def grade_answer(given_answer: str, ground_truth: str) -> bool:
    """
    Grade ``given_answer`` against ``ground_truth``. It is correct if:
    (a) both normalize to the same string (mathd normalization or the local one)
    OR
    (b) element by element, sympy can simplify the difference to 0
    """
    if given_answer is None:
        return False

    # be at least as lenient as mathd
    if math_normalize.normalize_answer(ground_truth) == math_normalize.normalize_answer(given_answer):
        return True

    truth_norm = _normalize(ground_truth)
    given_norm = _normalize(given_answer)

    if truth_norm is None:
        return False
    if truth_norm == given_norm:
        return True
    if len(given_norm) == 0:
        return False

    truth_elems = split_tuple(truth_norm)
    answer_elems = split_tuple(given_norm)

    # For tuples/intervals the outer delimiters must agree exactly.
    if len(truth_elems) > 1 and (truth_norm[0] != given_norm[0] or truth_norm[-1] != given_norm[-1]):
        return False
    if len(truth_elems) != len(answer_elems):
        return False

    for truth_elem, answer_elem in zip(truth_elems, answer_elems):
        if _is_frac(truth_elem) and _is_frac(answer_elem):
            # if fractions aren't reduced, then shouldn't be marked as correct
            # so, we don't want to allow sympy.simplify in this case
            matches = truth_elem == answer_elem
        elif _str_is_int(truth_elem) != _str_is_int(answer_elem):
            # if the ground truth answer is an integer, we require the given answer to be a strict match (no sympy.simplify)
            matches = False
        else:
            matches = are_equal_under_sympy(truth_elem, answer_elem)
        if not matches:
            return False

    return True
@@ -0,0 +1,189 @@
1
+ """
2
+ This logic is largely copied from the Hendrycks' MATH release (math_equivalence).
3
+
4
+
5
+ This file is adapted from OpenAI's PRM800K repository:
6
+ https://github.com/openai/prm800k/blob/main/prm800k/grading/math_normalize.py
7
+
8
+ Original License:
9
+ MIT License
10
+
11
+ Copyright (c) 2023 OpenAI
12
+
13
+ Permission is hereby granted, free of charge, to any person obtaining a copy
14
+ of this software and associated documentation files (the "Software"), to deal
15
+ in the Software without restriction, including without limitation the rights
16
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17
+ copies of the Software, and to permit persons to whom the Software is
18
+ furnished to do so, subject to the following conditions:
19
+
20
+ The above copyright notice and this permission notice shall be included in all
21
+ copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29
+ SOFTWARE.
30
+ """
31
+ # flake8: noqa
32
+ import re
33
+ from typing import Optional
34
+
35
+
36
def normalize_answer(answer: Optional[str]) -> Optional[str]:
    """Normalize a math answer string so it can be compared by equality.

    The answer is stripped of surrounding whitespace, a single enclosing
    ``\\text{...}`` wrapper is removed, and the result is passed through
    ``_strip_string``. Normalization is best-effort: if any step raises,
    the answer as it stood before ``_strip_string`` is returned instead.

    Args:
        answer: Raw answer text, or ``None``.

    Returns:
        The normalized answer, or ``None`` when ``answer`` is ``None``.
    """
    if answer is None:
        return None
    answer = answer.strip()
    try:
        # Remove enclosing `\text{}`. A raw string keeps the pattern free of
        # invalid escape sequences such as '\{'.
        m = re.search(r'^\\text\{(?P<text>.+?)\}$', answer)
        if m is not None:
            answer = m.group('text').strip()
        return _strip_string(answer)
    except Exception:
        # Deliberate fallback for malformed input; unlike a bare `except:`
        # this lets KeyboardInterrupt / SystemExit propagate.
        return answer
48
+
49
+
50
def _fix_fracs(string):
    r"""Rewrite shorthand ``\frac`` arguments into fully braced form.

    ``\frac12`` becomes ``\frac{1}{2}``, ``\frac1b`` becomes ``\frac{1}{b}``
    and ``\frac1{72}`` becomes ``\frac{1}{72}``. A ``\frac`` that is already
    followed by ``{`` is copied through unchanged (so ``\frac{72}1`` is not
    fixed).

    Args:
        string: LaTeX-like answer text.

    Returns:
        The rewritten text. If any ``\frac`` is followed by fewer than two
        characters (including a trailing ``\frac`` with nothing after it),
        the input is returned untouched.
    """
    head, *tails = string.split('\\frac')
    new_str = head
    for substr in tails:
        new_str += '\\frac'
        if substr.startswith('{'):
            # Numerator is already braced: keep the remainder verbatim.
            new_str += substr
            continue
        if len(substr) < 2:
            # Not enough characters for a numerator and a denominator; this
            # also covers a dangling `\frac` at the very end of the text.
            return string
        a, b = substr[0], substr[1]
        post_substr = substr[2:]
        if b != '{':
            new_str += '{' + a + '}{' + b + '}' + post_substr
        else:
            # Denominator is already braced, only wrap the numerator.
            new_str += '{' + a + '}' + b + post_substr
    return new_str
80
+
81
+
82
def _fix_a_slash_b(string):
    r"""Convert a plain integer ratio ``a/b`` into ``\frac{a}{b}``.

    The conversion only happens when the text is exactly two integers
    separated by one slash and re-formatting those integers reproduces the
    input byte-for-byte (so ``01/2`` or `` 1/2`` are left alone).

    Args:
        string: Candidate answer text.

    Returns:
        ``\frac{a}{b}`` for a canonical integer ratio, otherwise the input
        unchanged.
    """
    parts = string.split('/')
    if len(parts) != 2:
        return string
    try:
        a = int(parts[0])
        b = int(parts[1])
    except ValueError:
        return string
    # Explicit comparison instead of `assert`, so the check is still
    # performed when Python runs with -O.
    if string != '{}/{}'.format(a, b):
        return string
    return '\\frac{' + str(a) + '}{' + str(b) + '}'
95
+
96
+
97
def _remove_right_units(string):
    r"""Drop a trailing units annotation introduced by ``\text{ ``.

    Args:
        string: Answer text.

    Returns:
        The part of ``string`` before ``\text{ `` when that marker occurs
        exactly once, or ``string`` unchanged when it does not occur.

    Raises:
        AssertionError: If the marker occurs more than once. The exception
            type matches the original ``assert`` so existing handlers keep
            working, but it is raised explicitly so the check survives
            ``python -O``.
    """
    # "\\text{ " only ever occurs (at least in the val set) when describing units
    marker = '\\text{ '
    if marker not in string:
        return string
    splits = string.split(marker)
    if len(splits) != 2:
        raise AssertionError('expected exactly one "\\text{ " marker, found %d' % (len(splits) - 1))
    return splits[0]
105
+
106
+
107
def _fix_sqrt(string):
    r"""Brace the single character that follows an unbraced ``\sqrt``.

    ``\sqrt3`` becomes ``\sqrt{3}``; ``\sqrt{3}`` is left as is. Only the
    first character after ``\sqrt`` is wrapped, so ``\sqrt12`` becomes
    ``\sqrt{1}2``.

    Args:
        string: LaTeX-like answer text.

    Returns:
        The rewritten text. A ``\sqrt`` with nothing after it is copied
        through unchanged.
    """
    if '\\sqrt' not in string:
        return string
    head, *tails = string.split('\\sqrt')
    pieces = [head]
    for tail in tails:
        # `tail` is empty when the text ends in `\sqrt`; indexing it would
        # raise IndexError, so that case is passed through verbatim.
        if tail and tail[0] != '{':
            pieces.append('\\sqrt{' + tail[0] + '}' + tail[1:])
        else:
            pieces.append('\\sqrt' + tail)
    return ''.join(pieces)
120
+
121
+
122
def _strip_string(string):
    """Apply the MATH-style textual normalization pipeline to an answer.

    The steps run in a fixed order: remove line breaks and LaTeX spacing /
    sizing commands, unify fraction macros, drop degree, dollar and percent
    markers, strip right-hand units, add a leading zero to bare decimals,
    drop a short ``x =`` prefix, brace ``sqrt`` / ``frac`` arguments, remove
    spaces, and finally canonicalize ``0.5`` and simple ``a/b`` ratios.

    Args:
        string: Answer text (already stripped of an enclosing text wrapper
            by the caller in this file).

    Returns:
        The normalized text; an empty string is returned as soon as the
        text becomes empty after the decimal-point fixes.
    """
    # linebreaks
    string = string.replace('\n', '')

    # remove inverse spaces
    string = string.replace('\\!', '')

    # collapse doubled backslashes into single ones
    string = string.replace('\\\\', '\\')

    # replace tfrac and dfrac with frac
    string = string.replace('tfrac', 'frac')
    string = string.replace('dfrac', 'frac')

    # remove \left and \right
    string = string.replace('\\left', '')
    string = string.replace('\\right', '')

    # Remove circ (degrees)
    string = string.replace('^{\\circ}', '')
    string = string.replace('^\\circ', '')

    # remove dollar signs
    string = string.replace('\\$', '')

    # remove units (on the right)
    string = _remove_right_units(string)

    # remove percentage (the former second replace used the invalid escape
    # '\%', which is the very same two-character string, so one call suffices)
    string = string.replace('\\%', '')

    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
    string = string.replace(' .', ' 0.')
    string = string.replace('{.', '{0.')
    # if empty, return empty string
    if len(string) == 0:
        return string
    if string[0] == '.':
        string = '0' + string

    # to consider: get rid of e.g. "k = " or "q = " at beginning
    sides = string.split('=')
    if len(sides) == 2 and len(sides[0]) <= 2:
        string = sides[1]

    # fix sqrt3 --> sqrt{3}
    string = _fix_sqrt(string)

    # remove spaces
    string = string.replace(' ', '')

    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
    string = _fix_fracs(string)

    # manually change 0.5 --> \frac{1}{2}
    if string == '0.5':
        string = '\\frac{1}{2}'

    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
    string = _fix_a_slash_b(string)

    return string
@@ -0,0 +1,51 @@
1
+ from typing import Any, Dict
2
+
3
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
4
+ from evalscope.api.dataset import Sample
5
+ from evalscope.api.registry import register_benchmark
6
+ from evalscope.constants import Tags
7
+ from evalscope.utils.logger import get_logger
8
+
9
+ logger = get_logger()
10
+
11
+
12
@register_benchmark(
    BenchmarkMeta(
        name='amc',
        pretty_name='AMC',
        tags=[Tags.MATH, Tags.REASONING],
        description=(
            'AMC (American Mathematics Competitions) is a series of mathematics competitions for high school students.'
        ),
        dataset_id='evalscope/amc_22-24',
        subset_list=['amc22', 'amc23', 'amc24'],
        metric_list=[{'acc': {'numeric': True}}],
        prompt_template='{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
    )
)
class AMCAdapter(DefaultDataAdapter):
    """Data adapter registered under the benchmark name ``amc``.

    Records are turned into ``Sample`` objects from their ``problem`` and
    ``answer`` fields, and predictions are post-processed with the shared
    math answer extractor.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the base adapter, then enable split-as-subset."""
        super().__init__(*args, **kwargs)

        # Use split as subset
        self.split_as_subset = True

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        """Convert one raw dataset record into a ``Sample``.

        Args:
            record: Mapping that must contain ``problem``, ``answer``,
                ``year`` and ``url``; ``solution`` is optional and defaults
                to an empty string.

        Returns:
            A ``Sample`` whose input is the problem text, whose target is
            the answer, and whose metadata carries year, url and solution.
        """
        question = record['problem']
        expected = record['answer']
        extra = {
            'year': record['year'],
            'url': record['url'],
            'solution': record.get('solution', ''),
        }
        return Sample(input=question, target=expected, metadata=extra)

    def extract_answer(self, prediction: str, task_state):
        """Return the answer parsed from ``prediction``; ``task_state`` is not used here."""
        # Imported lazily, as in the original implementation.
        from evalscope.metrics.math_parser import extract_answer as parse_prediction

        return parse_prediction(prediction)
@@ -29,6 +29,7 @@ GRADER_TEMPLATE = """<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's
29
29
  'Currently not support `style-controlled winrate`; the official Judge model is `gpt-4-1106-preview`, while the baseline model is `gpt-4-0314`.',
30
30
  dataset_id='AI-ModelScope/arena-hard-auto-v0.1',
31
31
  metric_list=['winrate'],
32
+ aggregation='elo',
32
33
  few_shot_num=0,
33
34
  train_split=None,
34
35
  eval_split='test',
@@ -141,35 +141,61 @@ class BBHAdapter(DefaultDataAdapter):
141
141
  @classmethod
142
142
  def _extract_mc_answer(cls, ans: str) -> str:
143
143
  """
144
- Extract the answer from the model output for Multiple choice task.
144
+ Extract normalized answer for BBH multiple-choice tasks.
145
+ Handles formats like:
146
+ - "answer is (A)"
147
+ - "The answer is A."
148
+ - Extra text after answer.
149
+ Always uses the *last* occurrence of "answer is".
145
150
  """
146
- ans_line = ans.split('answer is ')
147
- if len(ans_line) != 1:
148
- ans = ans_line[1].strip()
149
- match = re.search(r'\(([A-Z])\)*', ans)
151
+ ans = ans.strip()
152
+
153
+ parts = ans.split('So the answer is ')
154
+ if len(parts) > 1:
155
+ ans = parts[-1].strip()
156
+ ans = ans.split('\n')[0].strip()
157
+
158
+ # Remove trailing period
159
+ if ans.endswith('.'):
160
+ ans = ans[:-1].strip()
161
+
162
+ # Capture uppercase letter inside parentheses (A) (B) ...
163
+ match = re.search(r'\(([A-Z])\)', ans)
150
164
  if match:
151
165
  return match.group(1)
152
- match = re.search(r'([A-Z])', ans)
166
+
167
+ # Capture single uppercase letter
168
+ match = re.search(r'\b([A-Z])\b', ans)
153
169
  if match:
154
170
  return match.group(1)
171
+
155
172
  return ans
156
173
 
157
174
  @classmethod
158
175
  def _extract_ff_answer(cls, ans: str):
159
176
  """
160
- Extract the answer from the model output for Free-form task.
177
+ Extract the normalized answer for BBH free-form tasks.
178
+ Handles patterns like:
179
+ - "answer is XXX."
180
+ - "The answer is **valid**."
181
+ - Extra trailing dots / line breaks.
182
+ - Bold-marked answers (**xxx**).
183
+ Always uses the *last* occurrence of "answer is".
161
184
  """
162
- pattern = r'answer is\s+(.*?)\.'
185
+ ans = ans.strip()
163
186
 
164
- match = re.search(pattern, ans)
165
- if match:
166
- res = match.group(1)
167
- return res
187
+ parts = ans.split('So the answer is ')
188
+ if len(parts) > 1:
189
+ ans = parts[-1].strip()
190
+ ans = ans.split('\n')[0].strip()
168
191
 
169
- ans_line = ans.split('answer is ')
170
- if len(ans_line) != 1:
171
- ans = ans_line[1].strip()
172
- ans = ans.split('\n')[0]
192
+ # Remove trailing period
173
193
  if ans.endswith('.'):
174
- ans = ans[:-1]
194
+ ans = ans[:-1].strip()
195
+
196
+ # If answer is in bold (**xxx**), prefer the content inside
197
+ match = re.search(r'\*\*(.*?)\*\*', ans)
198
+ if match:
199
+ ans = match.group(1).strip()
200
+
175
201
  return ans