evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py}

@@ -1,10 +1,9 @@
-import importlib
 import json
 import re
 import traceback
-from typing import Any, Dict
+from typing import Any, Dict, List

-from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.benchmark import AgentAdapter, BenchmarkMeta
 from evalscope.api.dataset import Sample
 from evalscope.api.evaluator import TaskState
 from evalscope.api.messages.chat_message import ChatMessageUser
@@ -12,6 +11,8 @@ from evalscope.api.metric import Score
 from evalscope.api.model import Model, ModelOutput
 from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
+from evalscope.report import Category, Report, Subset, unweighted_average_from_subsets, weighted_average_from_subsets
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -36,19 +37,25 @@ SUBJECT_MAPPING = {
     'multi_turn_long_context': 'MULTI_TURN'
 }

+BFCL_V3_TO_V4_SUBJECT_MAPPING = {
+    'simple': 'simple_python',
+    'java': 'simple_java',
+    'javascript': 'simple_javascript',
+}
+

 @register_benchmark(
     BenchmarkMeta(
         name='bfcl_v3',
         pretty_name='BFCL-v3',
-        tags=[Tags.FUNCTION_CALLING],
+        tags=[Tags.FUNCTION_CALLING, Tags.AGENT],
         description='Berkeley Function Calling Leaderboard (BFCL), the **first comprehensive '
         'and executable function call evaluation** '
         'dedicated to assessing Large Language Models\' (LLMs) ability to invoke '
         'functions. Unlike previous evaluations, '
         'BFCL accounts for various forms of function calls, diverse scenarios, and executability. '
-        'Need to run `pip install bfcl-eval==2025.6.16` before evaluating. '
-        '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html)',
+        'Need to run `pip install bfcl-eval==2025.10.27.1` before evaluating. '
+        '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/bfcl_v3.html)',
         dataset_id='AI-ModelScope/bfcl_v3',
         subset_list=list(SUBJECT_MAPPING.keys()),
         metric_list=['acc'],
@@ -59,7 +66,7 @@ SUBJECT_MAPPING = {
         }
     )
 )
-class BFCLAdapter(DefaultDataAdapter):
+class BFCLV3Adapter(AgentAdapter):
     """
     BFCL adapter using the new data processing framework.
     """
@@ -67,14 +74,12 @@ class BFCLAdapter(DefaultDataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-        spec = importlib.util.find_spec('bfcl_eval')
-        if spec is None:
-            raise ImportError(
-                '`bfcl_eval` not found, please install it with `pip install bfcl-eval==2025.6.16` before evaluating.'
-            )
+        check_import('bfcl_eval', package='bfcl-eval==2025.10.27.1', raise_error=True, feature_name=self.pretty_name)

         self.category_map = SUBJECT_MAPPING
         self.reformat_subset = True
+        self.add_overall_metric = False
+        self.add_aggregation_name = False

         self.underscore_to_dot = self.extra_params.get('underscore_to_dot', True)
         self.is_fc_model = self.extra_params.get('is_fc_model', True)
@@ -108,8 +113,8 @@ class BFCLAdapter(DefaultDataAdapter):
         record['turns'] = new_turns

         return Sample(
-            input=[ChatMessageUser(content='')],
-            target='',  # Will use the record for evaluation
+            input=[ChatMessageUser(content=json.dumps(record['turns']))],
+            target=json.dumps(record['ground_truth']),  # Will use the record for evaluation
             subset_key=record['subset'],
             metadata=record  # Store the full record for evaluation
         )
@@ -130,6 +135,8 @@ class BFCLAdapter(DefaultDataAdapter):
         )
         from bfcl_eval.utils import is_empty_output

+        from .utils import convert_format_language, convert_language
+
         score = Score(
             extracted_prediction=filtered_prediction,
             prediction=original_prediction,
@@ -143,7 +150,7 @@ class BFCLAdapter(DefaultDataAdapter):
         dummy_model = 'meta-llama/Llama-3.3-70B-Instruct-FC'

         row = task_state.metadata
-        test_category = re.sub(r'_[0-9_-]+$', '', row['id'])
+        test_category = BFCL_V3_TO_V4_SUBJECT_MAPPING.get(row['test_category'], row['test_category'])

         if test_category in {'irrelevance', 'live_irrelevance', 'live_relevance'}:
             error = None
@@ -155,7 +162,9 @@ class BFCLAdapter(DefaultDataAdapter):
                     params = tool_call[name]
                     decoded_tool_calls.append({name: params})
             else:
-                decoded_tool_calls = default_decode_ast_prompting(row['generation'][0][0], row['language'])
+                decoded_tool_calls = default_decode_ast_prompting(
+                    row['generation'][0][0], convert_format_language(row['language'])
+                )

             # successful decode means valid function call was present
             contains_func_call = True
@@ -220,14 +229,16 @@ class BFCLAdapter(DefaultDataAdapter):
                     params = tool_call[name]
                     decoded_tool_calls.append({name: params})
             else:
-                decoded_tool_calls = default_decode_ast_prompting(row['generation'][0][0], row['language'])
+                decoded_tool_calls = default_decode_ast_prompting(
+                    row['generation'][0][0], convert_format_language(row['language'])
+                )

             score_result = ast_checker(
                 row['functions'],
                 decoded_tool_calls,
                 row['ground_truth'],
-                row['language'],
-                row['test_category'],
+                convert_language(row['language']),
+                test_category,
                 dummy_model,
             )
         except Exception:
@@ -256,3 +267,104 @@ class BFCLAdapter(DefaultDataAdapter):
             score.metadata = {'error': traceback.format_exc()}
         score.main_score_name = 'acc'
         return score
+
+    def _on_generate_report_end(self, report: Report, output_dir, **kwargs):
+        """
+        Finalize the report generation process. Calculate the overall score.
+
+        Track the number of each category.
+        - step1: simple, java, javascript unweighted average as simple_ast
+        - step2.1: simple_ast, multiple, parallel, parallel_multiple unweighted average as ast_non_live
+        - step2.2: live_simple, live_multiple, live_parallel, live_parallel_multiple weighted average as ast_live
+        - step2.3: irrelevance as hallucination_non_live
+        - step2.4: live_irrelevance, live_relevance weighted average as hallucination_live
+        - step2.5: multi_turn_base as multi_turn_base
+        - step2.6: multi_turn_miss_func, multi_turn_miss_param, multi_turn_long_context weighted average as multi_turn_augmented
+        - step3.1: ast_non_live, hallucination_non_live unweighted average as non_live
+        - step3.2: ast_live, hallucination_live weighted average as live
+        - step3.3: multi_turn_base, multi_turn_augmented unweighted average as multi_turn
+        - step4: non_live, live, multi_turn unweighted average as overall
+        Args:
+            report (Report): The generated evaluation report.
+            output_dir (str): The directory to save the report.
+
+        Returns:
+            None
+        """  # noqa: E501
+        for metric in report.metrics:
+            # Collect all subsets in a dictionary for easy access
+            subset_dict: Dict[str, Subset] = {}
+            for category in metric.categories:
+                for subset in category.subsets:
+                    subset_dict[subset.name] = subset
+
+            # Step 1: Calculate simple_ast (simple, java, javascript unweighted average)
+            simple_subsets = ['simple', 'java', 'javascript']
+            simple_ast = unweighted_average_from_subsets(simple_subsets, subset_dict)
+            subset_dict['simple_ast'] = simple_ast
+
+            # Step 2.1: Calculate ast_non_live
+            # (simple_ast, multiple, parallel, parallel_multiple unweighted average)
+            ast_non_live_subsets = ['simple_ast', 'multiple', 'parallel', 'parallel_multiple']
+            ast_non_live = unweighted_average_from_subsets(ast_non_live_subsets, subset_dict)
+            subset_dict['ast_non_live'] = ast_non_live
+
+            # Step 2.2: Calculate ast_live
+            # (live_simple, live_multiple, live_parallel, live_parallel_multiple weighted average)
+            live_subsets = ['live_simple', 'live_multiple', 'live_parallel', 'live_parallel_multiple']
+            ast_live = weighted_average_from_subsets(live_subsets, subset_dict)
+            subset_dict['ast_live'] = ast_live
+
+            # Step 2.3: hallucination_non_live (irrelevance)
+            if 'irrelevance' in subset_dict:
+                subset_dict['hallucination_non_live'] = subset_dict['irrelevance']
+            else:
+                subset_dict['hallucination_non_live'] = Subset(name='hallucination_non_live', score=0, num=0)
+
+            # Step 2.4: Calculate hallucination_live (live_irrelevance, live_relevance weighted average)
+            hallucination_live_subsets = ['live_irrelevance', 'live_relevance']
+            hallucination_live = weighted_average_from_subsets(hallucination_live_subsets, subset_dict)
+            subset_dict['hallucination_live'] = hallucination_live
+
+            # Step 2.5: multi_turn_base
+            if 'multi_turn_base' not in subset_dict:
+                subset_dict['multi_turn_base'] = Subset(name='multi_turn_base', score=0, num=0)
+
+            # Step 2.6: Calculate multi_turn_augmented
+            # (multi_turn_miss_func, multi_turn_miss_param, multi_turn_long_context weighted average)
+            multi_turn_augmented_subsets = ['multi_turn_miss_func', 'multi_turn_miss_param', 'multi_turn_long_context']
+            multi_turn_augmented = weighted_average_from_subsets(multi_turn_augmented_subsets, subset_dict)
+            subset_dict['multi_turn_augmented'] = multi_turn_augmented
+
+            # Step 3.1: Calculate non_live (ast_non_live, hallucination_non_live unweighted average)
+            non_live_subsets = ['ast_non_live', 'hallucination_non_live']
+            non_live = unweighted_average_from_subsets(non_live_subsets, subset_dict)
+            subset_dict['non_live'] = non_live
+
+            # Step 3.2: Calculate live (ast_live, hallucination_live weighted average)
+            live_agg_subsets = ['ast_live', 'hallucination_live']
+            live = weighted_average_from_subsets(live_agg_subsets, subset_dict)
+            subset_dict['live'] = live
+
+            # Step 3.3: Calculate multi_turn (multi_turn_base, multi_turn_augmented unweighted average)
+            multi_turn_subsets = ['multi_turn_base', 'multi_turn_augmented']
+            multi_turn = unweighted_average_from_subsets(multi_turn_subsets, subset_dict)
+            subset_dict['multi_turn'] = multi_turn
+
+            # Step 4: Calculate overall (non_live, live, multi_turn unweighted average)
+            overall_subsets = ['non_live', 'live', 'multi_turn']
+            overall = unweighted_average_from_subsets(overall_subsets, subset_dict)
+            subset_dict['overall'] = overall
+
+            # Add computed scores to the category
+            computed_subset_names = ['non_live', 'live', 'multi_turn', 'overall']
+
+            # Add the computed scores as new subsets in the metric
+            dummy_subsets = []
+            for subset_name in computed_subset_names:
+                if subset_name in subset_dict:
+                    subset = subset_dict[subset_name]
+                    subset.name = subset_name.upper()
+                    dummy_subsets.append(subset)
+            dummy_category = Category(name='-', subsets=dummy_subsets)
+            metric.categories.append(dummy_category)
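The `_on_generate_report_end` hook added above leans on two aggregation helpers from `evalscope.report`. The following is a minimal sketch of what they are expected to compute, assuming a `Subset` that carries only `name`, `score`, and a sample count `num` (fields taken from the diff; the actual evalscope implementations may differ):

from dataclasses import dataclass
from typing import Dict, List


@dataclass
class Subset:
    name: str
    score: float = 0.0
    num: int = 0


def unweighted_average_from_subsets(names: List[str], subset_dict: Dict[str, Subset]) -> Subset:
    """Plain mean over the named subsets: each subset counts equally, regardless of size."""
    found = [subset_dict[n] for n in names if n in subset_dict]
    score = sum(s.score for s in found) / len(found) if found else 0.0
    return Subset(name='+'.join(names), score=score, num=sum(s.num for s in found))


def weighted_average_from_subsets(names: List[str], subset_dict: Dict[str, Subset]) -> Subset:
    """Mean over the named subsets weighted by each subset's sample count."""
    found = [subset_dict[n] for n in names if n in subset_dict]
    total = sum(s.num for s in found)
    score = sum(s.score * s.num for s in found) / total if total else 0.0
    return Subset(name='+'.join(names), score=score, num=total)

Under this reading, step 4's `overall` is the plain mean of the `non_live`, `live`, and `multi_turn` scores, while the `live` branch in step 3.2 weights each live subset by how many samples it contains.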
evalscope/benchmarks/bfcl/{generation.py → v3/generation.py}

@@ -72,13 +72,14 @@ def generate_turn(model: Model, row: dict[str, Any]):

         # Handle the response based on the model output structure
         message = model_output.message
-        model_usage += model_output.usage
+        if model_output.usage is not None:
+            model_usage += model_output.usage

         current_messages.append(message)
         if isinstance(message, str):
             result = message
         else:
-            result = message.content
+            result = message.text

         logger.debug(f'Turn:{turn_idx} Step:{n_steps} Result: {result}')
         current_responses.append(result)
@@ -115,7 +116,7 @@ def generate_turn(model: Model, row: dict[str, Any]):

             n_steps += 1
             if n_steps > MAXIMUM_STEP_LIMIT:
-                logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                logger.warning(f'INFERENCE_WARNING: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
                 break

         all_model_responses.append(current_responses)
@@ -145,9 +146,7 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
             new_tools = row['missing_functions'][str(turn_idx)]
             for new_tool in new_tools:
                 cur_tool = new_tool[0]
-                # change type to object
-                if cur_tool['parameters']['type'] != 'object':
-                    cur_tool['parameters']['type'] = 'object'
+                cur_tool['parameters']['type'] = 'object'
                 tools.append({
                     'type': 'function',
                     'function': cur_tool,
@@ -172,7 +171,8 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):

         # Handle the response based on the model output structure
         message = model_output.message
-        model_usage += model_output.usage
+        if model_output.usage is not None:
+            model_usage += model_output.usage

         current_messages.append(message)
         if isinstance(message, str):
@@ -186,7 +186,7 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
                 logger.error(f'Error converting tool calls to function call strings: {e}')
                 tool_call_strs = None
         else:
-            model_responses = [message.content]
+            model_responses = [message.text]
             tool_call_strs = None

         current_responses.extend(model_responses)
@@ -214,7 +214,7 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):

             n_steps += 1
             if n_steps > MAXIMUM_STEP_LIMIT:
-                logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                logger.warning(f'INFERENCE_WARNING: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
                 break

         all_model_responses.append(current_responses)
evalscope/benchmarks/bfcl/v3/utils.py

@@ -0,0 +1,23 @@
+def convert_language(language: str) -> str:
+    """Convert language names from BFCL v3 to BFCL v4 naming conventions."""
+    from bfcl_eval.constants.enums import Language
+    mapping = {
+        'python': Language.PYTHON,
+        'java': Language.JAVA,
+        'javascript': Language.JAVASCRIPT,
+    }
+    return mapping[language.lower()]
+
+
+def convert_format_language(format_language: str) -> str:
+    """Convert format language names from BFCL v3 to BFCL v4 naming conventions."""
+    from bfcl_eval.constants.enums import ReturnFormat
+    mapping = {
+        'python': ReturnFormat.PYTHON,
+        'java': ReturnFormat.JAVA,
+        'javascript': ReturnFormat.JAVASCRIPT,
+        'json': ReturnFormat.JSON,
+        'verbose_xml': ReturnFormat.VERBOSE_XML,
+        'concise_xml': ReturnFormat.CONCISE_XML,
+    }
+    return mapping[format_language.lower()]
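A quick illustration of these helpers (a sketch; it assumes `bfcl-eval==2025.10.27.1` is installed so the enums resolve, and uses the module path from the file list above):

from bfcl_eval.constants.enums import Language, ReturnFormat

from evalscope.benchmarks.bfcl.v3.utils import convert_format_language, convert_language

# BFCL v3 records store plain strings; the v4 checker APIs expect enum members.
assert convert_language('Python') is Language.PYTHON
assert convert_format_language('JavaScript') is ReturnFormat.JAVASCRIPT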
evalscope/benchmarks/bfcl/v4/__init__.py: File without changes
evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py

@@ -0,0 +1,229 @@
+import json
+import os
+import traceback
+from copy import deepcopy
+from pathlib import Path
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import AgentAdapter, BenchmarkMeta
+from evalscope.api.dataset import Sample
+from evalscope.api.dataset.dataset import DatasetDict
+from evalscope.api.dataset.loader import DictDataLoader
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.metric import Score
+from evalscope.api.model import Model, ModelOutput
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.report import Report
+from evalscope.utils.function_utils import thread_safe
+from evalscope.utils.import_utils import check_import
+from evalscope.utils.logger import get_logger
+from .utils import (
+    ALL_SCORING_CATEGORIES,
+    compute_aggregate_subsets,
+    compute_entry_result,
+    load_bfcl_data,
+    process_test_entries,
+    run_prereq_inference,
+)
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='bfcl_v4',
+        pretty_name='BFCL-v4',
+        tags=[Tags.FUNCTION_CALLING, Tags.AGENT],
+        description='With function-calling being the building blocks of Agents, '
+        'the Berkeley Function-Calling Leaderboard (BFCL) V4 presents a holistic agentic '
+        'evaluation for LLMs. BFCL V4 Agentic includes web search, memory, and format sensitivity. '
+        'Together, the ability to web search, read and write from memory, and the ability to invoke '
+        'functions in different languages present the building blocks for the exciting and extremely '
+        'challenging avenues that power agentic LLMs today from deep-research, to agents for coding and law. '
+        'Need to run `pip install bfcl-eval==2025.10.27.1` before evaluating. '
+        '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/bfcl_v4.html)',
+        dataset_id='https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard',
+        subset_list=ALL_SCORING_CATEGORIES,
+        metric_list=['acc'],
+        eval_split='train',
+        extra_params={
+            'underscore_to_dot': True,
+            'is_fc_model': True,
+            'SERPAPI_API_KEY': None,
+        }
+    )
+)
+class BFCLV4Adapter(AgentAdapter):
+    """
+    BFCL adapter using the new data processing framework.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        check_import('bfcl_eval', package='bfcl-eval==2025.10.27.1', raise_error=True, feature_name=self.pretty_name)
+
+        self.add_overall_metric = False
+        self.add_aggregation_name = False
+
+        self.underscore_to_dot = self.extra_params.get('underscore_to_dot', True)
+        self.is_fc_model = self.extra_params.get('is_fc_model', True)
+        # Set SERPAPI_API_KEY in environment variables if provided
+        serpapi_api_key = self.extra_params.get('SERPAPI_API_KEY', None)
+        if serpapi_api_key:
+            os.environ['SERPAPI_API_KEY'] = serpapi_api_key
+        self.model_result_dir = Path(self._task_config.work_dir) if self._task_config else Path('./bfcl_model_results')
+        self.handler = None
+        self.prereq_entries = []
+        self.prereq_finished = False
+
+    def load(self):
+        """Load and process the BFCL dataset."""
+        from bfcl_eval.utils import parse_test_category_argument
+        datasets = {}
+        all_test_categories = parse_test_category_argument(self.subset_list)
+
+        test_entries_by_cat, ground_truth_by_cat = load_bfcl_data(all_test_categories)
+
+        for category in all_test_categories:
+            test_entries = test_entries_by_cat.get(category, [])
+            ground_truth_entries = ground_truth_by_cat.get(category, [])
+
+            if not test_entries:
+                continue
+
+            datasets[category] = self._create_dataset_for_category(category, test_entries, ground_truth_entries)
+
+        test_dataset = DatasetDict(datasets)
+        return test_dataset, None
+
+    def _create_dataset_for_category(
+        self, category: str, test_entries: List[Dict], ground_truth_entries: List[Dict]
+    ) -> DatasetDict:
+        """Create a dataset for a single category by merging test and ground truth data."""
+        processed_entries, prereq_entries = process_test_entries(
+            category=category,
+            test_entries=test_entries,
+            ground_truth_entries=ground_truth_entries,
+            model_result_dir=self.model_result_dir,
+        )
+        # collect prereq entries for later prereq inference
+        self.prereq_entries.extend(prereq_entries)
+
+        return DictDataLoader(
+            dict_list=processed_entries,
+            limit=self.limit,
+            repeats=self.repeats,
+            sample_fields=self.record_to_sample,
+            shuffle=self.shuffle,
+        ).load()
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a data record to a Sample object."""
+        return Sample(
+            input=[ChatMessageUser(content=json.dumps(record['question']))],
+            target=json.dumps(record['ground_truth']),  # Will use the record for evaluation
+            metadata=record  # Store the full record for evaluation
+        )
+
+    @thread_safe
+    def _init_handler(self):
+        if self.handler is not None:
+            return  # Handler already initialized
+
+        from bfcl_eval.model_handler.api_inference.openai_completion import OpenAICompletionsHandler
+
+        # Set env variables for OpenAI API
+        os.environ['OPENAI_API_KEY'] = self._task_config.api_key
+        os.environ['OPENAI_BASE_URL'] = self._task_config.api_url
+
+        self.handler = OpenAICompletionsHandler(
+            model_name=self._task_config.model,
+            temperature=self._task_config.generation_config.temperature,
+            registry_name=self._task_config.model_id,
+            is_fc_model=self.is_fc_model,
+        )
+
+        self._prereq_inference()
+
+    def _prereq_inference(self):
+        if self.prereq_finished:
+            return
+        # MOVED: delegate prereq processing to utils
+        run_prereq_inference(
+            handler=self.handler,
+            prereq_entries=self.prereq_entries,
+            model_result_dir=self.model_result_dir,
+            batch_size=self._task_config.eval_batch_size,
+            logger=logger,
+        )
+        self.prereq_finished = True
+
+    def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+        try:
+            self._init_handler()
+
+            result, _ = self.handler.inference(
+                deepcopy(sample.metadata), include_input_log=False, exclude_state_log=False
+            )
+
+            output = ModelOutput.from_content(
+                model=model.name,
+                content=json.dumps(result),
+            )
+        except Exception as e:
+            # This is usually the case when the model getting stuck on one particular test case.
+            # For example, timeout error or FC model returning invalid JSON response.
+            # Since temperature is already set to 0.001, retrying the same test case will not help.
+            # So we continue the generation process and record the error message as the model response
+            logger.error(f'Error during inference for sample ID {sample.metadata.get("id")}: {e}')
+            logger.error(traceback.format_exc())
+
+            output = ModelOutput.from_content(
+                model=model.name,
+                content=json.dumps({
+                    'error': str(e),
+                    'error_message': traceback.format_exc(),
+                }),
+            )
+        return output
+
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        self._init_handler()
+
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+        model_result = json.loads(filtered_prediction)
+        prompt = task_state.metadata
+
+        entry_result = compute_entry_result(
+            handler=self.handler,
+            model_result=model_result,
+            prompt_entry=prompt,
+            underscore_to_dot=self.underscore_to_dot,
+        )
+
+        valid = 1 if entry_result['valid'] else 0
+        score.value = {'acc': valid}
+        score.metadata = {
+            'valid': bool(entry_result.get('valid')),
+            'error': str(entry_result.get('error')),
+            'error_message': str(entry_result.get('error_message')),
+            'error_type': str(entry_result.get('error_type')),
+        }
+        return score
+
+    def _on_generate_report_end(self, report: Report, output_dir, **kwargs):
+        """
+        Finalize the report generation process. Calculate the overall score.
+        """
+
+        # noqa: E501
+        # MOVED: delegate aggregation logic to utils
+        compute_aggregate_subsets(report)
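For context, this is roughly how the new `bfcl_v4` benchmark could be driven end to end. It is a hypothetical sketch, not taken from the package: it assumes evalscope's `TaskConfig`/`run_task` entry points, an OpenAI-compatible endpoint, and placeholder model/URL/key values; the `extra_params` keys mirror the adapter defaults shown above, and `pip install bfcl-eval==2025.10.27.1` must be run first.

from evalscope import TaskConfig, run_task

# Hypothetical configuration; model name, URL, and key are placeholders.
task = TaskConfig(
    model='my-fc-model',
    api_url='http://localhost:8801/v1',  # OpenAI-compatible endpoint
    api_key='EMPTY',
    datasets=['bfcl_v4'],  # benchmark name registered by BFCLV4Adapter
    eval_batch_size=4,
    generation_config={'temperature': 0.001},  # low temperature, as the adapter's comments assume
    dataset_args={
        'bfcl_v4': {
            'extra_params': {
                'underscore_to_dot': True,
                'is_fc_model': True,
                'SERPAPI_API_KEY': None,  # needed for the web-search subsets
            }
        }
    },
)
run_task(task)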