evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. evalscope/api/benchmark/__init__.py +9 -1
  2. evalscope/api/benchmark/adapters/__init__.py +4 -0
  3. evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  7. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  8. evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
  9. evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
  10. evalscope/api/benchmark/benchmark.py +85 -2
  11. evalscope/api/benchmark/meta.py +10 -1
  12. evalscope/api/dataset/dataset.py +27 -6
  13. evalscope/api/dataset/loader.py +8 -3
  14. evalscope/api/evaluator/cache.py +31 -4
  15. evalscope/api/evaluator/evaluator.py +5 -0
  16. evalscope/api/evaluator/state.py +17 -1
  17. evalscope/api/messages/__init__.py +1 -0
  18. evalscope/api/messages/chat_message.py +52 -2
  19. evalscope/api/metric/__init__.py +1 -1
  20. evalscope/api/metric/metric.py +6 -1
  21. evalscope/api/metric/scorer.py +15 -7
  22. evalscope/api/mixin/__init__.py +1 -1
  23. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  24. evalscope/api/mixin/sandbox_mixin.py +182 -0
  25. evalscope/api/model/generate_config.py +10 -6
  26. evalscope/api/model/model.py +5 -2
  27. evalscope/api/tool/tool_info.py +1 -1
  28. evalscope/app/app.py +3 -0
  29. evalscope/app/ui/multi_model.py +6 -1
  30. evalscope/app/ui/single_model.py +11 -5
  31. evalscope/app/utils/data_utils.py +8 -7
  32. evalscope/app/utils/env_utils.py +12 -0
  33. evalscope/app/utils/text_utils.py +14 -12
  34. evalscope/app/utils/visualization.py +2 -2
  35. evalscope/arguments.py +8 -4
  36. evalscope/backend/opencompass/backend_manager.py +0 -2
  37. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  38. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  39. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  40. evalscope/benchmarks/aime/aime24_adapter.py +5 -0
  41. evalscope/benchmarks/aime/aime25_adapter.py +136 -1
  42. evalscope/benchmarks/aime/grader.py +307 -0
  43. evalscope/benchmarks/aime/math_normalize.py +189 -0
  44. evalscope/benchmarks/amc/amc_adapter.py +51 -0
  45. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
  46. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  47. evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
  48. evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
  49. evalscope/benchmarks/bfcl/v3/utils.py +23 -0
  50. evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
  51. evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
  52. evalscope/benchmarks/bfcl/v4/utils.py +410 -0
  53. evalscope/benchmarks/biomix_qa/__init__.py +0 -0
  54. evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
  55. evalscope/benchmarks/blink/__init__.py +0 -0
  56. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  57. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  58. evalscope/benchmarks/chartqa/__init__.py +0 -0
  59. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  60. evalscope/benchmarks/chartqa/utils.py +38 -0
  61. evalscope/benchmarks/coin_flip/__init__.py +0 -0
  62. evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
  63. evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
  64. evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
  65. evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
  66. evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
  67. evalscope/benchmarks/docvqa/__init__.py +0 -0
  68. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  69. evalscope/benchmarks/drivelology/__init__.py +0 -0
  70. evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
  71. evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
  72. evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
  73. evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
  74. evalscope/benchmarks/drop/drop_adapter.py +15 -44
  75. evalscope/benchmarks/drop/utils.py +97 -0
  76. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  77. evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
  78. evalscope/benchmarks/general_arena/utils.py +2 -1
  79. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  80. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  81. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
  82. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  83. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
  84. evalscope/benchmarks/halu_eval/__init__.py +0 -0
  85. evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
  86. evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
  87. evalscope/benchmarks/healthbench/__init__.py +0 -0
  88. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  89. evalscope/benchmarks/healthbench/utils.py +102 -0
  90. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  91. evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
  92. evalscope/benchmarks/humaneval/utils.py +235 -0
  93. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  94. evalscope/benchmarks/image_edit/__init__.py +0 -0
  95. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  96. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  97. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  98. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  99. evalscope/benchmarks/infovqa/__init__.py +0 -0
  100. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  101. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  102. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
  103. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  104. evalscope/benchmarks/logi_qa/__int__.py +0 -0
  105. evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
  106. evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
  107. evalscope/benchmarks/math_qa/__init__.py +0 -0
  108. evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
  109. evalscope/benchmarks/math_verse/__init__.py +0 -0
  110. evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
  111. evalscope/benchmarks/math_vision/__init__.py +0 -0
  112. evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
  113. evalscope/benchmarks/math_vista/__init__.py +0 -0
  114. evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
  115. evalscope/benchmarks/med_mcqa/__init__.py +0 -0
  116. evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
  117. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  118. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
  119. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  120. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  121. evalscope/benchmarks/mm_star/__init__.py +0 -0
  122. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  123. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  124. evalscope/benchmarks/mmmu/__init__.py +0 -0
  125. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  126. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  127. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
  128. evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
  129. evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
  130. evalscope/benchmarks/multi_if/__init__.py +0 -0
  131. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  132. evalscope/benchmarks/multi_if/metrics.py +120 -0
  133. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  134. evalscope/benchmarks/music_trivia/__init__.py +0 -0
  135. evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
  136. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
  137. evalscope/benchmarks/ner/__init__.py +0 -0
  138. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  139. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  140. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  141. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  142. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  143. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  144. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  145. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  146. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  147. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  148. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  149. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  150. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  151. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  152. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  153. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  154. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  155. evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
  156. evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
  157. evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
  158. evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
  159. evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
  160. evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  161. evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
  162. evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
  163. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  164. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  165. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  166. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
  167. evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
  168. evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
  169. evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
  170. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  171. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  172. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  173. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  174. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  175. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  176. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  177. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  178. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  179. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  180. evalscope/benchmarks/piqa/__init__.py +0 -0
  181. evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
  182. evalscope/benchmarks/poly_math/__init__.py +0 -0
  183. evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
  184. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  185. evalscope/benchmarks/pope/__init__.py +0 -0
  186. evalscope/benchmarks/pope/pope_adapter.py +112 -0
  187. evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
  188. evalscope/benchmarks/pumed_qa/__init__.py +0 -0
  189. evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
  190. evalscope/benchmarks/qasc/__init__.py +0 -0
  191. evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
  192. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  193. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  194. evalscope/benchmarks/sciq/__init__.py +0 -0
  195. evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
  196. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  197. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  198. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
  199. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  200. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  201. evalscope/benchmarks/siqa/__init__.py +0 -0
  202. evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
  203. evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
  204. evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
  205. evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
  206. evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
  207. evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
  208. evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
  209. evalscope/benchmarks/text2image/__init__.py +0 -0
  210. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  211. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  212. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  213. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  214. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  215. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
  216. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  217. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  218. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  219. evalscope/benchmarks/wmt/__init__.py +0 -0
  220. evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
  221. evalscope/benchmarks/zerobench/__init__.py +0 -0
  222. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  223. evalscope/cli/start_app.py +7 -1
  224. evalscope/cli/start_perf.py +7 -1
  225. evalscope/config.py +103 -18
  226. evalscope/constants.py +18 -0
  227. evalscope/evaluator/evaluator.py +138 -82
  228. evalscope/metrics/bert_score/__init__.py +0 -0
  229. evalscope/metrics/bert_score/scorer.py +338 -0
  230. evalscope/metrics/bert_score/utils.py +697 -0
  231. evalscope/metrics/llm_judge.py +19 -7
  232. evalscope/metrics/math_parser.py +14 -0
  233. evalscope/metrics/metric.py +317 -13
  234. evalscope/metrics/metrics.py +37 -0
  235. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  236. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  237. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  238. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  239. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  240. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  241. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  242. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  243. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  244. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  245. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  246. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  247. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  248. evalscope/models/image_edit_model.py +125 -0
  249. evalscope/models/model_apis.py +22 -0
  250. evalscope/models/openai_compatible.py +21 -0
  251. evalscope/models/text2image_model.py +2 -2
  252. evalscope/models/utils/openai.py +16 -6
  253. evalscope/perf/arguments.py +26 -4
  254. evalscope/perf/benchmark.py +76 -89
  255. evalscope/perf/http_client.py +31 -16
  256. evalscope/perf/main.py +15 -2
  257. evalscope/perf/plugin/api/base.py +9 -7
  258. evalscope/perf/plugin/api/custom_api.py +13 -58
  259. evalscope/perf/plugin/api/default_api.py +188 -79
  260. evalscope/perf/plugin/api/openai_api.py +85 -20
  261. evalscope/perf/plugin/datasets/base.py +21 -0
  262. evalscope/perf/plugin/datasets/custom.py +2 -3
  263. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  264. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  265. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  266. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  267. evalscope/perf/plugin/datasets/openqa.py +2 -4
  268. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  269. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  270. evalscope/perf/utils/benchmark_util.py +43 -27
  271. evalscope/perf/utils/db_util.py +14 -19
  272. evalscope/perf/utils/local_server.py +3 -44
  273. evalscope/perf/utils/log_utils.py +21 -6
  274. evalscope/report/__init__.py +13 -3
  275. evalscope/report/combinator.py +91 -20
  276. evalscope/report/generator.py +8 -87
  277. evalscope/report/report.py +8 -4
  278. evalscope/run.py +13 -5
  279. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  280. evalscope/utils/argument_utils.py +1 -1
  281. evalscope/utils/chat_service.py +1 -1
  282. evalscope/utils/function_utils.py +249 -12
  283. evalscope/utils/import_utils.py +73 -1
  284. evalscope/utils/io_utils.py +132 -7
  285. evalscope/utils/json_schema.py +25 -2
  286. evalscope/utils/logger.py +69 -18
  287. evalscope/utils/model_utils.py +4 -3
  288. evalscope/utils/multi_choices.py +39 -7
  289. evalscope/utils/ner.py +377 -0
  290. evalscope/version.py +2 -2
  291. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
  292. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
  293. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
  294. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
  295. evalscope/api/mixin/dataset_mixin.py +0 -105
  296. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  297. tests/__init__.py +0 -1
  298. tests/aigc/__init__.py +0 -1
  299. tests/aigc/test_t2i.py +0 -142
  300. tests/benchmark/__init__.py +0 -1
  301. tests/benchmark/test_eval.py +0 -386
  302. tests/cli/__init__.py +0 -1
  303. tests/cli/test_all.py +0 -229
  304. tests/cli/test_collection.py +0 -96
  305. tests/cli/test_custom.py +0 -268
  306. tests/perf/__init__.py +0 -1
  307. tests/perf/test_perf.py +0 -176
  308. tests/rag/test_clip_benchmark.py +0 -90
  309. tests/rag/test_mteb.py +0 -213
  310. tests/rag/test_ragas.py +0 -128
  311. tests/swift/__init__.py +0 -1
  312. tests/swift/test_run_swift_eval.py +0 -146
  313. tests/swift/test_run_swift_vlm_eval.py +0 -128
  314. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  315. tests/test_run_all.py +0 -12
  316. tests/utils.py +0 -13
  317. tests/vlm/__init__.py +0 -1
  318. tests/vlm/test_vlmeval.py +0 -102
  319. /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
  320. /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
  321. /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
  322. {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
  323. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
  324. {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/utils/logger.py CHANGED
@@ -28,22 +28,41 @@ logging.getLogger('datasets').setLevel(logging.WARNING)
  logging.getLogger('httpx').setLevel(logging.WARNING)
  logging.getLogger('modelscope').setLevel(logging.ERROR)
 
+ info_set = set()
+ warning_set = set()
+
+
+ def info_once(self, msg, *args, **kwargs):
+     hash_id = kwargs.get('hash_id') or msg
+     if hash_id in info_set:
+         return
+     info_set.add(hash_id)
+     self.info(msg)
+
+
+ def warning_once(self, msg, *args, **kwargs):
+     hash_id = kwargs.get('hash_id') or msg
+     if hash_id in warning_set:
+         return
+     warning_set.add(hash_id)
+     self.warning(msg)
+
 
  def get_logger(
      log_file: Optional[str] = None,
      name: Optional[str] = None,
      log_level: int = DEFAULT_LEVEL,
      file_mode: str = 'w',
-     force=False
+     force: bool = False,
  ):
      """Get logging logger
 
      Args:
-         log_file: Log filename, if specified, file handler will be added to
-             logger
-         log_level: Logging level.
-         file_mode: Specifies the mode to open the file, if filename is
-             specified (if filemode is unspecified, it defaults to 'w').
+         log_file: Log filename. If specified, a file handler will be added to the logger.
+         name: Logical component name. Used to derive the logger name.
+         log_level: Logging level to set.
+         file_mode: Mode to open the file when log_file is provided (default 'w').
+         force: If True, reconfigure the existing logger (levels, formatters, handlers).
      """
 
      if name:
@@ -58,7 +77,7 @@ def get_logger(
          logger.setLevel(log_level)
          for handler in logger.handlers:
              handler.setLevel(log_level)
-             # 区分不同类型的 handler,使用相应的格式化器
+             # Select formatter by handler type
              if isinstance(handler, logging.FileHandler):
                  handler.setFormatter(
                      plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter
@@ -67,6 +86,7 @@ def get_logger(
                  handler.setFormatter(
                      color_detailed_formatter if log_level == logging.DEBUG else color_simple_formatter
                  )
+         # Ensure file handler points to current log_file (replace if needed)
          add_file_handler_if_needed(logger, log_file, file_mode, log_level)
          return logger
 
@@ -88,7 +108,7 @@ def get_logger(
      handlers = [stream_handler]
 
      if is_worker0 and log_file is not None:
-         file_handler = logging.FileHandler(log_file, file_mode)
+         file_handler = logging.FileHandler(log_file, file_mode, encoding='utf-8')
          handlers.append(file_handler)
 
      for handler in handlers:
@@ -118,23 +138,54 @@ def configure_logging(debug: bool, log_file: Optional[str] = None):
          get_logger(log_level=logging.DEBUG, force=True)
 
 
- def add_file_handler_if_needed(logger, log_file, file_mode, log_level):
-     for handler in logger.handlers:
-         if isinstance(handler, logging.FileHandler):
-             return
+ def add_file_handler_if_needed(
+     logger: logging.Logger,
+     log_file: Optional[str],
+     file_mode: str,
+     log_level: int,
+ ) -> None:
+     """Ensure logger has a FileHandler targeting log_file.
+     - If no FileHandler exists, add one.
+     - If a FileHandler exists but points to a different file, replace it.
+     """
+     if log_file is None:
+         return
 
+     # Only worker-0 writes files
      if iutil.find_spec('torch') is not None:
          from modelscope.utils.torch_utils import is_master
-
          is_worker0 = is_master()
      else:
          is_worker0 = True
 
-     if is_worker0 and log_file is not None:
-         file_handler = logging.FileHandler(log_file, file_mode)
-         file_handler.setFormatter(plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter)
-         file_handler.setLevel(log_level)
-         logger.addHandler(file_handler)
+     if not is_worker0:
+         return
+
+     target_path = os.path.abspath(log_file)
+     existing_file_handlers = [h for h in logger.handlers if isinstance(h, logging.FileHandler)]
+
+     # If there is a FileHandler already pointing to the target file, nothing to do.
+     for fh in existing_file_handlers:
+         try:
+             if os.path.abspath(getattr(fh, 'baseFilename', '')) == target_path:
+                 return
+         except Exception:
+             # If any issue retrieving baseFilename, fall through to replacement
+             pass
+
+     # Replace all existing FileHandlers with the new one
+     for fh in existing_file_handlers:
+         try:
+             logger.removeHandler(fh)
+             fh.flush()
+             fh.close()
+         except Exception:
+             pass
+
+     file_handler = logging.FileHandler(target_path, file_mode, encoding='utf-8')
+     file_handler.setFormatter(plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter)
+     file_handler.setLevel(log_level)
+     logger.addHandler(file_handler)
 
 
  def warn_once(logger: Logger, message: str) -> None:
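
Taken together, the once-only helpers and the file-handler replacement above change what repeated get_logger calls do. A minimal usage sketch (not part of the diff; it calls warning_once as the free function defined above, since its binding onto Logger is not shown in this hunk):

# Sketch of the 1.2.0 logger behaviour shown in the hunks above (illustrative only).
from evalscope.utils.logger import get_logger, warning_once

logger = get_logger(log_file='run_a.log')
logger.info('goes to run_a.log')

# Re-requesting the logger with a different file now swaps the FileHandler to the new
# target; previously add_file_handler_if_needed returned early if any FileHandler
# existed, so run_b.log would have been ignored.
logger = get_logger(log_file='run_b.log')
logger.info('goes to run_b.log')

# De-duplicated warnings: the second call is a no-op because the hash_id repeats.
# warning_once is defined as a free function taking `self`, so the logger is passed explicitly.
warning_once(logger, 'falling back to the default judge model', hash_id='judge-fallback')
warning_once(logger, 'falling back to the default judge model', hash_id='judge-fallback')
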
evalscope/utils/model_utils.py CHANGED
@@ -3,6 +3,8 @@ import random
  from enum import Enum
  from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
 
+ from evalscope.utils.import_utils import check_import
+
  if TYPE_CHECKING:
      from transformers import GenerationConfig
 
@@ -67,7 +69,8 @@ def seed_everything(seed: int):
      """
      random.seed(seed)
      np.random.seed(seed)
-     try:
+
+     if check_import('torch', raise_warning=False):
          import torch
 
          torch.manual_seed(seed)
@@ -75,5 +78,3 @@ def seed_everything(seed: int):
          torch.cuda.manual_seed_all(seed)
          torch.backends.cudnn.deterministic = True
          torch.backends.cudnn.benchmark = False
-     except ImportError:
-         pass
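
seed_everything now gates the torch-specific seeding on check_import instead of a broad try/except ImportError. A small sketch of the same pattern (illustrative only; only the check_import('torch', raise_warning=False) call is taken from the diff, the helper name is invented):

# Sketch of the check_import gating used above (not part of the diff).
from evalscope.utils.import_utils import check_import


def seed_torch_if_available(seed: int) -> None:
    # raise_warning=False keeps the call silent when torch is not installed.
    if check_import('torch', raise_warning=False):
        import torch
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
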
evalscope/utils/multi_choices.py CHANGED
@@ -1,11 +1,8 @@
  # flake8: noqa: E501
- from __future__ import annotations
-
  import re
- from typing import TYPE_CHECKING, List, Optional
+ from typing import List, Optional, Union
 
- if TYPE_CHECKING:
-     from evalscope.api.evaluator import Choices, Target, TaskState
+ from evalscope.api.evaluator import Choices, Target, TaskState
 
  FEW_SHOT_TEMPLATE = r"""Here are some examples of how to answer similar questions:
 
@@ -84,10 +81,27 @@ def answer_options(choices: Choices) -> str:
      return '\n'.join([f'{answer_character(i)}) {choices[j].value}' for i, j in enumerate(indexes)])
 
 
- def prompt(question: str, choices: Choices, template: str, fewshot: Optional[str] = None) -> str:
+ def format_letter_choices(choices: Union[Choices, List[str]]) -> str:
+     """
+     Returns the `choices` formatted as a letter list, e.g.:
+
+     ["choice 1", "choice 2", "choice 3"] ->
+     "A,B,C"
+     """
+     if isinstance(choices, list):
+         choices = Choices(choices)
+
+     indexes = list(range(len(choices)))
+
+     return ','.join([f'{answer_character(i)}' for i in indexes])
+
+
+ def prompt(question: str, choices: Union[Choices, List[str]], template: str, fewshot: Optional[str] = None) -> str:
+     if isinstance(choices, list):
+         choices = Choices(choices)
 
      choices_text = answer_options(choices)
-     letters = ','.join(answer_character(i) for i in range(len(choices)))
+     letters = format_letter_choices(choices)
      if not fewshot:
          return template.format(
              choices=choices_text,
@@ -122,6 +136,14 @@ def format_example(
      return f'{question}\n{choices_text}\nANSWER: {answer.text}'
 
 
+ def _fallback_parse_answer(completion: str) -> Optional[set[str]]:
+     # Fallback to find the last upper case letter
+     for letter in reversed(completion):
+         if letter.isupper():
+             return {letter}
+     return None
+
+
  def parse_answers(state: TaskState, multiple_correct: bool = False) -> set[str]:
      """
      Convenience function for extracting answers from the state output.
@@ -150,6 +172,11 @@ def parse_answers(state: TaskState, multiple_correct: bool = False) -> set[str]:
          state.output.completion,
      )
 
+     if match is None:
+         fallback_answer = _fallback_parse_answer(state.output.completion)
+         if fallback_answer:
+             return fallback_answer
+
      if match is None:
          return set()
 
@@ -200,6 +227,11 @@ def parse_answers_zh(state: TaskState, multiple_correct: bool = False) -> set[st
      pattern = r'答案\s*[::]\s*([A-Za-z0-9,,]+)'
      match = re.search(pattern, state.output.completion, flags=re.MULTILINE)
 
+     if match is None:
+         fallback_answer = _fallback_parse_answer(state.output.completion)
+         if fallback_answer:
+             return fallback_answer
+
      if match is None:
          return set()
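
With these changes, format_letter_choices and prompt accept plain Python lists as well as Choices. A short sketch (not part of the diff; the template string and its {question}/{letters} placeholders are assumptions, only the {choices} key is visible in the hunk above):

# Sketch of the list-friendly multiple-choice helpers (illustrative only).
from evalscope.utils.multi_choices import format_letter_choices, prompt

letters = format_letter_choices(['yes', 'no', 'maybe'])  # -> 'A,B,C'

text = prompt(
    question='Is the sky blue?',
    choices=['yes', 'no', 'maybe'],
    # Hypothetical template: assumes the {question}/{choices}/{letters} placeholders
    # that the helper fills in; the real benchmark templates live elsewhere in the package.
    template='{question}\n{choices}\nAnswer with one of: {letters}',
)
print(text)
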
 
evalscope/utils/ner.py ADDED
@@ -0,0 +1,377 @@
+ import re
+ from typing import Any, Dict, List, Set, Tuple
+
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ PROMPT_TEMPLATE = """
+ You are a named entity recognition system that identifies the following entity types:
+ {entities}
+
+ Process the provided text and mark all named entities with XML-style tags.
+
+ For example:
+ <person>John Smith</person> works at <organization>Google</organization> in <location>Mountain View</location>.
+
+ Available entity tags: {entity_list}
+
+ INSTRUCTIONS:
+ 1. Wrap your entire response in <response>...</response> tags.
+ 2. Inside these tags, include the original text with entity tags inserted.
+ 3. Do not change the original text in any way (preserve spacing, punctuation, case, etc.).
+ 4. Tag ALL entities you can identify using the exact tag names provided.
+ 5. Do not include explanations, just the tagged text.
+ 6. If entity spans overlap, choose the most specific entity type.
+ 7. Ensure every opening tag has a matching closing tag.
+
+ Text to process:
+ {text}
+ """.lstrip()
+
+ FEWSHOT_TEMPLATE = """
+ Here are some examples of named entity recognition:
+
+ {fewshot}
+
+ You are a named entity recognition system that identifies the following entity types:
+ {entities}
+
+ Process the provided text and mark all named entities with XML-style tags.
+
+ For example:
+ <person>John Smith</person> works at <organization>Google</organization> in <location>Mountain View</location>.
+
+ Available entity tags: {entity_list}
+
+ INSTRUCTIONS:
+ 1. Wrap your entire response in <response>...</response> tags.
+ 2. Inside these tags, include the original text with entity tags inserted.
+ 3. Do not change the original text in any way (preserve spacing, punctuation, case, etc.).
+ 4. Tag ALL entities you can identify using the exact tag names provided.
+ 5. Do not include explanations, just the tagged text.
+ 6. If entity spans overlap, choose the most specific entity type.
+ 7. Ensure every opening tag has a matching closing tag.
+
+ Text to process:
+ {text}
+ """.lstrip()
+
+ # Common error patterns to handle in XML predictions
+ DEFAULT_TAG_FIX_PATTERNS = [
+     # Fix mismatched tags
+     (r'<(\w+)>(.*?)</\w+>', r'<\1>\2</\1>'),
+ ]
+
+
+ def create_target_text(tokens: List[str], ner_tags: List[str], entity_type_map: Dict[str, str]) -> str:
+     """
+     Create annotated text from tokens and NER tags.
+     Handles BIO tagging scheme conversion to inline XML-style tags.
+
+     Args:
+         tokens: List of text tokens
+         ner_tags: List of BIO tags corresponding to tokens
+         entity_type_map: Mapping from BIO entity types to user-friendly tag names
+
+     Returns:
+         String with XML-style entity markup wrapped in <response> tags
+     """
+     result = []
+     current_entity = None
+     entity_tokens = []
+
+     for i, (token, tag) in enumerate(zip(tokens, ner_tags)):
+         if tag.startswith('B-'):  # Beginning of entity
+             # Close previous entity if exists
+             if current_entity:
+                 entity_type = entity_type_map.get(current_entity, '')
+                 if entity_type:
+                     result.append(f'<{entity_type.lower()}>{" ".join(entity_tokens)}</{entity_type.lower()}>')
+                 else:
+                     result.append(' '.join(entity_tokens))
+                 entity_tokens = []
+
+             current_entity = tag[2:]  # Remove B- prefix
+             entity_tokens.append(token)
+         elif tag.startswith('I-') and current_entity and tag[2:] == current_entity:  # Inside entity
+             entity_tokens.append(token)
+         else:  # Outside any entity (O tag)
+             if current_entity:  # Close previous entity
+                 entity_type = entity_type_map.get(current_entity, '')
+                 if entity_type:
+                     result.append(f'<{entity_type.lower()}>{" ".join(entity_tokens)}</{entity_type.lower()}>')
+                 else:
+                     result.append(' '.join(entity_tokens))
+                 current_entity = None
+                 entity_tokens = []
+
+             result.append(token)
+
+     # Handle any remaining entity at end of sequence
+     if current_entity:
+         entity_type = entity_type_map.get(current_entity, '')
+         if entity_type:
+             result.append(f'<{entity_type.lower()}>{" ".join(entity_tokens)}</{entity_type.lower()}>')
+         else:
+             result.append(' '.join(entity_tokens))
+
+     # Wrap the entire response in <response> tags as required by the pipeline
+     return f'<response>{" ".join(result)}</response>'
+
+
+ def clean_prediction(text: str, tag_fix_patterns: List[Tuple[str, str]] = None) -> str:
+     """
+     Clean and fix common XML errors in model predictions.
+
+     Args:
+         text: The prediction text to clean
+         tag_fix_patterns: List of regex patterns and replacements to fix common XML errors
+
+     Returns:
+         Cleaned text with fixed XML tags
+     """
+     if tag_fix_patterns is None:
+         tag_fix_patterns = DEFAULT_TAG_FIX_PATTERNS
+
+     cleaned = text
+
+     # Extract content from response tags if present
+     response_match = re.search(r'<response>(.*?)</response>', cleaned, re.DOTALL)
+     if response_match:
+         cleaned = response_match.group(1)
+
+     # Apply fix patterns for common XML errors
+     for pattern, replacement in tag_fix_patterns:
+         cleaned = re.sub(pattern, replacement, cleaned)
+
+     return cleaned
+
+
+ def extract_entities_from_text(text: str, reverse_entity_map: Dict[str, str]) -> List[Tuple]:
+     """
+     Extract entities from tagged text with robust error handling.
+
+     Args:
+         text: Text with XML entity tags
+         reverse_entity_map: Mapping from user-friendly tag names to BIO entity types
+
+     Returns:
+         List of (entity_type, entity_text, start_idx, end_idx) tuples
+     """
+     entities = []
+
+     # Define regex pattern to find XML-style entity tags - handle potential errors
+     pattern = r'<(\w+)>(.*?)</\1>'
+
+     try:
+         for match in re.finditer(pattern, text):
+             entity_type = match.group(1).lower()  # Normalize type to lowercase
+             entity_text = match.group(2)
+             start_idx = match.start()
+             end_idx = match.end()
+
+             # Map back to entity types if possible
+             mapped_type = reverse_entity_map.get(entity_type)
+
+             if mapped_type:
+                 entities.append((mapped_type, entity_text, start_idx, end_idx))
+             else:
+                 # Unknown entity type but still count it for evaluation
+                 entities.append((entity_type, entity_text, start_idx, end_idx))
+
+     except Exception as e:
+         logger.warning(f'Error parsing entities in text: {str(e)}')
+
+     # Handle malformed XML by trying to find additional tag patterns
+     # This is a fallback for when the model produces incorrect tags
+     unclosed_pattern = r'<(\w+)>(.*?)(?=<|$)'
+     try:
+         # Find potential unclosed tags
+         for match in re.finditer(unclosed_pattern, text):
+             # Skip if already part of a well-formed tag
+             if any(start_idx <= match.start() < end_idx for _, _, start_idx, end_idx in entities):
+                 continue
+
+             entity_type = match.group(1).lower()
+             entity_text = match.group(2)
+             start_idx = match.start()
+             end_idx = match.end()
+
+             # Map back to entity types
+             mapped_type = reverse_entity_map.get(entity_type)
+             if mapped_type:
+                 entities.append((mapped_type, entity_text, start_idx, end_idx))
+
+     except Exception as e:
+         logger.warning(f'Error handling malformed tags: {str(e)}')
+
+     return entities
+
+
+ def xml_to_bio_tags(xml_text: str, original_tokens: List[str], reverse_entity_map: Dict[str, str]) -> List[str]:
+     """
+     Convert XML-annotated text back to BIO tags aligned with the original tokens.
+
+     Args:
+         xml_text: Text with XML entity annotations
+         original_tokens: Original tokens to align with
+         reverse_entity_map: Mapping from user-friendly tag names to BIO entity types
+
+     Returns:
+         List of BIO tags corresponding to the original tokens
+     """
+     # Extract entities with their character positions
+     entities = extract_entities_from_text(xml_text, reverse_entity_map)
+
+     # Initialize all tags as 'O'
+     bio_tags = ['O'] * len(original_tokens)
+
+     # Reconstruct the original text to find character positions for each token
+     original_text = ' '.join(original_tokens)
+
+     # Track token start positions in the original text
+     token_positions = []
+     pos = 0
+     for token in original_tokens:
+         token_pos = original_text.find(token, pos)
+         if token_pos == -1:
+             # Fallback: just use the current position if we can't find the exact match
+             token_positions.append(pos)
+         else:
+             token_positions.append(token_pos)
+             pos = token_pos + len(token)
+
+     # Add token end positions
+     token_ends = [pos + len(token) for pos, token in zip(token_positions, original_tokens)]
+
+     # Map entities to tokens based on character positions
+     for entity_type, entity_text, start_pos, end_pos in entities:
+         # Extract the context from the XML text to help locate the correct entity occurrence
+         # Get some context before and after the entity in the XML text
+         context_start = max(0, start_pos - 20)
+         context_end = min(len(xml_text), end_pos + 20)
+
+         # Extract context without XML tags
+         context_before = re.sub(r'<[^>]+>', '', xml_text[context_start:start_pos])
+         context_after = re.sub(r'<[^>]+>', '', xml_text[end_pos:context_end])
+
+         # Use context to find the correct entity position in original text
+         search_pos = 0
+         entity_start = -1
+
+         while search_pos < len(original_text):
+             # Find the next occurrence of the entity
+             potential_start = original_text.find(entity_text, search_pos)
+             if potential_start == -1:
+                 break
+
+             # Check if the context matches
+             potential_context_start = max(0, potential_start - len(context_before))
+             potential_context_end = min(len(original_text), potential_start + len(entity_text) + len(context_after))
+
+             before_match = context_before.strip() in original_text[potential_context_start:potential_start].strip()
+             after_match = context_after.strip() in original_text[potential_start
+                                                                  + len(entity_text):potential_context_end].strip()
+
+             # If context matches or we can't find a better match, use this position
+             if before_match or after_match or search_pos > len(original_text) // 2:
+                 entity_start = potential_start
+                 break
+
+             # Move search position forward
+             search_pos = potential_start + 1
+
+         # If we couldn't find the entity with context, fall back to the first occurrence
+         if entity_start == -1:
+             entity_start = original_text.find(entity_text)
+             if entity_start == -1:
+                 continue
+
+         entity_end = entity_start + len(entity_text)
+
+         # Find tokens that overlap with this entity
+         for i, (token_start, token_end) in enumerate(zip(token_positions, token_ends)):
+             if token_start <= entity_end and token_end >= entity_start:
+                 # This token overlaps with the entity
+                 if bio_tags[i] == 'O':
+                     # Start of entity
+                     if i == 0 or bio_tags[i - 1] == 'O' or not bio_tags[i - 1].endswith(entity_type):
+                         bio_tags[i] = f'B-{entity_type}'
+                     else:
+                         # Continuation of entity
+                         bio_tags[i] = f'I-{entity_type}'
+
+     return bio_tags
+
+
+ def calculate_bio_metrics(pred_tags: List[str], gold_tags: List[str], tokens: List[str]) -> Tuple[int, int, int]:
+     """
+     Calculate metrics by comparing BIO tag sequences.
+
+     Args:
+         pred_tags: Predicted BIO tags
+         gold_tags: Gold standard BIO tags
+         tokens: Original tokens
+
+     Returns:
+         Tuple of (true_positives, false_positives, false_negatives)
+     """
+     # Extract entity spans from BIO tags
+     pred_spans = extract_spans_from_bio(pred_tags, tokens)
+     gold_spans = extract_spans_from_bio(gold_tags, tokens)
+
+     # Calculate metrics
+     true_positives = len(pred_spans.intersection(gold_spans))
+     false_positives = len(pred_spans - gold_spans)
+     false_negatives = len(gold_spans - pred_spans)
+
+     return true_positives, false_positives, false_negatives
+
+
+ def extract_spans_from_bio(tags: List[str], tokens: List[str]) -> Set[Tuple]:
+     """
+     Extract entity spans from BIO tags.
+
+     Args:
+         tags: List of BIO tags
+         tokens: List of tokens corresponding to the tags
+
+     Returns:
+         Set of (entity_type, start_idx, end_idx, text) tuples
+     """
+     spans = set()
+     current_entity = None
+     start_idx = None
+     entity_tokens = []
+
+     for i, (token, tag) in enumerate(zip(tokens, tags)):
+         if tag.startswith('B-'):  # Beginning of entity
+             # Close previous entity if exists
+             if current_entity:
+                 entity_type = current_entity
+                 entity_text = ' '.join(entity_tokens)
+                 spans.add((entity_type, start_idx, i - 1, entity_text))
+                 entity_tokens = []
+
+             current_entity = tag[2:]  # Remove B- prefix
+             start_idx = i
+             entity_tokens.append(token)
+         elif tag.startswith('I-') and current_entity:  # Inside entity
+             entity_tokens.append(token)
+         elif tag == 'O':  # Outside any entity
+             if current_entity:  # Close previous entity
+                 entity_type = current_entity
+                 entity_text = ' '.join(entity_tokens)
+                 spans.add((entity_type, start_idx, i - 1, entity_text))
+                 current_entity = None
+                 start_idx = None
+                 entity_tokens = []
+
+     # Handle any remaining entity at end of sequence
+     if current_entity:
+         entity_type = current_entity
+         entity_text = ' '.join(entity_tokens)
+         spans.add((entity_type, start_idx, len(tokens) - 1, entity_text))
+
+     return spans
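
A small end-to-end sketch of the new NER helpers above (not part of the diff; the tokens and entity maps are invented for illustration):

# Round-trip the helpers from evalscope/utils/ner.py on a toy sentence (illustrative only).
from evalscope.utils.ner import (calculate_bio_metrics, clean_prediction, create_target_text, xml_to_bio_tags)

tokens = ['John', 'Smith', 'works', 'at', 'Google', '.']
gold_tags = ['B-PER', 'I-PER', 'O', 'O', 'B-ORG', 'O']
entity_type_map = {'PER': 'person', 'ORG': 'organization'}     # BIO type -> tag name
reverse_entity_map = {'person': 'PER', 'organization': 'ORG'}  # tag name -> BIO type

# Reference string the model is asked to reproduce:
# '<response><person>John Smith</person> works at <organization>Google</organization> .</response>'
target = create_target_text(tokens, gold_tags, entity_type_map)

# Treat the reference as a model completion: strip/repair the XML, then map back to BIO tags.
pred_tags = xml_to_bio_tags(clean_prediction(target), tokens, reverse_entity_map)

tp, fp, fn = calculate_bio_metrics(pred_tags, gold_tags, tokens)
print(tp, fp, fn)  # 2 0 0 for this toy round trip
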
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
 
- __version__ = '1.0.0'
- __release_datetime__ = '2025-08-25 12:00:00'
+ __version__ = '1.2.0'
+ __release_datetime__ = '2025-11-11 12:00:00'