evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py

@@ -1,12 +1,15 @@
+ import os
  from itertools import product
  from tqdm import tqdm
- from typing import TYPE_CHECKING, List, Union
+ from typing import TYPE_CHECKING, Any, Dict, List, Union

- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import AnswerKeys, EvalType
- from evalscope.metrics import LLMJudge, exact_match
- from evalscope.metrics.metrics import mean
- from evalscope.utils import get_logger
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import DatasetDict, DictDataLoader, MemoryDataset, Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.metric import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger

  if TYPE_CHECKING:
      from evalscope.report import Report
@@ -26,55 +29,66 @@ PROMPT_TEMPLATE = """Please read the following text and answer the question belo
  Don't give information outside the document or repeat your findings."""


- @Benchmark.register(
-     name='needle_haystack',
-     pretty_name='Needle-in-a-Haystack',
-     tags=['Retrieval', 'Long Context'],
-     description='Needle in a Haystack is a benchmark focused on information retrieval tasks. '
-     'It requires the model to find specific information within a large corpus of text. '
-     '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/needle_haystack.html)',  # noqa: E501
-     dataset_id='AI-ModelScope/Needle-in-a-Haystack-Corpus',
-     metric_list=['AverageAccuracy'],
-     subset_list=['english', 'chinese'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='test',
-     system_prompt='You are a helpful AI bot that answers questions for a user. Keep your response short and direct',
-     prompt_template=PROMPT_TEMPLATE,
-     extra_params={
-         'retrieval_question': 'What is the best thing to do in San Francisco?',
-         'needles':
-         ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n'],
-         'context_lengths_min': 1000,
-         'context_lengths_max': 32000,
-         'context_lengths_num_intervals': 10,
-         'document_depth_percent_min': 0,
-         'document_depth_percent_max': 100,
-         'document_depth_percent_intervals': 10,
-         'tokenizer_path': 'Qwen/Qwen3-0.6B',
-         'show_score': False,
-     })
- class NeedleHaystackAdapter(DataAdapter):
+ @register_benchmark(
+     BenchmarkMeta(
+         name='needle_haystack',
+         pretty_name='Needle-in-a-Haystack',
+         tags=[Tags.RETRIEVAL, Tags.LONG_CONTEXT],
+         description='Needle in a Haystack is a benchmark focused on information retrieval tasks. '
+         'It requires the model to find specific information within a large corpus of text. '
+         '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/needle_haystack.html)',  # noqa: E501
+         dataset_id='AI-ModelScope/Needle-in-a-Haystack-Corpus',
+         metric_list=['acc'],
+         subset_list=['english', 'chinese'],
+         eval_split='test',
+         system_prompt='You are a helpful AI bot that answers questions for a user. Keep your response short and direct',
+         prompt_template=PROMPT_TEMPLATE,
+         extra_params={
+             'retrieval_question':
+             'What is the best thing to do in San Francisco?',
+             'needles':
+             ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n'],
+             'context_lengths_min':
+             1000,
+             'context_lengths_max':
+             32000,
+             'context_lengths_num_intervals':
+             10,
+             'document_depth_percent_min':
+             0,
+             'document_depth_percent_max':
+             100,
+             'document_depth_percent_intervals':
+             10,
+             'tokenizer_path':
+             'Qwen/Qwen3-0.6B',
+             'show_score':
+             False,
+         }
+     )
+ )
+ class NeedleHaystackAdapter(DefaultDataAdapter):

      def __init__(self, **kwargs):
          super().__init__(**kwargs)

-         self.llm_as_a_judge = True
+         self._use_llm_judge = True
          # set extra params
-         extra_params = kwargs.get('extra_params', {})
-         self.retrieval_question = extra_params.get('retrieval_question',
-                                                     'What is the best thing to do in San Francisco?')
-         self.needles = extra_params.get(
+         self.retrieval_question = self.extra_params.get(
+             'retrieval_question', 'What is the best thing to do in San Francisco?'
+         )
+         self.needles = self.extra_params.get(
              'needles',
-             ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n'])
-         self.context_lengths_min = extra_params.get('context_lengths_min', 1000)
-         self.context_lengths_max = extra_params.get('context_lengths_max', 32000)
-         self.context_lengths_num_intervals = extra_params.get('context_lengths_num_intervals', 10)
-         self.document_depth_percent_min = extra_params.get('document_depth_percent_min', 0)
-         self.document_depth_percent_max = extra_params.get('document_depth_percent_max', 100)
-         self.document_depth_percent_intervals = extra_params.get('document_depth_percent_intervals', 10)
-         self.tokenizer_path = extra_params.get('tokenizer_path', 'Qwen/Qwen3-0.6B')
-         self.show_score = extra_params.get('show_score', False)
+             ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n']
+         )
+         self.context_lengths_min = self.extra_params.get('context_lengths_min', 1000)
+         self.context_lengths_max = self.extra_params.get('context_lengths_max', 32000)
+         self.context_lengths_num_intervals = self.extra_params.get('context_lengths_num_intervals', 10)
+         self.document_depth_percent_min = self.extra_params.get('document_depth_percent_min', 0)
+         self.document_depth_percent_max = self.extra_params.get('document_depth_percent_max', 100)
+         self.document_depth_percent_intervals = self.extra_params.get('document_depth_percent_intervals', 10)
+         self.tokenizer_path = self.extra_params.get('tokenizer_path', 'Qwen/Qwen3-0.6B')
+         self.show_score = self.extra_params.get('show_score', False)

          self._init_tokenizer()
          self._init_length()
@@ -88,65 +102,97 @@ class NeedleHaystackAdapter(DataAdapter):
                  self.context_lengths_min,
                  self.context_lengths_max,
                  num=self.context_lengths_num_intervals,
-                 endpoint=True)).astype(int)
+                 endpoint=True
+             )
+         ).astype(int)

          self.document_depth_percents = np.round(
              np.linspace(
                  self.document_depth_percent_min,
                  self.document_depth_percent_max,
                  num=self.document_depth_percent_intervals,
-                 endpoint=True)).astype(int)
+                 endpoint=True
+             )
+         ).astype(int)

      def _init_tokenizer(self):
          """ Initialize the tokenizer based on the provided tokenizer path."""
          from modelscope import AutoTokenizer
          self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_path)

-     def load(self, **kwargs):
-         # default load with snapshot
-         kwargs['file_structure'] = {'english': ['PaulGraham_Essays.txt'], 'chinese': ['Journey_to_the_West.txt']}
-         data_dict = super().load_with_snapshot(**kwargs)
-         return data_dict
-
-     def gen_prompts(self, data_dict: dict) -> dict:
-         """
-         Generate dataset prompts from raw input, unify the prompt format for different datasets.
-
-         Args:
-             data_dict: {'english': {'test': [sample_d_1, sample_d_2, ...]},
-                         'chinese': {'test': [sample_d_1, sample_d_2, ...]}}
-
-         Returns:
-             {'subset_name': [prompt_d_1, prompt_d_2, ...]}
-             prompt_d_i (dict): refer to the output of gen_prompt method.
-
-         e.g. train -- few-shot data, test -- target dataset to evaluate.
-         """
-         res_dict: dict = {}
-
-         for sub_name, sub_data_dict in data_dict.items():
-             res_dict[sub_name] = []
-             for sample_d in sub_data_dict[self.eval_split]:
-                 # Generate prompts for each sample in the dataset
-                 tokens_context = self._get_context_tokens(sample_d['text'])
+     def load(self):
+         """Load dataset from local disk or remote."""
+         dataset_name_or_path = self.dataset_id
+         if os.path.exists(dataset_name_or_path):
+             logger.info(f'Loading dataset from {dataset_name_or_path}')
+             dataset_path = dataset_name_or_path
+         else:
+             from modelscope import dataset_snapshot_download
+             logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+             dataset_path = dataset_snapshot_download(
+                 dataset_name_or_path, allow_file_pattern=['PaulGraham_Essays.txt', 'Journey_to_the_West.txt']
+             )
+
+         # Load datasets for both subsets
+         datasets = {}
+         file_structure = {'english': ['PaulGraham_Essays.txt'], 'chinese': ['Journey_to_the_West.txt']}
+
+         for subset_name, files in file_structure.items():
+             if subset_name not in self.subset_list:
+                 continue
+             file_path = os.path.join(dataset_path, files[0])
+             if os.path.exists(file_path):
+                 with open(file_path, 'r', encoding='utf-8') as f:
+                     text = f.read()
+
+                 # Generate samples for all combinations of context length and depth
+                 records = []
+                 tokens_context = self._get_context_tokens(text)
                  for context_length, depth_percent in tqdm(
-                         product(self.context_lengths, self.document_depth_percents),
-                         desc=f'Generating {sub_name} prompts'):
-                     # Insert needles into the context at the specified depth percentage
+                     product(self.context_lengths, self.document_depth_percents),
+                     desc=f'Generating {subset_name} samples'
+                 ):
                      context = self._insert_needles(tokens_context, depth_percent, context_length)
-                     # Build the input dictionary for the prompt
-                     input_d = {
+                     record = {
+                         'text': text,
                          'context_length': int(context_length),
                          'depth_percent': int(depth_percent),
                          'question': self.retrieval_question,
                          'answer': '\n'.join(self.needles),
                          'context': context,
                      }
-                     prompt_d = self.gen_prompt(input_d=input_d)
-                     prompt_d[AnswerKeys.RAW_INPUT] = input_d
-                     res_dict[sub_name].append(prompt_d)
-
-         return res_dict
+                     records.append(record)
+
+                 dataset = DictDataLoader(
+                     dict_list=records,
+                     limit=self.limit,
+                     repeats=self.repeats,
+                     sample_fields=self.record_to_sample,
+                     shuffle=self.shuffle,
+                 ).load()
+
+                 datasets[subset_name] = dataset
+
+         test_dataset = DatasetDict(datasets)
+         return test_dataset, None
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """Convert a data record to a Sample object."""
+         return Sample(
+             input=record['question'],
+             target=record['answer'],
+             metadata={
+                 'context': record['context'],
+                 'context_length': record['context_length'],
+                 'depth_percent': record['depth_percent'],
+             }
+         )
+
+     def format_prompt_template(self, sample):
+         """Format the prompt template with context and question."""
+         context = sample.metadata['context']
+         question = sample.input
+         return self.prompt_template.format(context=context, question=question)

      def _get_context_tokens(self, input_context: str) -> list:
          """
@@ -227,7 +273,8 @@ class NeedleHaystackAdapter(DataAdapter):
              # We want to make sure that we place our needle at a sentence break
              # so we first see what token a '.' is
              period_tokens = self.tokenizer.encode('.') + self.tokenizer.encode(
-                 '。')  # Handle both English and Chinese periods
+                 '。'
+             )  # Handle both English and Chinese periods

              # Then we iteration backwards until we find the first period
              while tokens_new_context and tokens_new_context[-1] not in period_tokens:
@@ -240,8 +287,10 @@ class NeedleHaystackAdapter(DataAdapter):
              # Log
              insertion_percentage = (insertion_point / len(tokens_context)) * 100
              self.insertion_percentages.append(insertion_percentage)
-             logger.debug(f"Inserted '{needle}' at {insertion_percentage:.2f}% of the context, "
-                          f'total length now: {len(tokens_context)} tokens')
+             logger.debug(
+                 f"Inserted '{needle}' at {insertion_percentage:.2f}% of the context, "
+                 f'total length now: {len(tokens_context)} tokens'
+             )

              # Adjust depth for next needle
              depth_percent += depth_percent_interval
@@ -249,84 +298,78 @@ class NeedleHaystackAdapter(DataAdapter):
          new_context = self.tokenizer.decode(tokens_context)
          return new_context

-     def gen_prompt(self, input_d: dict, **kwargs) -> dict:
-         """
-         Generate the prompt for each sample in the dataset.
-         Args:
-             input_d: A dictionary containing the input data for the prompt.
-                 It should contain 'context' and optionally 'question'.
-         Returns:
-             A dictionary containing the prompt data
-         """
-         context = input_d.get('context')
-         question = input_d.get('question')
+     def match_score(
+         self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+     ) -> Score:
+         """Calculate evaluation scores by comparing prediction with reference."""
+         from evalscope.metrics import exact_match
+         from .utils import normalize_answer

-         prompt = self.prompt_template.format(context=context, question=question)
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )

-         return self.gen_prompt_data(prompt, system_prompt=self.system_prompt)
+         # Get metadata from task state
+         context_length = task_state.metadata.get('context_length', 0)
+         depth_percent = task_state.metadata.get('depth_percent', 0)

-     def get_gold_answer(self, input_d: dict) -> str:
-         """
-         Parse the raw input labels (gold).
-         """
-         return input_d.get('answer', '').strip()
+         norm_gold = normalize_answer(reference)
+         norm_pred = normalize_answer(filtered_prediction)
+         accuracy = exact_match(gold=norm_gold, pred=norm_pred)

-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-         """
-         Parse the predicted result and extract proper answer.
-         """
-         return result
+         metric_name = f'Context#{context_length} Depth#{depth_percent}'
+         score.value = {metric_name: accuracy}
+         score.main_score_name = metric_name

-     def match(self, gold: str, pred: str) -> float:
-         """
-         Match the gold answer and the predicted answer.
-         """
-         from .utils import normalize_answer
-         norm_gold = normalize_answer(gold)
-         norm_pred = normalize_answer(pred)
-         # Use exact match for Needle in a Haystack
-         return exact_match(gold=norm_gold, pred=norm_pred)
+         return score

-     def llm_match(self, gold: str, pred: str, judge: LLMJudge, **kwargs) -> dict:
-         """
-         Use LLM as a judge to evaluate the predicted answer against the gold answer.
-         """
+     def llm_match_score(
+         self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+     ) -> Score:
+         """Use LLM as a judge to evaluate the predicted answer against the gold answer."""
          from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE, parse_score

-         raw_input = kwargs.get('raw_input', None)
-         question = raw_input.get('question')
-         context_length = raw_input.get('context_length')
-         depth_percent = raw_input.get('depth_percent')
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )

-         # get grading response
-         prompt = ORM_USER_TEMPLATE.format(question=question, gold=gold, pred=pred)
-         orm_response = judge(prompt=prompt, system_prompt=GENERAL_ORM_PROMPT)
+         # Get metadata from task state
+         context_length = task_state.metadata.get('context_length', 0)
+         depth_percent = task_state.metadata.get('depth_percent', 0)
+         question = task_state.input_text

-         # parse grading score with regex, [[score]]
-         score = parse_score(orm_response) if orm_response else 0.0
-         return {f'Context#{context_length} Depth#{depth_percent}': score}
+         # Get grading response
+         prompt = ORM_USER_TEMPLATE.format(question=question, gold=reference, pred=filtered_prediction)
+         orm_response = self.llm_judge.judge(prompt, system_prompt=GENERAL_ORM_PROMPT)

-     def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
-         """
-         compute weighted mean of the bleu score of all samples
+         # Parse grading score with regex, [[score]]
+         accuracy = parse_score(orm_response) if orm_response else 0.0

-         Args:
-             review_res_list: [score1, score2, ...]
+         metric_name = f'Context#{context_length} Depth#{depth_percent}'
+         score.value = {metric_name: accuracy}
+         score.explanation = f'LLM judge: {orm_response}'
+         score.metadata = {
+             'source': 'llm_judge',
+             'judge_strategy': getattr(self, 'judge_strategy', 'default'),
+             'model': self.llm_judge.model_id if hasattr(self.llm_judge, 'model_id') else 'unknown'
+         }
+         score.main_score_name = metric_name

-         Returns:
-             avg_res: List[dict]
+         return score

-         """
-         items = super().compute_dict_metric(review_res_list, **kwargs)
-         return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
+     def _on_generate_report(self, scores, model_name, add_aggregation_name=True):
+         # Don't add aggregation name for needle haystack adapter
+         return super()._on_generate_report(scores, model_name, False)

-     def post_process_report(self, report: 'Report', **kwargs):
+     def _on_generate_report_end(self, report: 'Report', output_dir: str, **kwargs):
          try:
              import os

              from .utils import draw_score_chat

-             report_path = kwargs.get('report_path')
+             report_path = output_dir
              data_frame = report.to_dataframe()
              # split `Metric` to `Context` and `Depth`
              data_frame[['Context', 'Depth']] = data_frame['Metric'].str.split(' ', n=1, expand=True)
@@ -336,13 +379,14 @@ class NeedleHaystackAdapter(DataAdapter):
              for subset in data_frame['Subset'].unique():
                  sub_df = data_frame[data_frame['Subset'] == subset]
                  # draw charts for each subset
-                 pivot_table = sub_df.pivot_table(
-                     values='Score', index=['Depth', 'Context'], aggfunc='mean').reset_index()
+                 pivot_table = sub_df.pivot_table(values='Score', index=['Depth', 'Context'],
                                                   aggfunc='mean').reset_index()
                  pivot_table = pivot_table.pivot(index='Depth', columns='Context', values='Score')
                  draw_score_chat(
                      pivot_table,
                      outpath=os.path.join(report_path, f'needle_haystack_heatmap_{subset}.png'),
-                     show_score=self.show_score)
+                     show_score=self.show_score
+                 )

          except Exception as e:
              logger.error(f'Error generating charts: {e}')
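
Read together, the hunks above illustrate the adapter migration that runs through most of the benchmark changes in this release: the removed @Benchmark.register(...) decorator on a DataAdapter subclass becomes @register_benchmark(BenchmarkMeta(...)) on a DefaultDataAdapter subclass, raw records are turned into Sample objects via record_to_sample, and matching returns Score objects instead of plain floats or dicts. The following is a minimal sketch of a custom adapter against the new API, using only names that appear in this diff; the benchmark name, dataset id, and the assumption that the stock DefaultDataAdapter loading and prompting paths are sufficient are hypothetical, not taken from the package.

# Hypothetical adapter sketch based on the 1.0 API surface visible in this diff.
from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark


@register_benchmark(
    BenchmarkMeta(
        name='my_qa_benchmark',              # hypothetical benchmark name
        dataset_id='my-org/my-qa-dataset',   # hypothetical ModelScope dataset id
        metric_list=['acc'],
        subset_list=['default'],
        eval_split='test',
        prompt_template='{question}',        # assumed to be applied by the default adapter
    )
)
class MyQABenchmarkAdapter(DefaultDataAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Mirror NeedleHaystackAdapter.record_to_sample above: map one raw record
        # to the Sample(input/target/metadata) structure consumed by the evaluator.
        return Sample(
            input=record['question'],
            target=record['answer'],
            metadata={k: v for k, v in record.items() if k not in ('question', 'answer')},
        )

Overriding load(), format_prompt_template(), match_score(), or llm_match_score(), as NeedleHaystackAdapter does in this diff, appears to be necessary only when the default loading, prompting, or scoring behaviour is not enough for the benchmark.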