evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py
@@ -1,12 +1,15 @@
+import os
 from itertools import product
 from tqdm import tqdm
-from typing import TYPE_CHECKING, List, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Union
 
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import AnswerKeys, EvalType
-from evalscope.metrics import LLMJudge, exact_match
-from evalscope.metrics.metrics import mean
-from evalscope.utils import get_logger
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import DatasetDict, DictDataLoader, MemoryDataset, Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
 
 if TYPE_CHECKING:
     from evalscope.report import Report
@@ -26,55 +29,66 @@ PROMPT_TEMPLATE = """Please read the following text and answer the question belo
 Don't give information outside the document or repeat your findings."""
 
 
-@Benchmark.register(
-    name='needle_haystack',
-    pretty_name='Needle-in-a-Haystack',
-    tags=['Retrieval', 'Long Context'],
-    description='Needle in a Haystack is a benchmark focused on information retrieval tasks. '
-    'It requires the model to find specific information within a large corpus of text. '
-    '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/needle_haystack.html)',  # noqa: E501
-    dataset_id='AI-ModelScope/Needle-in-a-Haystack-Corpus',
-    metric_list=['AverageAccuracy'],
-    subset_list=['english', 'chinese'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test',
-    system_prompt='You are a helpful AI bot that answers questions for a user. Keep your response short and direct',
-    prompt_template=PROMPT_TEMPLATE,
-    extra_params={
-        'retrieval_question': 'What is the best thing to do in San Francisco?',
-        'needles':
-        ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n'],
-        'context_lengths_min': 1000,
-        'context_lengths_max': 32000,
-        'context_lengths_num_intervals': 10,
-        'document_depth_percent_min': 0,
-        'document_depth_percent_max': 100,
-        'document_depth_percent_intervals': 10,
-        'tokenizer_path': 'Qwen/Qwen3-0.6B',
-        'show_score': False,
-    })
-class NeedleHaystackAdapter(DataAdapter):
+@register_benchmark(
+    BenchmarkMeta(
+        name='needle_haystack',
+        pretty_name='Needle-in-a-Haystack',
+        tags=[Tags.RETRIEVAL, Tags.LONG_CONTEXT],
+        description='Needle in a Haystack is a benchmark focused on information retrieval tasks. '
+        'It requires the model to find specific information within a large corpus of text. '
+        '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/needle_haystack.html)',  # noqa: E501
+        dataset_id='AI-ModelScope/Needle-in-a-Haystack-Corpus',
+        metric_list=['acc'],
+        subset_list=['english', 'chinese'],
+        eval_split='test',
+        system_prompt='You are a helpful AI bot that answers questions for a user. Keep your response short and direct',
+        prompt_template=PROMPT_TEMPLATE,
+        extra_params={
+            'retrieval_question':
+            'What is the best thing to do in San Francisco?',
+            'needles':
+            ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n'],
+            'context_lengths_min':
+            1000,
+            'context_lengths_max':
+            32000,
+            'context_lengths_num_intervals':
+            10,
+            'document_depth_percent_min':
+            0,
+            'document_depth_percent_max':
+            100,
+            'document_depth_percent_intervals':
+            10,
+            'tokenizer_path':
+            'Qwen/Qwen3-0.6B',
+            'show_score':
+            False,
+        }
+    )
+)
+class NeedleHaystackAdapter(DefaultDataAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-        self.llm_as_a_judge = True
+        self._use_llm_judge = True
         # set extra params
-        extra_params = kwargs.get('extra_params', {})
-        self.retrieval_question = extra_params.get('retrieval_question',
-                                                   'What is the best thing to do in San Francisco?')
-        self.needles = extra_params.get(
+        self.retrieval_question = self.extra_params.get(
+            'retrieval_question', 'What is the best thing to do in San Francisco?'
+        )
+        self.needles = self.extra_params.get(
             'needles',
-            ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n'])
-        self.context_lengths_min = extra_params.get('context_lengths_min', 1000)
-        self.context_lengths_max = extra_params.get('context_lengths_max', 32000)
-        self.context_lengths_num_intervals = extra_params.get('context_lengths_num_intervals', 10)
-        self.document_depth_percent_min = extra_params.get('document_depth_percent_min', 0)
-        self.document_depth_percent_max = extra_params.get('document_depth_percent_max', 100)
-        self.document_depth_percent_intervals = extra_params.get('document_depth_percent_intervals', 10)
-        self.tokenizer_path = extra_params.get('tokenizer_path', 'Qwen/Qwen3-0.6B')
-        self.show_score = extra_params.get('show_score', False)
+            ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n']
+        )
+        self.context_lengths_min = self.extra_params.get('context_lengths_min', 1000)
+        self.context_lengths_max = self.extra_params.get('context_lengths_max', 32000)
+        self.context_lengths_num_intervals = self.extra_params.get('context_lengths_num_intervals', 10)
+        self.document_depth_percent_min = self.extra_params.get('document_depth_percent_min', 0)
+        self.document_depth_percent_max = self.extra_params.get('document_depth_percent_max', 100)
+        self.document_depth_percent_intervals = self.extra_params.get('document_depth_percent_intervals', 10)
+        self.tokenizer_path = self.extra_params.get('tokenizer_path', 'Qwen/Qwen3-0.6B')
+        self.show_score = self.extra_params.get('show_score', False)
 
         self._init_tokenizer()
         self._init_length()
@@ -88,65 +102,93 @@ class NeedleHaystackAdapter(DataAdapter):
                 self.context_lengths_min,
                 self.context_lengths_max,
                 num=self.context_lengths_num_intervals,
-                endpoint=True)).astype(int)
+                endpoint=True
+            )
+        ).astype(int)
 
         self.document_depth_percents = np.round(
             np.linspace(
                 self.document_depth_percent_min,
                 self.document_depth_percent_max,
                 num=self.document_depth_percent_intervals,
-                endpoint=True)).astype(int)
+                endpoint=True
+            )
+        ).astype(int)
 
     def _init_tokenizer(self):
         """ Initialize the tokenizer based on the provided tokenizer path."""
         from modelscope import AutoTokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_path)
 
-    def load(self, **kwargs):
-        # default load with snapshot
-        kwargs['file_structure'] = {'english': ['PaulGraham_Essays.txt'], 'chinese': ['Journey_to_the_West.txt']}
-        data_dict = super().load_with_snapshot(**kwargs)
-        return data_dict
-
-    def gen_prompts(self, data_dict: dict) -> dict:
-        """
-        Generate dataset prompts from raw input, unify the prompt format for different datasets.
-
-        Args:
-            data_dict: {'english': {'test': [sample_d_1, sample_d_2, ...]},
-                        'chinese': {'test': [sample_d_1, sample_d_2, ...]}}
-
-        Returns:
-            {'subset_name': [prompt_d_1, prompt_d_2, ...]}
-            prompt_d_i (dict): refer to the output of gen_prompt method.
-
-        e.g. train -- few-shot data, test -- target dataset to evaluate.
-        """
-        res_dict: dict = {}
-
-        for sub_name, sub_data_dict in data_dict.items():
-            res_dict[sub_name] = []
-            for sample_d in sub_data_dict[self.eval_split]:
-                # Generate prompts for each sample in the dataset
-                tokens_context = self._get_context_tokens(sample_d['text'])
+    def load(self):
+        """Load dataset from local disk or remote."""
+        dataset_name_or_path = self.dataset_id
+        if os.path.exists(dataset_name_or_path):
+            logger.info(f'Loading dataset from {dataset_name_or_path}')
+            dataset_path = dataset_name_or_path
+        else:
+            from modelscope import dataset_snapshot_download
+            logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+            dataset_path = dataset_snapshot_download(
+                dataset_name_or_path, allow_file_pattern=['PaulGraham_Essays.txt', 'Journey_to_the_West.txt']
+            )
+
+        # Load datasets for both subsets
+        datasets = {}
+        file_structure = {'english': ['PaulGraham_Essays.txt'], 'chinese': ['Journey_to_the_West.txt']}
+
+        for subset_name, files in file_structure.items():
+            if subset_name not in self.subset_list:
+                continue
+            file_path = os.path.join(dataset_path, files[0])
+            if os.path.exists(file_path):
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    text = f.read()
+
+                # Generate samples for all combinations of context length and depth
+                records = []
+                tokens_context = self._get_context_tokens(text)
                 for context_length, depth_percent in tqdm(
-                        product(self.context_lengths, self.document_depth_percents),
-                        desc=f'Generating {sub_name} prompts'):
-                    # Insert needles into the context at the specified depth percentage
+                    product(self.context_lengths, self.document_depth_percents),
+                    desc=f'Generating {subset_name} samples'
+                ):
                     context = self._insert_needles(tokens_context, depth_percent, context_length)
-                    # Build the input dictionary for the prompt
-                    input_d = {
+                    record = {
+                        'text': text,
                         'context_length': int(context_length),
                         'depth_percent': int(depth_percent),
                         'question': self.retrieval_question,
                         'answer': '\n'.join(self.needles),
                         'context': context,
                     }
-                    prompt_d = self.gen_prompt(input_d=input_d)
-                    prompt_d[AnswerKeys.RAW_INPUT] = input_d
-                    res_dict[sub_name].append(prompt_d)
-
-        return res_dict
+                    records.append(record)
+
+                dataset = DictDataLoader(
+                    dict_list=records, limit=self.limit, repeats=self.repeats, sample_fields=self.record_to_sample
+                ).load()
+
+                datasets[subset_name] = dataset
+
+        test_dataset = DatasetDict(datasets)
+        return test_dataset, None
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a data record to a Sample object."""
+        return Sample(
+            input=record['question'],
+            target=record['answer'],
+            metadata={
+                'context': record['context'],
+                'context_length': record['context_length'],
+                'depth_percent': record['depth_percent'],
+            }
+        )
+
+    def format_prompt_template(self, sample):
+        """Format the prompt template with context and question."""
+        context = sample.metadata['context']
+        question = sample.input
+        return self.prompt_template.format(context=context, question=question)
 
     def _get_context_tokens(self, input_context: str) -> list:
         """
@@ -227,7 +269,8 @@ class NeedleHaystackAdapter(DataAdapter):
                 # We want to make sure that we place our needle at a sentence break
                 # so we first see what token a '.' is
                 period_tokens = self.tokenizer.encode('.') + self.tokenizer.encode(
-                    '。')  # Handle both English and Chinese periods
+                    '。'
+                )  # Handle both English and Chinese periods
 
                 # Then we iteration backwards until we find the first period
                 while tokens_new_context and tokens_new_context[-1] not in period_tokens:
@@ -240,8 +283,10 @@ class NeedleHaystackAdapter(DataAdapter):
             # Log
             insertion_percentage = (insertion_point / len(tokens_context)) * 100
             self.insertion_percentages.append(insertion_percentage)
-            logger.debug(f"Inserted '{needle}' at {insertion_percentage:.2f}% of the context, "
-                         f'total length now: {len(tokens_context)} tokens')
+            logger.debug(
+                f"Inserted '{needle}' at {insertion_percentage:.2f}% of the context, "
+                f'total length now: {len(tokens_context)} tokens'
+            )
 
             # Adjust depth for next needle
             depth_percent += depth_percent_interval
@@ -249,84 +294,78 @@ class NeedleHaystackAdapter(DataAdapter):
         new_context = self.tokenizer.decode(tokens_context)
         return new_context
 
-    def gen_prompt(self, input_d: dict, **kwargs) -> dict:
-        """
-        Generate the prompt for each sample in the dataset.
-        Args:
-            input_d: A dictionary containing the input data for the prompt.
-                It should contain 'context' and optionally 'question'.
-        Returns:
-            A dictionary containing the prompt data
-        """
-        context = input_d.get('context')
-        question = input_d.get('question')
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        """Calculate evaluation scores by comparing prediction with reference."""
+        from evalscope.metrics import exact_match
+        from .utils import normalize_answer
 
-        prompt = self.prompt_template.format(context=context, question=question)
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
 
-        return self.gen_prompt_data(prompt, system_prompt=self.system_prompt)
+        # Get metadata from task state
+        context_length = task_state.metadata.get('context_length', 0)
+        depth_percent = task_state.metadata.get('depth_percent', 0)
 
-    def get_gold_answer(self, input_d: dict) -> str:
-        """
-        Parse the raw input labels (gold).
-        """
-        return input_d.get('answer', '').strip()
+        norm_gold = normalize_answer(reference)
+        norm_pred = normalize_answer(filtered_prediction)
+        accuracy = exact_match(gold=norm_gold, pred=norm_pred)
 
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the predicted result and extract proper answer.
-        """
-        return result
+        metric_name = f'Context#{context_length} Depth#{depth_percent}'
+        score.value = {metric_name: accuracy}
+        score.main_score_name = metric_name
 
-    def match(self, gold: str, pred: str) -> float:
-        """
-        Match the gold answer and the predicted answer.
-        """
-        from .utils import normalize_answer
-        norm_gold = normalize_answer(gold)
-        norm_pred = normalize_answer(pred)
-        # Use exact match for Needle in a Haystack
-        return exact_match(gold=norm_gold, pred=norm_pred)
+        return score
 
-    def llm_match(self, gold: str, pred: str, judge: LLMJudge, **kwargs) -> dict:
-        """
-        Use LLM as a judge to evaluate the predicted answer against the gold answer.
-        """
+    def llm_match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        """Use LLM as a judge to evaluate the predicted answer against the gold answer."""
         from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE, parse_score
 
-        raw_input = kwargs.get('raw_input', None)
-        question = raw_input.get('question')
-        context_length = raw_input.get('context_length')
-        depth_percent = raw_input.get('depth_percent')
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
 
-        # get grading response
-        prompt = ORM_USER_TEMPLATE.format(question=question, gold=gold, pred=pred)
-        orm_response = judge(prompt=prompt, system_prompt=GENERAL_ORM_PROMPT)
+        # Get metadata from task state
+        context_length = task_state.metadata.get('context_length', 0)
+        depth_percent = task_state.metadata.get('depth_percent', 0)
+        question = task_state.input_text
 
-        # parse grading score with regex, [[score]]
-        score = parse_score(orm_response) if orm_response else 0.0
-        return {f'Context#{context_length} Depth#{depth_percent}': score}
+        # Get grading response
+        prompt = ORM_USER_TEMPLATE.format(question=question, gold=reference, pred=filtered_prediction)
+        orm_response = self.llm_judge.judge(prompt, system_prompt=GENERAL_ORM_PROMPT)
 
-    def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
-        """
-        compute weighted mean of the bleu score of all samples
+        # Parse grading score with regex, [[score]]
+        accuracy = parse_score(orm_response) if orm_response else 0.0
 
-        Args:
-            review_res_list: [score1, score2, ...]
+        metric_name = f'Context#{context_length} Depth#{depth_percent}'
+        score.value = {metric_name: accuracy}
+        score.explanation = f'LLM judge: {orm_response}'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': getattr(self, 'judge_strategy', 'default'),
+            'model': self.llm_judge.model_id if hasattr(self.llm_judge, 'model_id') else 'unknown'
+        }
+        score.main_score_name = metric_name
 
-        Returns:
-            avg_res: List[dict]
+        return score
 
-        """
-        items = super().compute_dict_metric(review_res_list, **kwargs)
-        return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
+    def _on_generate_report(self, scores, model_name, add_aggregation_name=True):
+        # Don't add aggregation name for needle haystack adapter
+        return super()._on_generate_report(scores, model_name, False)
 
-    def post_process_report(self, report: 'Report', **kwargs):
+    def _on_generate_report_end(self, report: 'Report', output_dir: str, **kwargs):
         try:
             import os
 
             from .utils import draw_score_chat
 
-            report_path = kwargs.get('report_path')
+            report_path = output_dir
             data_frame = report.to_dataframe()
             # split `Metric` to `Context` and `Depth`
             data_frame[['Context', 'Depth']] = data_frame['Metric'].str.split(' ', n=1, expand=True)
@@ -336,13 +375,14 @@ class NeedleHaystackAdapter(DataAdapter):
             for subset in data_frame['Subset'].unique():
                 sub_df = data_frame[data_frame['Subset'] == subset]
                 # draw charts for each subset
-                pivot_table = sub_df.pivot_table(
-                    values='Score', index=['Depth', 'Context'], aggfunc='mean').reset_index()
+                pivot_table = sub_df.pivot_table(values='Score', index=['Depth', 'Context'],
+                                                 aggfunc='mean').reset_index()
                 pivot_table = pivot_table.pivot(index='Depth', columns='Context', values='Score')
                 draw_score_chat(
                     pivot_table,
                     outpath=os.path.join(report_path, f'needle_haystack_heatmap_{subset}.png'),
-                    show_score=self.show_score)
+                    show_score=self.show_score
+                )
 
         except Exception as e:
             logger.error(f'Error generating charts: {e}')