evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/docmath/docmath_adapter.py
@@ -1,6 +1,14 @@
- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType
- from evalscope.metrics import LLMJudge
+ from typing import Any, Dict
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.metric import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()

  TEMPLATE_0SHOT = """Please read the following text and answer the question below.

@@ -13,73 +21,123 @@ TEMPLATE_0SHOT = """Please read the following text and answer the question below.
  Format your response as follows: "Therefore, the answer is (insert answer here)"."""


- @Benchmark.register(
-     name='docmath',
-     pretty_name='DocMath',
-     tags=['Reasoning', 'Mathematics', 'Long Context'],
-     description=
-     'DocMath-Eval is a comprehensive benchmark focused on numerical reasoning within specialized domains. It requires the model to comprehend long and specialized documents and perform numerical reasoning to answer the given question.',  # noqa: E501
-     dataset_id='yale-nlp/DocMath-Eval',
-     metric_list=['AverageAccuracy'],
-     subset_list=['complong_testmini', 'compshort_testmini', 'simplong_testmini', 'simpshort_testmini'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='test',
-     prompt_template=TEMPLATE_0SHOT,
+ @register_benchmark(
+     BenchmarkMeta(
+         name='docmath',
+         pretty_name='DocMath',
+         tags=[Tags.REASONING, Tags.MATH, Tags.LONG_CONTEXT],
+         description=
+         'DocMath-Eval is a comprehensive benchmark focused on numerical reasoning within specialized domains. It requires the model to comprehend long and specialized documents and perform numerical reasoning to answer the given question.',  # noqa: E501
+         dataset_id='yale-nlp/DocMath-Eval',
+         metric_list=['acc'],
+         subset_list=['complong_testmini', 'compshort_testmini', 'simplong_testmini', 'simpshort_testmini'],
+         eval_split='test',
+         prompt_template=TEMPLATE_0SHOT,
+     )
  )
- class DocMathAdapter(DataAdapter):
+ class DocMathAdapter(DefaultDataAdapter):

      def __init__(self, **kwargs):
          super().__init__(**kwargs)
+         self._use_llm_judge = True  # Enable LLM judge for DocMath
+         self.split_as_subset = True  # Use split as subset for DocMath

-     def load(self, **kwargs):
-         # default load mini test
-         kwargs['split_as_subset'] = True
-         data_dict = super().load(**kwargs)
-         return data_dict
-
-     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-         """
-         Generate model prompt from input data.
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
          """
-         context = context = '\n'.join(input_d['paragraphs'])
-         question = input_d['question']
-         prompt = self.prompt_template.format(context=context, question=question)
-         return self.gen_prompt_data(prompt)
+         Convert a data record to a Sample object.

-     def get_gold_answer(self, input_d: dict) -> str:
-         """
-         Parse the raw input labels (gold).
-         """
-         return input_d['ground_truth']
+         Args:
+             record (Dict[str, Any]): Input data record.

-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+         Returns:
+             Sample: Sample object with input, target, and metadata.
+         """
+         ground_truth = record['ground_truth']
+
+         return Sample(
+             input=record['question'],
+             target=str(ground_truth),
+             metadata={
+                 'question_id': record.get('question_id', ''),
+                 'paragraphs': record['paragraphs'],
+                 'answer_type': type(ground_truth).__name__
+             }
+         )
+
+     def format_prompt_template(self, sample):
+         context = '\n'.join(sample.metadata['paragraphs'])
+         question = sample.input
+         return self.prompt_template.format(context=context, question=question)
+
+     def extract_answer(self, prediction: str, task_state: TaskState):
          """
-         Parse the predicted result and extract proper answer.
+         Extract the answer from the model prediction.
          """
          from .utils import extract_answer

-         extracted_answer = extract_answer(result)
+         extracted_answer = extract_answer(prediction)
          return extracted_answer

-     def match(self, gold: str, pred: str) -> float:
+     def match_score(
+         self,
+         original_prediction: str,
+         filtered_prediction: str,
+         reference: str,
+         task_state: TaskState,
+     ) -> Score:
          """
-         Match the gold answer and the predicted answer.
+         Calculate accuracy score by matching prediction with reference.
          """
          from .utils import get_acc

-         return get_acc(prediction=pred, gt=gold)
-
-     def llm_match(self, gold: str, pred: str, judge: LLMJudge, **kwargs) -> float:
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         answer_type = task_state.metadata.get('answer_type', 'unknown')
+         accuracy = get_acc(prediction=filtered_prediction, gt=reference, answer_type=answer_type)
+         score.value = {'acc': accuracy}
+         score.main_score_name = 'acc'
+
+         return score
+
+     def llm_match_score(
+         self,
+         original_prediction: str,
+         filtered_prediction: str,
+         reference: str,
+         task_state: TaskState,
+     ) -> Score:
+         """
+         Use LLM judge to evaluate the prediction against the reference.
+         """
          from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE

-         raw_input = kwargs.get('raw_input', None)
-         question = raw_input['question']
-         # get grading response
-         prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=gold, answer_2=pred)
-         orm_response = judge(prompt=prompt, system_prompt=GENERAL_ORM_PROMPT)
-         # parse grading response
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         question = task_state.metadata.get('question', '')
+
+         # Get grading response
+         prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=reference, answer_2=filtered_prediction)
+         orm_response = self.llm_judge.judge(prompt, system_prompt=GENERAL_ORM_PROMPT)
+
+         # Parse grading response
          if 'YES' in orm_response:
-             return 1.0
+             accuracy = 1.0
          else:
-             return 0.0
+             accuracy = 0.0
+
+         score.value = {'acc': accuracy}
+         score.explanation = f'LLM judge: {orm_response}'
+         score.metadata = {
+             'source': 'llm_judge',
+             'judge_strategy': self.judge_strategy,
+             'model': self.llm_judge.model_id
+         }
+         score.main_score_name = 'acc'
+
+         return score
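
The docmath hunk above is representative of the adapter migration across this release: `@Benchmark.register(...)` on a `DataAdapter` becomes `@register_benchmark(BenchmarkMeta(...))` on a `DefaultDataAdapter`, and the old `gen_prompt`/`get_gold_answer`/`parse_pred_result`/`match` hooks become `record_to_sample`, `format_prompt_template`, `extract_answer`, and `match_score`. A minimal sketch of that pattern for a hypothetical benchmark; the name `my_docqa`, dataset id `my-org/doc-qa`, and record fields are placeholders and not part of this package:

from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags


@register_benchmark(
    BenchmarkMeta(
        name='my_docqa',                     # placeholder benchmark name
        pretty_name='MyDocQA',
        tags=[Tags.REASONING],
        description='Toy document-QA benchmark used to illustrate the 1.0.0 adapter API.',
        dataset_id='my-org/doc-qa',          # placeholder dataset id
        metric_list=['acc'],
        eval_split='test',
        prompt_template='{context}\n\nQuestion: {question}\nAnswer:',
    )
)
class MyDocQAAdapter(DefaultDataAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw dataset record to a Sample; field names are assumed here.
        return Sample(
            input=record['question'],
            target=str(record['answer']),
            metadata={'context': record['context']},
        )

    def format_prompt_template(self, sample: Sample) -> str:
        # Fill the prompt template from the sample, mirroring the DocMath adapter above.
        return self.prompt_template.format(context=sample.metadata['context'], question=sample.input)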
evalscope/benchmarks/docmath/utils.py
@@ -193,23 +193,22 @@ def compare_two_numbers(p, gt):
      return within_eps(pred=p, gt=gt)


- def get_acc(prediction, gt, cot=True):
+ def get_acc(prediction, gt, answer_type, cot=True):
      try:
          if cot:
              prediction = normalize(prediction)
          else:
              prediction = float(prediction)

-         answer_type = type(gt).__name__
          assert answer_type in ['int', 'float', 'float64', 'bool'], answer_type
          if isinstance(prediction, (str, int, float, bool)) or isinstance(prediction, list):
              # Comparing prediction against the reference
              if answer_type in ['bool']:
-                 acc = int(prediction == gt)
+                 acc = int(prediction == bool(gt))
              elif answer_type == 'int':
-                 acc = int(compare_two_numbers(prediction, gt))
+                 acc = int(compare_two_numbers(prediction, int(gt)))
              elif answer_type == 'float' or answer_type == 'float64':
-                 acc = int(compare_two_numbers(prediction, gt))
+                 acc = int(compare_two_numbers(prediction, float(gt)))
              else:
                  acc = 0
          else:
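
The `get_acc` change above reflects that in 1.0.0 the reference reaches the scorer as a string (`Sample.target`), so the answer type is passed in explicitly from sample metadata rather than inferred via `type(gt).__name__`. A call sketch under that assumption; the literal values are illustrative only:

from evalscope.benchmarks.docmath.utils import get_acc

# The adapter runs extract_answer first, so get_acc receives the filtered prediction;
# gt arrives as a string and answer_type carries the original Python type name.
accuracy = get_acc(prediction='42', gt='42', answer_type='int')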
evalscope/benchmarks/drop/drop_adapter.py
@@ -1,8 +1,13 @@
+ import ast
  import re
- from typing import List
-
- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.metric import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -28,54 +33,82 @@ Answer: 43
  '''  # noqa: E501


- @Benchmark.register(
-     name='drop',
-     pretty_name='DROP',
-     tags=['Reasoning'],
-     description=
-     'The DROP (Discrete Reasoning Over Paragraphs) benchmark is designed to evaluate the reading comprehension and reasoning capabilities of AI models. It includes a variety of tasks that require models to read passages and answer questions based on the content.',  # noqa: E501
-     dataset_id='AI-ModelScope/DROP',
-     metric_list=['AverageAccuracy'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='validation',
-     prompt_template=
-     'You will be asked to read a passage and answer a question.{drop_examples}# Your Task\n\n---\n{query}\n\nThink step by step, then write a line of the form "Answer: $ANSWER" at the end of your response.',  # noqa: E501
+ @register_benchmark(
+     BenchmarkMeta(
+         name='drop',
+         pretty_name='DROP',
+         tags=[Tags.REASONING],
+         description=
+         'The DROP (Discrete Reasoning Over Paragraphs) benchmark is designed to evaluate the reading comprehension and reasoning capabilities of AI models. It includes a variety of tasks that require models to read passages and answer questions based on the content.',  # noqa: E501
+         dataset_id='AI-ModelScope/DROP',
+         metric_list=['acc'],
+         few_shot_num=3,
+         train_split=None,
+         eval_split='validation',
+         prompt_template=
+         'You will be asked to read a passage and answer a question. {drop_examples}\n# Your Task\n\n---\n{query}\n\nThink step by step, then write a line of the form "Answer: $ANSWER" at the end of your response.',  # noqa: E501
+     )
  )
- class DROPAdapter(DataAdapter):
+ class DROPAdapter(DefaultDataAdapter):

      def __init__(self, **kwargs):
          super().__init__(**kwargs)

-         few_shot_num = kwargs.get('few_shot_num', 0)
-         if few_shot_num != 0:
+         if self.few_shot_num != 0:
              self.few_shot_num = 3
              logger.info(f'Few shot num is set to {self.few_shot_num} for DROP dataset by system.')
          else:
              self.few_shot_num = 0

-     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
          """
-         Generate model prompt from input data.
+         Convert a data record to a Sample object.
+
+         Args:
+             record (Dict[str, Any]): Input data record.
+
+         Returns:
+             Sample: Sample object with input, target, and metadata.
          """
-         drop_examples = '' if self.few_shot_num == 0 else DROP_EXAMPLES
-         query = f"Passage: {input_d['passage']}\nQuestion: {input_d['question']}"
-         prompt = self.prompt_template.format(
+
+         # Parse gold answers
+         gold_answers = self._get_gold_answers(record)
+
+         return Sample(
+             input=record['question'],
+             target=str(gold_answers),
+             metadata={
+                 'passage': record['passage'],
+                 'answer': record['answer'],
+                 'validated_answers': record['validated_answers']
+             }
+         )
+
+     def format_prompt_template(self, sample: Sample) -> str:
+         drop_examples = ''
+         query = f"Passage: {sample.metadata['passage']}\nQuestion: {sample.input}"
+
+         return self.prompt_template.format(
              drop_examples=drop_examples,
              query=query,
          )
-         return self.gen_prompt_data(prompt)

-     def get_gold_answer(self, input_d: dict) -> List[str]:
+     def format_fewshot_template(self, fewshot, sample):
+         drop_examples = DROP_EXAMPLES
+         query = f"Passage: {sample.metadata['passage']}\nQuestion: {sample.input}"
+
+         return self.prompt_template.format(
+             drop_examples=drop_examples,
+             query=query,
+         )
+
+     def _get_gold_answers(self, input_d: dict) -> List[str]:
          """
         Parse the raw input labels (gold).
          """

          def _flatten_validated_answers(validated_answers):
-             """Flattens a dict of lists of validated answers.
-             {"number": ['1', '8'], ...}
-             -> [{"number": ['1'], ...}, {"number": ['8'], ...}]
-             """
+             """Flattens a dict of lists of validated answers."""
              valid_answers = []
              for i in range(len(validated_answers['number'])):
                  valid_answers.append({
@@ -96,24 +129,36 @@ class DROPAdapter(DataAdapter):
              answers.append(answer)
          return answers

-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+     def extract_answer(self, prediction: str, task_state: TaskState):
          """
-         Parse the predicted result and extract proper answer.
+         Extract the answer from the model prediction.
          """
-         match = re.search(r'(?i)Answer\s*:\s*([^\n]+)', result)
-         extracted_answer = match.group(1) if match else result
+         match = re.search(r'(?i)Answer\s*:\s*([^\n]+)', prediction)
+         extracted_answer = match.group(1) if match else prediction
          return extracted_answer

-     def match(self, gold: List[str], pred: str) -> float:
+     def match_score(
+         self,
+         original_prediction: str,
+         filtered_prediction: str,
+         reference: str,
+         task_state: TaskState,
+     ) -> Score:
          """
-         Match the gold answer and the predicted answer.
+         Calculate accuracy score by matching prediction with reference answers.
          """
          from .utils import _answer_to_bags

+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
          max_em = 0
-         for gold_answer in gold:
+         reference = ast.literal_eval(reference) if isinstance(reference, str) else reference
+         for gold_answer in reference:
              # Convert the answers to bags of answers
-             predicted_bags = _answer_to_bags(pred)
+             predicted_bags = _answer_to_bags(filtered_prediction)
              gold_bags = _answer_to_bags(gold_answer)

              if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(gold_bags[0]):
@@ -124,7 +169,10 @@ class DROPAdapter(DataAdapter):
              if gold_answer[0].strip():
                  max_em = max(max_em, exact_match)

-         return max_em
+         score.value = {'acc': max_em}
+         score.main_score_name = 'acc'
+
+         return score

      @staticmethod
      def parse_answer(answer):
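
Because `Sample.target` is a string, the DROP adapter above serializes its list of gold answers with `str(...)` in `record_to_sample` and recovers it in `match_score` with `ast.literal_eval`. A small round-trip sketch; the answers are illustrative:

import ast

gold_answers = ['43', 'forty three']   # illustrative gold answers for one DROP item
target = str(gold_answers)             # what record_to_sample stores as Sample.target
recovered = ast.literal_eval(target)   # what match_score does before the bag comparison
assert recovered == gold_answers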
evalscope/benchmarks/frames/frames_adapter.py
@@ -1,6 +1,15 @@
- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType, OutputType
- from evalscope.metrics import LLMJudge, exact_match
+ import os
+ from typing import Any, Dict
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import DatasetDict, LocalDataLoader, Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.metric import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()

  TEMPLATE_0SHOT = """Please read the following text and answer the question below.

@@ -13,52 +22,82 @@ TEMPLATE_0SHOT = """Please read the following text and answer the question below.
  Format your response as follows: "Therefore, the answer is (insert answer here)"."""


- @Benchmark.register(
-     name='frames',
-     pretty_name='FRAMES',
-     tags=['Reasoning', 'Long Context'],
-     description=
-     'FRAMES is a comprehensive evaluation dataset designed to test the capabilities of Retrieval-Augmented Generation (RAG) systems across factuality, retrieval accuracy, and reasoning.',  # noqa: E501
-     dataset_id='iic/frames',
-     model_adapter=OutputType.GENERATION,
-     output_types=[OutputType.GENERATION],
-     metric_list=['AverageAccuracy'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='test',
-     prompt_template=TEMPLATE_0SHOT,
+ @register_benchmark(
+     BenchmarkMeta(
+         name='frames',
+         pretty_name='FRAMES',
+         tags=[Tags.REASONING, Tags.LONG_CONTEXT],
+         description=
+         'FRAMES is a comprehensive evaluation dataset designed to test the capabilities of Retrieval-Augmented Generation (RAG) systems across factuality, retrieval accuracy, and reasoning.',  # noqa: E501
+         dataset_id='iic/frames',
+         metric_list=['acc'],
+         eval_split='test',
+         prompt_template=TEMPLATE_0SHOT,
+     )
  )
- class FramesAdapter(DataAdapter):
+ class FramesAdapter(DefaultDataAdapter):

      def __init__(self, **kwargs):
          super().__init__(**kwargs)
+         self._use_llm_judge = True  # Enable LLM judge for FRAMES
+
+     def load(self):
+         # Try to load dataset from local disk
+         dataset_name_or_path = self.dataset_id
+         if os.path.exists(dataset_name_or_path):
+             logger.info(f'Loading dataset from {dataset_name_or_path}')
+             dataset_path = dataset_name_or_path
+         else:
+             from modelscope import dataset_snapshot_download

-     def load(self, **kwargs):
-         # default load with snapshot
-         kwargs['file_structure'] = {'default': ['test.jsonl']}
-         data_dict = super().load_with_snapshot(**kwargs)
-         return data_dict
+             # Load dataset from remote
+             logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+             # download dataset snapshot
+             dataset_path = dataset_snapshot_download(dataset_name_or_path, allow_file_pattern='test.jsonl')

-     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-         """
-         Generate model prompt from input data.
-         """
-         context = '\n'.join([f"{i['title']}\n{i['text']}" for i in input_d['wiki_items']])
-         question = input_d['Prompt']
-         prompt = self.prompt_template.format(context=context, question=question)
-         return self.gen_prompt_data(prompt)
+         dataset = LocalDataLoader(
+             data_id_or_path=dataset_path,
+             split=self.eval_split,
+             sample_fields=self.record_to_sample,
+             subset='test',
+             limit=self.limit,
+             repeats=self.repeats
+         ).load()

-     def get_gold_answer(self, input_d: dict) -> str:
-         """
-         Parse the raw input labels (gold).
+         test_dataset = DatasetDict({'test': dataset})
+
+         return test_dataset, None
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
          """
-         return input_d['Answer']
+         Convert a data record to a Sample object.

-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+         Args:
+             record (Dict[str, Any]): Input data record.
+
+         Returns:
+             Sample: Sample object with input, target, and metadata.
+         """
+         context = '\n'.join([f"{i['title']}\n{i['text']}" for i in record['wiki_items']])
+         question = record['Prompt']
+
+         return Sample(
+             input=question, target=record['Answer'], metadata={
+                 'context': context,
+                 'wiki_items': record['wiki_items']
+             }
+         )
+
+     def format_prompt_template(self, sample):
+         context = sample.metadata['context']
+         question = sample.input
+         return self.prompt_template.format(context=context, question=question)
+
+     def extract_answer(self, prediction: str, task_state: TaskState):
          """
-         Parse the predicted result and extract proper answer.
+         Extract the answer from the model prediction.
          """
-         response = result.replace('*', '')
+         response = prediction.replace('*', '')

          if 'the answer is' in response:
              ans = response.rsplit('the answer is', 1)[-1].strip().strip('.').strip()
@@ -67,25 +106,69 @@ class FramesAdapter(DataAdapter):

          return ans

-     def match(self, gold: str, pred: str) -> float:
+     def match_score(
+         self,
+         original_prediction: str,
+         filtered_prediction: str,
+         reference: str,
+         task_state: TaskState,
+     ) -> Score:
          """
-         Match the gold answer and the predicted answer.
+         Calculate accuracy score by matching prediction with reference.
          """
+         from evalscope.metrics import exact_match
          from .utils import normalize_answer
-         gold = normalize_answer(gold)
-         pred = normalize_answer(pred)
-         return exact_match(gold=gold, pred=pred)

-     def llm_match(self, gold: str, pred: str, judge: LLMJudge, **kwargs) -> float:
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         gold = normalize_answer(reference)
+         pred = normalize_answer(filtered_prediction)
+         accuracy = exact_match(gold=gold, pred=pred)
+
+         score.value = {'acc': accuracy}
+         score.main_score_name = 'acc'
+
+         return score
+
+     def llm_match_score(
+         self,
+         original_prediction: str,
+         filtered_prediction: str,
+         reference: str,
+         task_state: TaskState,
+     ) -> Score:
+         """
+         Use LLM judge to evaluate the prediction against the reference.
+         """
          from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE

-         raw_input = kwargs.get('raw_input', None)
-         question = raw_input['Prompt']
-         # get grading response
-         prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=gold, answer_2=pred)
-         orm_response = judge(prompt=prompt, system_prompt=GENERAL_ORM_PROMPT)
-         # parse grading response
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         question = task_state.input_text
+
+         # Get grading response
+         prompt = ORM_USER_TEMPLATE.format(problem=question, answer_1=reference, answer_2=filtered_prediction)
+         orm_response = self.llm_judge.judge(prompt, system_prompt=GENERAL_ORM_PROMPT)
+
+         # Parse grading response
          if 'YES' in orm_response:
-             return 1.0
+             accuracy = 1.0
          else:
-             return 0.0
+             accuracy = 0.0
+
+         score.value = {'acc': accuracy}
+         score.explanation = f'LLM judge: {orm_response}'
+         score.metadata = {
+             'source': 'llm_judge',
+             'judge_strategy': self.judge_strategy,
+             'model': self.llm_judge.model_id
+         }
+         score.main_score_name = 'acc'
+
+         return score
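
A common thread in these adapter hunks is that scoring methods no longer return a bare float: both `match_score` and `llm_match_score` build a `Score` carrying the raw and extracted predictions, a named metric dict, and optional judge metadata. A sketch of the resulting object, using only the fields visible in the hunks above; the concrete values are illustrative:

from evalscope.api.metric import Score

score = Score(
    extracted_prediction='Paris',                                   # answer after extract_answer
    prediction='Let me think... Therefore, the answer is Paris.',   # raw model output
)
score.value = {'acc': 1.0}                 # named metrics; 'acc' matches metric_list=['acc']
score.main_score_name = 'acc'
score.explanation = 'LLM judge: YES'       # only populated on the llm_match_score path
score.metadata = {'source': 'llm_judge'}   # judge provenance, as in the hunks above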