evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/tool_bench/tool_bench_adapter.py
@@ -1,81 +1,102 @@
 import json
-from typing import Dict, List
-
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType, OutputType
-from evalscope.metrics import Metric, mean, metric_registry
-
-
-@Benchmark.register(
-    name='tool_bench',
-    pretty_name='ToolBench-Static',
-    tags=['Reasoning', 'Agent', 'Function Calling'],
-    description='ToolBench is a benchmark for evaluating AI models on tool use tasks. '
-    'It includes various subsets such as in-domain and out-of-domain, '
-    'each with its own set of problems that require step-by-step reasoning to arrive at the correct answer. '
-    '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/toolbench.html)', # noqa: E501
-    dataset_id='AI-ModelScope/ToolBench-Static',
-    subset_list=['in_domain', 'out_of_domain'],
-    metric_list=['Act.EM', 'Plan.EM', 'F1', 'HalluRate', 'Rouge-L'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test',
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages.chat_message import ChatMessage, dict_to_chat_message
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='tool_bench',
+        pretty_name='ToolBench-Static',
+        tags=[Tags.REASONING, Tags.FUNCTION_CALLING],
+        description='ToolBench is a benchmark for evaluating AI models on tool use tasks. '
+        'It includes various subsets such as in-domain and out-of-domain, '
+        'each with its own set of problems that require step-by-step reasoning to arrive at the correct answer. '
+        '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/toolbench.html)',
+        dataset_id='AI-ModelScope/ToolBench-Static',
+        subset_list=['in_domain', 'out_of_domain'],
+        metric_list=['Act.EM', 'Plan.EM', 'F1', 'HalluRate', 'Rouge-L'],
+        eval_split='test',
+    )
 )
-class ToolBenchAdapter(DataAdapter):
+class ToolBenchAdapter(DefaultDataAdapter):
+    """
+    ToolBench adapter using the new data processing framework.
+    """

     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-        metric_registry.register(Metric(name='Rouge-L', object=mean))
-        metric_registry.register(Metric(name='Act.EM', object=mean))
-        metric_registry.register(Metric(name='Plan.EM', object=mean))
-        metric_registry.register(Metric(name='F1', object=mean))
-        metric_registry.register(Metric(name='HalluRate', object=mean))
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate model prompt from input data.
-        """
-        messages = input_d['messages']
-        # use prepared messages and remove the name field
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a data record to a Sample object."""
+        messages = record['messages']
+
+        # Process messages and remove the name field, convert function messages
+        processed_messages = []
         for message in messages:
-            if 'name' in message:
-                del message['name']
-            if 'role' in message:
-                if message['role'] == 'function':
-                    content = json.dumps(message, ensure_ascii=False)
-                    message['role'] = 'user'
-                    message['content'] = content
-        return self.gen_prompt_data(prompt='', messages=messages)
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        """
-        Parse the raw input labels (gold).
-        """
-        return input_d
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the predicted result and extract proper answer.
-        """
-        return result
-
-    def match(self, gold: dict, pred: str) -> Dict:
-        """
-        Match the gold answer and the predicted answer.
-        """
+            msg_dict = message.copy()
+            if 'name' in msg_dict:
+                del msg_dict['name']
+            if 'role' in msg_dict:
+                if msg_dict['role'] == 'function':
+                    content = json.dumps(msg_dict, ensure_ascii=False)
+                    msg_dict['role'] = 'user'
+                    msg_dict['content'] = content
+
+            # Convert to ChatMessage object
+            chat_msg = dict_to_chat_message(msg_dict)
+            processed_messages.append(chat_msg)
+
+        return Sample(
+            input=processed_messages,
+            target='', # Store the full record as target for evaluation
+            metadata={
+                'target': record['target'],
+                'tools': record['tools'],
+                'messages': record['messages']
+            }
+        )
+
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
         from .utils import calculate_metrics

-        data = {
-            'target': gold['target'],
-            'predictions': pred,
-            'tools': gold['tools'],
-        }
-        metrics = calculate_metrics(data)
-        return metrics
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        doc = task_state.metadata
+
+        try:
+            data = {
+                'target': doc['target'],
+                'predictions': filtered_prediction,
+                'tools': doc['tools'],
+            }
+            metrics = calculate_metrics(data)
+
+            score.value = metrics
+            score.explanation = f'Metrics: {metrics}'
+            score.metadata = {'target': doc['target'], 'tools': doc['tools'], 'detailed_metrics': metrics}
+            # Set the main score (you can choose the most important metric)
+            score.main_score_name = 'F1'

-    def compute_metric(self, review_res_list: List[dict], **kwargs) -> Dict:
-        # aggregate review results
-        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
+        except Exception as e:
+            # Handle evaluation errors
+            score.value = {'Act.EM': 0.0, 'Plan.EM': 0.0, 'F1': 0.0, 'HalluRate': 1.0, 'Rouge-L': 0.0}
+            score.explanation = f'Evaluation failed: {str(e)}'
+            score.metadata = {'error': str(e)}
+            score.main_score_name = 'F1'

-        return super().compute_metric(res_dict, **kwargs)
+        return score
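
The ToolBench hunk above illustrates the general adapter migration in 1.0.0: a benchmark is registered via register_benchmark(BenchmarkMeta(...)), subclasses DefaultDataAdapter, and implements record_to_sample() plus match_score() in place of the old gen_prompt()/get_gold_answer()/match()/compute_metric() methods. As a rough, non-authoritative sketch of that shape (only the imports and method signatures are taken from the diff; the benchmark name, dataset id, metric name, and record fields below are hypothetical):

# A sketch of a custom benchmark adapter against the 1.0.0 API shown above.
# Imports and method signatures mirror the ToolBench/TriviaQA hunks; the benchmark
# name, dataset_id, metric name, and record fields ('question', 'answer', 'id') are hypothetical.
from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.evaluator import TaskState
from evalscope.api.metric import Score
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags


@register_benchmark(
    BenchmarkMeta(
        name='my_qa',  # hypothetical benchmark name
        pretty_name='MyQA',
        tags=[Tags.QA],
        description='Toy QA benchmark used only to illustrate the 1.0.0 adapter shape.',
        dataset_id='my-org/my-qa',  # hypothetical dataset id
        metric_list=['acc'],
        eval_split='test',
    )
)
class MyQAAdapter(DefaultDataAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw dataset record onto the framework's Sample object.
        return Sample(
            input=record['question'],
            target=record['answer'],
            metadata={'id': record.get('id')},
        )

    def match_score(
        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
    ) -> Score:
        # Score a single prediction; the Score fields follow the ToolBench hunk.
        score = Score(
            extracted_prediction=filtered_prediction,
            prediction=original_prediction,
        )
        correct = reference.strip().lower() in filtered_prediction.strip().lower()
        score.value = {'acc': 1.0 if correct else 0.0}
        score.main_score_name = 'acc'
        return score

Compared with 0.17.x, scoring happens per sample through Score objects rather than through a separate compute_metric() aggregation pass.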
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py
@@ -1,142 +1,74 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) EleutherAI Inc, and its affiliates.
-import csv
-import os

-from evalscope.benchmarks import Benchmark
-from evalscope.benchmarks.data_adapter import DataAdapter
-from evalscope.constants import EvalType, OutputType
-from evalscope.utils import get_logger
+from typing import Any, Dict

-# flake8: noqa
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger

 logger = get_logger()

+PROMPT_TEMPLATE = """
+Read the content and answer the following question.

-@Benchmark.register(
-    name='trivia_qa',
-    pretty_name='TriviaQA',
-    tags=['QA', 'Reading Comprehension'],
-    description=
-    'TriviaQA is a large-scale reading comprehension dataset consisting of question-answer pairs collected from trivia websites. It includes questions with multiple possible answers, making it suitable for evaluating the ability of models to understand and generate answers based on context.', # noqa: E501
-    dataset_id='modelscope/trivia_qa',
-    subset_list=['default'],
-    metric_list=['AverageAccuracy'],
-    few_shot_num=5,
-    train_split='dev',
-    eval_split='test',
-)
-class TriviaQaAdapter(DataAdapter):
-
-    def __init__(self, **kwargs):
-
-        super().__init__(**kwargs)
+Content: {content}

-    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        data_dict = {}
-        for subset_name in subset_list:
-            data_dict[subset_name] = {}
-            for split in [self.train_split, self.eval_split]:
-                if os.path.exists(dataset_name_or_path):
-                    file_path = os.path.join(dataset_name_or_path, f'trivia-{split}.qa.csv')
-                else:
-                    file_path = os.path.join(work_dir, dataset_name_or_path, f'trivia-{split}.qa.csv')
-                if os.path.exists(file_path):
-                    with open(file_path, 'r', encoding='utf-8') as f:
-                        reader = csv.reader(f, delimiter='\t')
-                        split_data = []
-                        for row in reader:
-                            assert len(row) == 2
-                            question = row[0]
-                            answers = eval(row[1])
-                            split_data.append({
-                                'input': [{
-                                    'role': 'system',
-                                    'content': 'Follow the given examples and answer the question.'
-                                }, {
-                                    'role': 'user',
-                                    'content': question
-                                }],
-                                'ideal':
-                                answers
-                            })
-                        data_dict[subset_name][split] = split_data
+Question: {question}

-        return data_dict
+Keep your The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.
+""".lstrip() # noqa: E501

-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate model prompt from raw input, unify the prompt format for TriviaQA benchmark.

-        Args:
-            input_d (dict): The raw input. A single data format of the TriviaQA:
-
-            {
-                "input": [
-                    {"role": "system", "content": "Follow the given examples and answer the question."},
-                    {"role": "user", "content": "Which Lloyd Webber musical premiered in the US on 10th December 1993?"}
-                ],
-                "ideal": [
-                    "Sunset Blvd",
-                    "West Sunset Boulevard",
-                    "Sunset Boulevard",
-                    "Sunset Bulevard",
-                    "Sunset Blvd.",
-                    "sunset boulevard",
-                    "sunset bulevard",
-                    "west sunset boulevard",
-                    "sunset blvd"
-                ]
+@register_benchmark(
+    BenchmarkMeta(
+        name='trivia_qa',
+        pretty_name='TriviaQA',
+        dataset_id='evalscope/trivia_qa',
+        tags=[Tags.QA, Tags.READING_COMPREHENSION],
+        description=
+        'TriviaQA is a large-scale reading comprehension dataset consisting of question-answer pairs collected from trivia websites. It includes questions with multiple possible answers, making it suitable for evaluating the ability of models to understand and generate answers based on context.', # noqa: E501
+        subset_list=['rc.wikipedia'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='validation',
+        metric_list=[{
+            'acc': {
+                'allow_inclusion': True
             }
+        }],
+        prompt_template=PROMPT_TEMPLATE,
+    )
+)
+class TriviaQaAdapter(DefaultDataAdapter):

-        Returns:
-            {'data': [(context, continuation), ...]}
-        """
-
-        def get_sys_prompt(inp: dict) -> str:
-            return inp['input'][0]['content']
-
-        if self.few_shot_num > 0:
-            sys_prompt = get_sys_prompt(input_d)
-        else:
-            sys_prompt = None
-        few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-        context = '\n'.join(few_shot_prompts) + '\n'
-        context += self._generate_prompt(input_d=input_d, include_answer=False)
-        full_prompt = context
-
-        return self.gen_prompt_data(full_prompt, system_prompt=sys_prompt)
-
-    def get_gold_answer(self, input_d: dict) -> list:
-        # Get the gold choice
-        ans: list = input_d.get('ideal', [])
-        return ans
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the model output to get the answer.
-
-        Args:
-            result: Predicted answer from the model. A list of loglikelihood values for inputs pairs.
-            raw_input_d: The raw input. A single data format of the TriviaQA:
-            eval_type: The type of evaluation, e.g. 'checkpoint' or 'service' or 'custom'.
-
-        Returns:
-            The predicted answer.
-        """
-        return result
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)

-    def match(self, gold: list, pred: str) -> float:
-        lower_pred = pred.lower()
-        gold = [g.lower() for g in gold]
-        is_correct = any([cand in lower_pred for cand in gold])
-        return 1 if is_correct else 0
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        question = record['question']
+        answers = record['answer']['aliases'] + record['answer']['normalized_aliases']
+        content = record['entity_pages']['wiki_context']
+        return Sample(
+            input=question, target=answers, metadata={
+                'question_id': record['question_id'],
+                'content': content
+            }
+        )

-    @classmethod
-    def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
+    def format_prompt_template(self, sample):
+        return self.prompt_template.format(content=sample.metadata['content'], question=sample.input)

-        example: str = f"Question: {input_d['input'][1]['content']}\nAnswer:"
-        if include_answer:
-            example += f" {input_d['ideal'][0]}\n\n"
+    def extract_answer(self, prediction: str, task_state: TaskState):
+        # use regex to extract the answer from the prediction
+        import re

-        return example
+        pattern = r'ANSWER:\s*(.*)'
+        match = re.search(pattern, prediction)
+        if match:
+            return match.group(1).strip()
+        return prediction.strip()
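
The new TriviaQaAdapter prompts the model to end with an "ANSWER: $ANSWER" line and pulls the answer out with a regex, falling back to the whole prediction when no marker is found. Together with metric_list=[{'acc': {'allow_inclusion': True}}], this appears to grade a prediction as correct when it contains one of the answer aliases rather than requiring an exact match, mirroring the containment check in the old match() method. A standalone reproduction of just the extraction step (the function name and example strings below are illustrative, not part of evalscope):

import re


def extract_final_answer(prediction: str) -> str:
    """Standalone reproduction of the ANSWER-line extraction used in the new TriviaQaAdapter."""
    match = re.search(r'ANSWER:\s*(.*)', prediction)
    if match:
        # Capture the rest of the line after the first "ANSWER:" marker.
        return match.group(1).strip()
    # No marker found: fall back to the whole model output.
    return prediction.strip()


print(extract_final_answer('It premiered in 1993.\nANSWER: Sunset Boulevard'))  # Sunset Boulevard
print(extract_final_answer('Sunset Boulevard'))                                 # Sunset Boulevard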