evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/cmmlu/cmmlu_adapter.py
@@ -1,36 +1,16 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- import csv
- import os
- from collections import defaultdict
-
- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType, OutputType
- from evalscope.metrics import exact_match
- from evalscope.metrics.completion_parsers import ResponseParser
- from evalscope.utils.io_utils import csv_to_list
+ from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils.logger import get_logger
+ from evalscope.utils.multi_choices import MultipleChoiceTemplate

  # flake8: noqa

  logger = get_logger()

- SUBSET_LIST = [
-     'agronomy', 'anatomy', 'ancient_chinese', 'arts', 'astronomy', 'business_ethics', 'chinese_civil_service_exam',
-     'chinese_driving_rule', 'chinese_food_culture', 'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
-     'chinese_teacher_qualification', 'college_actuarial_science', 'college_education', 'college_engineering_hydrology',
-     'college_law', 'college_mathematics', 'college_medical_statistics', 'clinical_knowledge', 'college_medicine',
-     'computer_science', 'computer_security', 'conceptual_physics', 'construction_project_management', 'economics',
-     'education', 'elementary_chinese', 'elementary_commonsense', 'elementary_information_and_technology',
-     'electrical_engineering', 'elementary_mathematics', 'ethnology', 'food_science', 'genetics', 'global_facts',
-     'high_school_biology', 'high_school_chemistry', 'high_school_geography', 'high_school_mathematics',
-     'high_school_physics', 'high_school_politics', 'human_sexuality', 'international_law', 'journalism',
-     'jurisprudence', 'legal_and_moral_basis', 'logical', 'machine_learning', 'management', 'marketing',
-     'marxist_theory', 'modern_chinese', 'nutrition', 'philosophy', 'professional_accounting', 'professional_law',
-     'professional_medicine', 'professional_psychology', 'public_relations', 'security_study', 'sociology',
-     'sports_science', 'traditional_chinese_medicine', 'virology', 'world_history', 'world_religions'
- ]
-
  SUBJECT_MAPPING = {
      'agronomy': ['other', 'Other'],
      'anatomy': ['biology', 'STEM'],
@@ -102,112 +82,41 @@ SUBJECT_MAPPING = {
  }


- @Benchmark.register(
-     name='cmmlu',
-     pretty_name='C-MMLU',
-     tags=['Knowledge', 'MCQ', 'Chinese'],
-     description=
-     'C-MMLU is a benchmark designed to evaluate the performance of AI models on Chinese language tasks, including reading comprehension, text classification, and more.',
-     dataset_id='modelscope/cmmlu',
-     model_adapter=OutputType.GENERATION,
-     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
-     subset_list=SUBSET_LIST,
-     metric_list=['AverageAccuracy'],
-     few_shot_num=5,
-     train_split='dev',
-     eval_split='test',
-     prompt_template=
-     '以下是关于{subset_name}的单项选择题,请给出正确答案的选项。你的回答的最后一行应该是这样的格式:“答案:LETTER”(不带引号),其中 LETTER 是 A、B、C、D 中的一个。\n{query}',
+ @register_benchmark(
+     BenchmarkMeta(
+         name='cmmlu',
+         pretty_name='C-MMLU',
+         tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE, Tags.CHINESE],
+         description=
+         'C-MMLU is a benchmark designed to evaluate the performance of AI models on Chinese language tasks, including reading comprehension, text classification, and more.',
+         dataset_id='evalscope/cmmlu',
+         metric_list=['acc'],
+         subset_list=list(SUBJECT_MAPPING.keys()),
+         few_shot_num=0,
+         train_split=None,
+         eval_split='test',
+         prompt_template=MultipleChoiceTemplate.CHINESE_SINGLE_ANSWER_TEMPLATE_COT,
+     )
  )
- class CMMLUAdapter(DataAdapter):
+ class CMMLUAdapter(MultiChoiceAdapter):

      def __init__(self, **kwargs):
          super().__init__(**kwargs)

+         self.reformat_subset = True
          self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
-         self.choices = ['A', 'B', 'C', 'D']
-
-     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-         data_dict = defaultdict(dict)
-         for subset_name in subset_list:
-             for split_name in [self.train_split, self.eval_split]:
-                 if os.path.exists(dataset_name_or_path):
-                     file_path = os.path.join(dataset_name_or_path, split_name, f'{subset_name}.csv')
-                 else:
-                     file_path = os.path.join(work_dir, dataset_name_or_path, split_name, f'{subset_name}.csv')
-                 if os.path.exists(file_path):
-                     data_dict[subset_name][split_name] = csv_to_list(file_path)
-
-         return data_dict
-
-     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-         """
-         Generate model prompt from raw input, unify the prompt format for CMMLU benchmark.
-
-         Args:
-             input_d (dict): The raw input. A single data format of the CMMLU:
-
-             {'Question': '下列关于重力的说法正确的是',
-             'A': '在地球周围的物体都要受到重力作用,与其运动状态无关',
-             'B': '对某一物体而言,重力的大小是一个恒量,不随物体的地理位置而改变',
-             'C': '重力就是地球对物体的吸引力,重力的方向总是竖直向下',
-             'D': '在地球表面各处的重力方向都是相同的',
-             'Answer': 'A'}
-
-         Returns:
-             {'data': [(context, continuation), ...]}
-
-         """
-         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-         context = '\n'.join(few_shot_prompts) + '\n'
-         context += self._generate_prompt(input_d=input_d, include_answer=False)
-
-         full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=context.strip())
-
-         return self.gen_prompt_data(full_prompt)
-
-     def get_gold_answer(self, input_d: dict) -> str:
-         # Get the gold choice
-         return input_d.get('Answer', '')
-
-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-         """
-         Parse the model output to get the answer. Could be the best choice index.
-
-         Args:
-             result: Predicted answer from the model. Usually a string for chat.
-             raw_input_d: The raw input. Depending on the dataset.
-             eval_type: The evaluation type. 'checkpoint', 'service', 'custom'.
-
-         Returns:
-             The parsed answer. Depending on the dataset. Usually a string for chat.
-         """
-         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
-             return result
-         else:
-             return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
-
-     def match(self, gold: str, pred: str) -> float:
-         return exact_match(gold=gold, pred=pred)
-
-     def _generate_prompt(self, input_d: dict, include_answer=True) -> str:
-
-         input_choices: list = [input_d['A'], input_d['B'], input_d['C'], input_d['D']]
-
-         example: str = input_d['Question']
-         for j in range(len(self.choices)):
-             example += '\n{}. {}'.format(self.choices[j], input_choices[j])

-         example += '\nAnswer:'
-         if include_answer:
-             example += ' {}\n\n'.format(input_d['Answer'])
+     def record_to_sample(self, record) -> Sample:

-         return example
+         # choices: ["(A) 农业生产工具","(B) 土地","(C) 劳动力","(D) 资金"]
+         # remove the leading (A), (B), (C), (D)
+         raw_choices = record['choices']
+         choice_list = [choice[3:].strip() for choice in raw_choices]

-     @classmethod
-     def _format_subject(cls, subject):
-         l = subject.split('_')
-         s = ''
-         for entry in l:
-             s += ' ' + entry
-         return s
+         return Sample(
+             input=record['question'],
+             choices=choice_list,
+             target=record['answer'][1],  # answer is like "A"
+             subset_key=record['category'],
+             metadata={'subject': record['category']},
+         )
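
Note: the hunk above shows the 1.0.0 adapter pattern, where a benchmark is registered with @register_benchmark(BenchmarkMeta(...)) and each raw record is converted to a Sample. The following is a minimal sketch of that pattern, assuming evalscope 1.0.0 is installed; the benchmark name, dataset id, and record field names below are hypothetical, and only the imports and call shapes mirror what appears in this diff.

from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags


@register_benchmark(
    BenchmarkMeta(
        name='my_mcq_benchmark',             # hypothetical benchmark name
        dataset_id='my-org/my-mcq-dataset',  # hypothetical dataset id
        tags=[Tags.KNOWLEDGE],
        metric_list=['acc'],
        eval_split='test',
    )
)
class MyMCQAdapter(MultiChoiceAdapter):

    def record_to_sample(self, record) -> Sample:
        # Map one raw record to the unified Sample structure used by the evaluator.
        return Sample(
            input=record['question'],
            choices=record['choices'],
            target=record['answer'],
            metadata={'subject': record.get('category', '')},
        )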
evalscope/benchmarks/competition_math/competition_math_adapter.py
@@ -1,125 +1,73 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  # Copyright (c) EleutherAI, Inc. and its affiliates.
- import glob
- import json
- import os
- from collections import defaultdict

- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import extract_answer, math_equal, strip_answer_string
+ from typing import Any, Dict
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils.logger import get_logger

  # flake8: noqa

  logger = get_logger()

-
- @Benchmark.register(
-     name='competition_math',
-     pretty_name='MATH',
-     tags=['Mathematics'],
-     description=
-     'The MATH (Mathematics) benchmark is designed to evaluate the mathematical reasoning abilities of AI models through a variety of problem types, including arithmetic, algebra, geometry, and more.',
-     dataset_id='modelscope/competition_math',
-     subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
-     metric_list=['AveragePass@1'],
-     few_shot_num=4,
-     train_split=None,
-     eval_split='test',
-     prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+ PROMPT_TEMPLATE = """
+ Problem:
+ {question}
+
+ Please reason step by step, and put your final answer within \\boxed{{}}.
+ """.lstrip()
+
+ FEWSHOT_TEMPLATE = """
+ Here are some examples of how to solve similar problems:
+
+ {fewshot}
+ """.lstrip() + PROMPT_TEMPLATE
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='competition_math',
+         pretty_name='MATH',
+         tags=[Tags.MATH, Tags.REASONING],
+         description=
+         'The MATH (Mathematics) benchmark is designed to evaluate the mathematical reasoning abilities of AI models through a variety of problem types, including arithmetic, algebra, geometry, and more.',
+         dataset_id='evalscope/competition_math',
+         subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+         metric_list=[{
+             'acc': {
+                 'numeric': True
+             }
+         }],
+         few_shot_num=4,
+         train_split='train',
+         eval_split='test',
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
  )
- class CompetitionMathAdapter(DataAdapter):
-     """ To be tested for all models. """
+ class CompetitionMathAdapter(DefaultDataAdapter):

      def __init__(self, **kwargs):
-
-         few_shot_num = kwargs.get('few_shot_num', 4)
-         if few_shot_num != 4 and few_shot_num != 0:
-             logger.error(f'The MATH benchmark ONLY supports 4-shot by system or 0-shot settings, '
-                          f'but got {few_shot_num}. Use 4-shot by default.')
-             kwargs['few_shot_num'] = 4
-
          super().__init__(**kwargs)

-     def load(self, **kwargs):
-         # default load all levels
-         kwargs['subset_list'] = ['default']
-         data_dict = super().load(**kwargs)
-         return self.reformat_subset(data_dict, subset_key='level')
-
-     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-         data_dict = defaultdict(dict)
-         for subset_name in subset_list:
-             for split_name in [self.train_split, self.eval_split]:
-                 if os.path.exists(dataset_name_or_path):
-                     split_dir = os.path.join(dataset_name_or_path, split_name)
-                 else:
-                     split_dir = os.path.join(work_dir, dataset_name_or_path, split_name)
-                 split_files = glob.glob(os.path.join(split_dir, '**', '*.json'))
-                 split_data = []
-                 for file_path in split_files:
-                     if os.path.exists(file_path):
-                         with open(file_path, 'r', encoding='utf-8') as f:
-                             split_data.append(json.load(f))
-                 data_dict[subset_name][split_name] = split_data
-
-         return data_dict
-
-     def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
-         """
-         Generate the prompt for the model input.
-
-         Args:
-             input_d: raw input dict.
-             {"problem": "How many vertical asymptotes does the graph of $y=\\frac{2}{x^2+x-6}$ have?", "level": "Level 3", "type": "Algebra", "solution": "The denominator of the rational function factors into $x^2+x-6=(x-2)(x+3)$. Since the numerator is always nonzero, there is a vertical asymptote whenever the denominator is $0$, which occurs for $x = 2$ and $x = -3$. Therefore, the graph has $\\boxed{2}$ vertical asymptotes."}
-
-             few_shot_list: few shot list. Each item is a raw input dict.
-             **kwargs:
-
-         Returns:
-             {'data': [prompt]}
-         """
-         use_fewshot = self.few_shot_num > 0
-         query = self._generate_prompt(input_d, use_fewshot=use_fewshot)
-         full_prompt = self.prompt_template.format(query=query)
-         return self.gen_prompt_data(full_prompt)
-
-     def get_gold_answer(self, input_d: dict) -> str:
-         # Extract the gold answer from the input dict.
-         return strip_answer_string(extract_answer(input_d['solution']))
-
-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
-         """
-         Parse the model output to get the answer. Could be the best choice index.
-
-         Args:
-             result: Predicted answer from the model. Usually a string for chat.
-             raw_input_d (dict): The raw input. Depending on the dataset.
-             eval_type: 'checkpoint' or 'service' or `custom`
-
-         Returns:
-             The parsed answer. Depending on the dataset. Usually a string for chat.
-         """
-         # Note: Use same extraction method for both of checkpoint/service/custom
-         result = strip_answer_string(extract_answer(result))
-         return result
+         self.reformat_subset = True

-     def match(self, gold: str, pred: str) -> float:
-         res = math_equal(pred, gold)
-         return 1.0 if res else 0.0
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         from evalscope.metrics.math_parser import extract_answer

-     @classmethod
-     def _generate_prompt(cls, input_d: dict, use_fewshot: bool = True) -> str:
-         problem: str = input_d['problem']
+         return Sample(
+             input=record['problem'],
+             target=extract_answer(record['solution']),
+             subset_key=record['level'],
+             metadata={
+                 'reasoning': record.get('solution', ''),
+                 'type': record.get('type', ''),
+             },
+         )

-         if use_fewshot:
-             # Use 4-shot examples by system
-             context = (
-                 'Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:\nThe expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n'
-                 'Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:\nWe have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'
-                 'Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:\nIf Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'
-                 'Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:\nIf we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n'
-                 f'Problem:\n{problem}\nSolution:\n')
-         else:
-             context = 'Problem:\n' + problem + '\nSolution:\n'
-         return context
+     def sample_to_fewshot(self, sample: Sample) -> str:
+         return f'Problem:\n{sample.input}\nSolution:\n{sample.target}'
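
Note: in the rewritten adapter the prompt is built from the module-level PROMPT_TEMPLATE and FEWSHOT_TEMPLATE instead of an inline prompt_template string, with sample_to_fewshot rendering each few-shot example. A self-contained sketch of how these templates compose with plain str.format follows; the template text is copied from the diff, while the example problem and few-shot text are made up.

PROMPT_TEMPLATE = """
Problem:
{question}

Please reason step by step, and put your final answer within \\boxed{{}}.
""".lstrip()

FEWSHOT_TEMPLATE = """
Here are some examples of how to solve similar problems:

{fewshot}
""".lstrip() + PROMPT_TEMPLATE

# One rendered few-shot example, in the same shape sample_to_fewshot produces.
fewshot = 'Problem:\n1 + 1 = ?\nSolution:\n2'
print(FEWSHOT_TEMPLATE.format(fewshot=fewshot, question='What is 2 + 2?'))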
evalscope/benchmarks/data_collection/data_collection_adapter.py
@@ -1,28 +1,35 @@
- import math
+ import copy
  import os
- import re
- from typing import Any, Optional
+ from collections import defaultdict
+ from typing import Any, Dict, List

- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, EvalType, HubType
- from evalscope.utils.io_utils import jsonl_to_list
+ from evalscope.api.benchmark import BenchmarkMeta, DataAdapter, DefaultDataAdapter
+ from evalscope.api.dataset import DatasetDict, LocalDataLoader, Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.metric import Score
+ from evalscope.api.metric.scorer import AggScore, SampleScore
+ from evalscope.api.model.model import Model
+ from evalscope.api.registry import get_benchmark, register_benchmark
+ from evalscope.config import TaskConfig
+ from evalscope.constants import DataCollection, Tags
+ from evalscope.report.generator import ReportGenerator
+ from evalscope.report.report import Report
  from evalscope.utils.logger import get_logger

  logger = get_logger()


- @Benchmark.register(
-     name='data_collection',
-     dataset_id='',  # dataset_id need to be set
-     description='Data collection',
-     subset_list=['default'],
-     metric_list=['AverageAccuracy'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='test',
-     prompt_template='',
+ @register_benchmark(
+     BenchmarkMeta(
+         name=DataCollection.NAME,
+         dataset_id='',  # dataset_id need to be set
+         description='Data collection',
+         metric_list=['acc'],
+         eval_split='test',
+         prompt_template='',
+     )
  )
- class DataCollectionAdapter(DataAdapter):
+ class DataCollectionAdapter(DefaultDataAdapter):

      def __init__(self, **kwargs):
          """
@@ -30,43 +37,174 @@ class DataCollectionAdapter(DataAdapter):
          """
          super().__init__(**kwargs)

-     def load(self,
-              dataset_name_or_path: str = None,
-              subset_list: list = None,
-              work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
-              datasets_hub: str = HubType.MODELSCOPE,
-              **kwargs) -> dict:
-         """
-         Load the dataset. Remote and local datasets are supported.
-         """
-         dataset_name_or_path = os.path.expanduser(dataset_name_or_path or self.dataset_id)
-         subset_list = subset_list or self.subset_list
-
+     def load(self):
          # Try to load dataset from local disk
+         dataset_name_or_path = self.dataset_id
          if os.path.exists(dataset_name_or_path):
              logger.info(f'Loading dataset from {dataset_name_or_path}')
-             dataset = jsonl_to_list(dataset_name_or_path)
-             if len(dataset) == 0:
-                 raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
+             dataset_path = dataset_name_or_path
          else:
              from modelscope import dataset_snapshot_download

              # Load dataset from remote
-             logger.info(f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path}')
+             logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+             # download dataset snapshot
+             dataset_path = dataset_snapshot_download(dataset_name_or_path, allow_file_pattern='*.jsonl')
+
+         dataset = LocalDataLoader(
+             data_id_or_path=dataset_path,
+             split=self.eval_split,
+             sample_fields=self.record_to_sample,
+             subset=self.default_subset,
+             limit=self.limit,
+             repeats=self.repeats
+         ).load()
+
+         test_dataset = DatasetDict({self.default_subset: dataset})
+
+         return test_dataset, None
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """
+         Convert a data record to a Sample object. Every record is a DatasetEntry.
+
+         Args:
+             record (Dict[str, Any]): Input data record.
+
+         Returns:
+             Sample: Sample object with input, target, and metadata.
+         """
+         from evalscope.collections import DatasetEntry
+
+         entry = DatasetEntry.model_validate(record)
+         sample = Sample.model_validate(entry.prompt)
+
+         record_without_prompt = copy.deepcopy(record)
+         del record_without_prompt['prompt']
+         sample.metadata[DataCollection.INFO] = record_without_prompt  # keep all metadata
+         return sample
+
+     def _post_process_samples(self):
+         """Post process of each sample"""
+         self._initialize_adapters()
+
+     def _initialize_adapters(self):
+         """Init adapters for each dataset and create dataset id map"""
+         self.dataset_adapters: Dict[str, DataAdapter] = {}
+         self.dataset_name_map = defaultdict(lambda: defaultdict(list))
+
+         # load dataset args
+         dataset_args = copy.deepcopy(self._task_config.dataset_args)
+         common_args = dataset_args.get(DataCollection.NAME, {})
+
+         # Iterate through each sample in the dataset
+         dataset = self.test_dataset[self.default_subset]
+         for sample in dataset:
+             collection_info = sample.metadata.get(DataCollection.INFO, {})
+             dataset_name = collection_info.get('dataset_name', '')
+             subset_name = collection_info.get('subset_name', '')
+             # create id mapping
+             self.dataset_name_map[dataset_name][subset_name].append(sample.id)
+
+             # update dataset args
+             cur_dataset_args = dataset_args.get(dataset_name, {})
+             cur_dataset_args.update(common_args)
+
+             # Initialize dataset adapter
+             if dataset_name not in self.dataset_adapters:
+                 config = TaskConfig(dataset_args={dataset_name: cur_dataset_args})
+                 self.dataset_adapters[dataset_name] = get_benchmark(dataset_name, config=config)
+
+     def _get_adapter(self, metadata: Dict[str, Any]) -> DataAdapter:
+         collection_info = metadata.get(DataCollection.INFO, {})
+         dataset_name = collection_info.get('dataset_name', '')
+         return self.dataset_adapters.get(dataset_name)
+
+     def run_inference(self, model, sample, output_dir, **kwargs) -> TaskState:
+         data_adapter = self._get_adapter(sample.metadata)
+         if not data_adapter:
+             raise ValueError(f'No data adapter found for sample: {sample}')
+
+         return data_adapter.run_inference(model, sample, output_dir, **kwargs)
+
+     def calculate_metrics(self, task_state) -> SampleScore:
+         data_adapter = self._get_adapter(task_state.metadata)
+         if not data_adapter:
+             raise ValueError(f'No data adapter found for task state: {task_state}')
+
+         return data_adapter.calculate_metrics(task_state)
+
+     def aggregate_scores(self, sample_scores: List[SampleScore]):
+         import pandas as pd
+         from tabulate import tabulate
+
+         data = []
+         for sample_score in sample_scores:
+             collection_info = sample_score.sample_metadata[DataCollection.INFO]
+             for metric_name, value in sample_score.score.value.items():
+                 data.append(
+                     dict(
+                         task_type=collection_info['task_type'],
+                         categories=tuple(collection_info['categories']),
+                         dataset_name=collection_info['dataset_name'],
+                         subset_name=collection_info['subset_name'],
+                         tags=collection_info['tags'],
+                         sample_id=sample_score.sample_id,
+                         metric=metric_name,
+                         score=value
+                     )
+                 )
+
+         df = pd.DataFrame(data)
+
+         def aggregate_and_sort(df, group_by_cols):
+             # aggregate by group_by_cols, and calculate average_score and count
+             report_df = df.groupby(group_by_cols) \
+                 .agg(average_score=('score', 'mean'), count=('score', 'size')) \
+                 .reset_index()
+             report_df['average_score'] = report_df['average_score'].round(4)
+             report_df = report_df.sort_values(by='count', ascending=False) \
+                 .to_dict(orient='records')
+             return report_df
+
+         # multi-level aggregation
+         subset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name', 'subset_name'])
+         dataset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name'])
+         task_report_df = aggregate_and_sort(df, ['task_type', 'metric'])
+
+         # explode tags to multiple rows
+         df_exploded_tags = df.explode('tags')
+         tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags', 'metric'])

-         dataset_path = dataset_snapshot_download(
-             dataset_name_or_path, cache_dir=work_dir, allow_file_pattern='*.jsonl')
-         # find the jsonl file
-         dataset_files = [os.path.join(dataset_path, f) for f in os.listdir(dataset_path) if f.endswith('.jsonl')]
-         dataset = jsonl_to_list(dataset_files[0])
+         # process multi-level categories
+         df_categories = df.copy()
+         # multi-level aggregation for categories
+         max_depth = df_categories['categories'].apply(len).max()
+         for level in range(max_depth):
+             df_categories[f'category{level}'] = df_categories['categories'].apply(
+                 lambda x: x[level] if len(x) > level else ''
+             )
+         category_report_df = aggregate_and_sort(
+             df_categories, [f'category{level}' for level in range(max_depth)] + ['metric']
+         )

-         return dataset
+         # convert to dict format
+         report_dict = {
+             'subset_level': subset_report_df,
+             'dataset_level': dataset_report_df,
+             'task_level': task_report_df,
+             'tag_level': tag_report_df,
+             'category_level': category_report_df,
+         }

-     def get_gold_answer(self, input_d: Any) -> Any:
-         return super().get_gold_answer(input_d)
+         # record report
+         for level, data in report_dict.items():
+             table = tabulate(data, headers='keys', tablefmt='pretty', showindex=False)
+             logger.info(f'{level} Report:\n{table}')

-     def match(self, gold: Any, pred: Any) -> Any:
-         return super().match(gold, pred)
+         return df

-     def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
-         return super().parse_pred_result(result, raw_input_d, eval_type)
+     def generate_report(self, scores, model_name, output_dir, **kwargs) -> Report:
+         df = scores[self.default_subset]
+         report = ReportGenerator.gen_collection_report(df, self.name, model_name)
+         return report
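
Note: aggregate_scores above flattens every sample score into a pandas DataFrame and reports it at several levels through a shared groupby helper. A self-contained sketch of that aggregation step follows; the toy scores are made up, and it only requires pandas and tabulate.

import pandas as pd
from tabulate import tabulate

# Made-up per-sample scores in the same flat shape aggregate_scores builds.
df = pd.DataFrame([
    dict(task_type='math', dataset_name='gsm8k', subset_name='main', metric='acc', score=1.0),
    dict(task_type='math', dataset_name='gsm8k', subset_name='main', metric='acc', score=0.0),
    dict(task_type='knowledge', dataset_name='mmlu', subset_name='anatomy', metric='acc', score=1.0),
])

def aggregate_and_sort(df, group_by_cols):
    # Average score and sample count per group, largest groups first.
    report = (
        df.groupby(group_by_cols)
        .agg(average_score=('score', 'mean'), count=('score', 'size'))
        .reset_index()
    )
    report['average_score'] = report['average_score'].round(4)
    return report.sort_values(by='count', ascending=False).to_dict(orient='records')

dataset_level = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name'])
print(tabulate(dataset_level, headers='keys', tablefmt='pretty', showindex=False))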