evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope has been flagged as possibly problematic.

Files changed (234)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +6 -2
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +47 -51
  60. evalscope/backend/rag_eval/utils/embedding.py +13 -12
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +33 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +154 -96
  139. evalscope/constants.py +50 -32
  140. evalscope/evaluator/evaluator.py +97 -377
  141. evalscope/evaluator/rating_eval.py +12 -33
  142. evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
  143. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  144. evalscope/metrics/code_metric.py +3 -9
  145. evalscope/metrics/math_accuracy.py +3 -6
  146. evalscope/metrics/metrics.py +21 -21
  147. evalscope/metrics/rouge_metric.py +11 -25
  148. evalscope/models/__init__.py +1 -2
  149. evalscope/models/api/openai_api.py +40 -29
  150. evalscope/models/custom/__init__.py +0 -1
  151. evalscope/models/custom/custom_model.py +3 -3
  152. evalscope/models/dummy_chat_model.py +7 -8
  153. evalscope/models/model_adapter.py +89 -156
  154. evalscope/models/openai_model.py +20 -20
  155. evalscope/perf/arguments.py +16 -3
  156. evalscope/perf/benchmark.py +9 -11
  157. evalscope/perf/http_client.py +3 -8
  158. evalscope/perf/main.py +8 -1
  159. evalscope/perf/plugin/api/custom_api.py +1 -2
  160. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  161. evalscope/perf/plugin/api/openai_api.py +3 -4
  162. evalscope/perf/plugin/datasets/base.py +1 -2
  163. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  164. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  165. evalscope/perf/plugin/datasets/openqa.py +1 -2
  166. evalscope/perf/plugin/registry.py +3 -3
  167. evalscope/perf/utils/analysis_result.py +1 -2
  168. evalscope/perf/utils/benchmark_util.py +5 -6
  169. evalscope/perf/utils/db_util.py +77 -30
  170. evalscope/perf/utils/local_server.py +21 -13
  171. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  172. evalscope/registry/tasks/arc.yaml +2 -3
  173. evalscope/registry/tasks/bbh.yaml +3 -4
  174. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  175. evalscope/registry/tasks/ceval.yaml +3 -3
  176. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  177. evalscope/registry/tasks/cmmlu.yaml +3 -3
  178. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  179. evalscope/registry/tasks/general_qa.yaml +1 -1
  180. evalscope/registry/tasks/gsm8k.yaml +2 -2
  181. evalscope/registry/tasks/mmlu.yaml +3 -3
  182. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  183. evalscope/run.py +153 -381
  184. evalscope/run_arena.py +21 -25
  185. evalscope/summarizer.py +27 -40
  186. evalscope/third_party/longbench_write/README.md +99 -42
  187. evalscope/third_party/longbench_write/default_task.json +1 -1
  188. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  189. evalscope/third_party/longbench_write/eval.py +29 -27
  190. evalscope/third_party/longbench_write/infer.py +16 -104
  191. evalscope/third_party/longbench_write/longbench_write.py +5 -4
  192. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  193. evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
  194. evalscope/third_party/longbench_write/utils.py +0 -1
  195. evalscope/third_party/toolbench_static/eval.py +14 -15
  196. evalscope/third_party/toolbench_static/infer.py +48 -69
  197. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  198. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  199. evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
  200. evalscope/tools/combine_reports.py +27 -34
  201. evalscope/tools/rewrite_eval_results.py +15 -47
  202. evalscope/utils/__init__.py +1 -1
  203. evalscope/utils/arena_utils.py +18 -48
  204. evalscope/{perf/utils → utils}/chat_service.py +4 -5
  205. evalscope/utils/completion_parsers.py +3 -8
  206. evalscope/utils/io_utils.py +162 -0
  207. evalscope/utils/logger.py +17 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +5 -306
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
  212. evalscope-0.8.1.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +53 -15
  214. tests/perf/test_perf.py +6 -1
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. tests/vlm/test_vlmeval.py +3 -2
  222. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  224. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  225. evalscope/cache.py +0 -98
  226. evalscope/models/template.py +0 -1446
  227. evalscope/run_ms.py +0 -140
  228. evalscope/utils/task_cfg_parser.py +0 -10
  229. evalscope/utils/task_utils.py +0 -22
  230. evalscope-0.7.2.dist-info/RECORD +0 -286
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
  234. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/cmmlu/cmmlu_adapter.py
@@ -1,11 +1,13 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-import os
 import csv
+import os
+
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
 from evalscope.utils import ResponseParser, normalize_score
 from evalscope.utils.logger import get_logger
+
 # flake8: noqa
 
 logger = get_logger()
@@ -13,143 +15,89 @@ logger = get_logger()
 DATASET_ID = 'modelscope/cmmlu'
 
 SUBSET_LIST = [
-    "agronomy",
-    "anatomy",
-    "ancient_chinese",
-    "arts",
-    "astronomy",
-    "business_ethics",
-    "chinese_civil_service_exam",
-    "chinese_driving_rule",
-    "chinese_food_culture",
-    "chinese_foreign_policy",
-    "chinese_history",
-    "chinese_literature",
-    "chinese_teacher_qualification",
-    "college_actuarial_science",
-    "college_education",
-    "college_engineering_hydrology",
-    "college_law",
-    "college_mathematics",
-    "college_medical_statistics",
-    "clinical_knowledge",
-    "college_medicine",
-    "computer_science",
-    "computer_security",
-    "conceptual_physics",
-    "construction_project_management",
-    "economics",
-    "education",
-    "elementary_chinese",
-    "elementary_commonsense",
-    "elementary_information_and_technology",
-    "electrical_engineering",
-    "elementary_mathematics",
-    "ethnology",
-    "food_science",
-    "genetics",
-    "global_facts",
-    "high_school_biology",
-    "high_school_chemistry",
-    "high_school_geography",
-    "high_school_mathematics",
-    "high_school_physics",
-    "high_school_politics",
-    "human_sexuality",
-    "international_law",
-    "journalism",
-    "jurisprudence",
-    "legal_and_moral_basis",
-    "logical",
-    "machine_learning",
-    "management",
-    "marketing",
-    "marxist_theory",
-    "modern_chinese",
-    "nutrition",
-    "philosophy",
-    "professional_accounting",
-    "professional_law",
-    "professional_medicine",
-    "professional_psychology",
-    "public_relations",
-    "security_study",
-    "sociology",
-    "sports_science",
-    "traditional_chinese_medicine",
-    "virology",
-    "world_history",
-    "world_religions"
+    'agronomy', 'anatomy', 'ancient_chinese', 'arts', 'astronomy', 'business_ethics', 'chinese_civil_service_exam',
+    'chinese_driving_rule', 'chinese_food_culture', 'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
+    'chinese_teacher_qualification', 'college_actuarial_science', 'college_education', 'college_engineering_hydrology',
+    'college_law', 'college_mathematics', 'college_medical_statistics', 'clinical_knowledge', 'college_medicine',
+    'computer_science', 'computer_security', 'conceptual_physics', 'construction_project_management', 'economics',
+    'education', 'elementary_chinese', 'elementary_commonsense', 'elementary_information_and_technology',
+    'electrical_engineering', 'elementary_mathematics', 'ethnology', 'food_science', 'genetics', 'global_facts',
+    'high_school_biology', 'high_school_chemistry', 'high_school_geography', 'high_school_mathematics',
+    'high_school_physics', 'high_school_politics', 'human_sexuality', 'international_law', 'journalism',
+    'jurisprudence', 'legal_and_moral_basis', 'logical', 'machine_learning', 'management', 'marketing',
+    'marxist_theory', 'modern_chinese', 'nutrition', 'philosophy', 'professional_accounting', 'professional_law',
+    'professional_medicine', 'professional_psychology', 'public_relations', 'security_study', 'sociology',
+    'sports_science', 'traditional_chinese_medicine', 'virology', 'world_history', 'world_religions'
 ]
 
-
-SUBJECT_MAPPING = {"agronomy": ["other", "Other"],
-                   "anatomy": ["biology", "STEM"],
-                   "ancient_chinese": ["china specific", "China specific"],
-                   "arts": ["arts", "Humanities"],
-                   "astronomy": ["physics", "STEM"],
-                   "business_ethics": ["business", "Social Science"],
-                   "chinese_civil_service_exam": ["china specific", "China specific"],
-                   "chinese_driving_rule": ["china specific", "China specific"],
-                   "chinese_food_culture": ["china specific", "China specific"],
-                   "chinese_foreign_policy": ["china specific", "China specific"],
-                   "chinese_history": ["china specific", "China specific"],
-                   "chinese_literature": ["china specific", "China specific"],
-                   "chinese_teacher_qualification": ["china specific", "China specific"],
-                   "college_actuarial_science": ["math", "STEM"],
-                   "college_education": ["education", "Social Science"],
-                   "college_engineering_hydrology": ["engineering", "STEM"],
-                   "college_law": ["law", "Humanities"],
-                   "college_mathematics": ["math", "STEM"],
-                   "college_medical_statistics": ["statistics", "STEM"],
-                   "clinical_knowledge": ["other", "Other"],
-                   "college_medicine": ["other", "Other"],
-                   "computer_science": ["computer science", "STEM"],
-                   "computer_security": ["other", "Other"],
-                   "conceptual_physics": ["physics", "STEM"],
-                   "construction_project_management": ["china specific", "China specific"],
-                   "economics": ["economics", "Social Science"],
-                   "education": ["education", "Social Science"],
-                   "elementary_chinese": ["china specific", "China specific"],
-                   "elementary_commonsense": ["china specific", "China specific"],
-                   "elementary_information_and_technology": ["other", "Other"],
-                   "electrical_engineering": ["engineering", "STEM"],
-                   "elementary_mathematics": ["math", "STEM"],
-                   "ethnology": ["china specific", "China specific"],
-                   "food_science": ["other", "Other"],
-                   "genetics": ["biology", "STEM"],
-                   "global_facts": ["global", "Humanities"],
-                   "high_school_biology": ["biology", "STEM"],
-                   "high_school_chemistry": ["chemistry", "STEM"],
-                   "high_school_geography": ["geography", "Social Science"],
-                   "high_school_mathematics": ["math", "STEM"],
-                   "high_school_physics": ["physics", "STEM"],
-                   "high_school_politics": ["china specific", "China specific"],
-                   "human_sexuality": ["other", "Other"],
-                   "international_law": ["law", "Humanities"],
-                   "journalism": ["sociology", "Social Science"],
-                   "jurisprudence": ["law", "Humanities"],
-                   "legal_and_moral_basis": ["other", "Other"],
-                   "logical": ["philosophy", "Humanities"],
-                   "machine_learning": ["computer science", "STEM"],
-                   "management": ["business", "Social Science"],
-                   "marketing": ["business", "Social Science"],
-                   "marxist_theory": ["philosophy", "Humanities"],
-                   "modern_chinese": ["china specific", "China specific"],
-                   "nutrition": ["other", "Other"],
-                   "philosophy": ["philosophy", "Humanities"],
-                   "professional_accounting": ["business", "Social Science"],
-                   "professional_law": ["law", "Humanities"],
-                   "professional_medicine": ["other", "Other"],
-                   "professional_psychology": ["psychology", "Social Science"],
-                   "public_relations": ["politics", "Social Science"],
-                   "security_study": ["politics", "Social Science"],
-                   "sociology": ["culture", "Social Science"],
-                   "sports_science": ["other", "Other"],
-                   "traditional_chinese_medicine": ["china specific", "China specific"],
-                   "virology": ["biology", "STEM"],
-                   "world_history": ["history", "Humanities"],
-                   "world_religions": ["global", "Humanities"]
+SUBJECT_MAPPING = {
+    'agronomy': ['other', 'Other'],
+    'anatomy': ['biology', 'STEM'],
+    'ancient_chinese': ['china specific', 'China specific'],
+    'arts': ['arts', 'Humanities'],
+    'astronomy': ['physics', 'STEM'],
+    'business_ethics': ['business', 'Social Science'],
+    'chinese_civil_service_exam': ['china specific', 'China specific'],
+    'chinese_driving_rule': ['china specific', 'China specific'],
+    'chinese_food_culture': ['china specific', 'China specific'],
+    'chinese_foreign_policy': ['china specific', 'China specific'],
+    'chinese_history': ['china specific', 'China specific'],
+    'chinese_literature': ['china specific', 'China specific'],
+    'chinese_teacher_qualification': ['china specific', 'China specific'],
+    'college_actuarial_science': ['math', 'STEM'],
+    'college_education': ['education', 'Social Science'],
+    'college_engineering_hydrology': ['engineering', 'STEM'],
+    'college_law': ['law', 'Humanities'],
+    'college_mathematics': ['math', 'STEM'],
+    'college_medical_statistics': ['statistics', 'STEM'],
+    'clinical_knowledge': ['other', 'Other'],
+    'college_medicine': ['other', 'Other'],
+    'computer_science': ['computer science', 'STEM'],
+    'computer_security': ['other', 'Other'],
+    'conceptual_physics': ['physics', 'STEM'],
+    'construction_project_management': ['china specific', 'China specific'],
+    'economics': ['economics', 'Social Science'],
+    'education': ['education', 'Social Science'],
+    'elementary_chinese': ['china specific', 'China specific'],
+    'elementary_commonsense': ['china specific', 'China specific'],
+    'elementary_information_and_technology': ['other', 'Other'],
+    'electrical_engineering': ['engineering', 'STEM'],
+    'elementary_mathematics': ['math', 'STEM'],
+    'ethnology': ['china specific', 'China specific'],
+    'food_science': ['other', 'Other'],
+    'genetics': ['biology', 'STEM'],
+    'global_facts': ['global', 'Humanities'],
+    'high_school_biology': ['biology', 'STEM'],
+    'high_school_chemistry': ['chemistry', 'STEM'],
+    'high_school_geography': ['geography', 'Social Science'],
+    'high_school_mathematics': ['math', 'STEM'],
+    'high_school_physics': ['physics', 'STEM'],
+    'high_school_politics': ['china specific', 'China specific'],
+    'human_sexuality': ['other', 'Other'],
+    'international_law': ['law', 'Humanities'],
+    'journalism': ['sociology', 'Social Science'],
+    'jurisprudence': ['law', 'Humanities'],
+    'legal_and_moral_basis': ['other', 'Other'],
+    'logical': ['philosophy', 'Humanities'],
+    'machine_learning': ['computer science', 'STEM'],
+    'management': ['business', 'Social Science'],
+    'marketing': ['business', 'Social Science'],
+    'marxist_theory': ['philosophy', 'Humanities'],
+    'modern_chinese': ['china specific', 'China specific'],
+    'nutrition': ['other', 'Other'],
+    'philosophy': ['philosophy', 'Humanities'],
+    'professional_accounting': ['business', 'Social Science'],
+    'professional_law': ['law', 'Humanities'],
+    'professional_medicine': ['other', 'Other'],
+    'professional_psychology': ['psychology', 'Social Science'],
+    'public_relations': ['politics', 'Social Science'],
+    'security_study': ['politics', 'Social Science'],
+    'sociology': ['culture', 'Social Science'],
+    'sports_science': ['other', 'Other'],
+    'traditional_chinese_medicine': ['china specific', 'China specific'],
+    'virology': ['biology', 'STEM'],
+    'world_history': ['history', 'Humanities'],
+    'world_religions': ['global', 'Humanities']
 }
 
 
@@ -171,12 +119,13 @@ class CMMLUAdapter(DataAdapter):
         if metric_list is None:
             metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
 
-        super().__init__(subset_list=subset_list,
-                         metric_list=metric_list,
-                         few_shot_num=few_shot_num,
-                         train_split=train_split,
-                         eval_split=eval_split,
-                         **kwargs)
+        super().__init__(
+            subset_list=subset_list,
+            metric_list=metric_list,
+            few_shot_num=few_shot_num,
+            train_split=train_split,
+            eval_split=eval_split,
+            **kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -223,9 +172,7 @@ class CMMLUAdapter(DataAdapter):
             {'data': [(context, continuation), ...]}
 
         """
-        prompt = '以下是关于{}的单项选择题。\n\n'.format(
-            self._format_subject(subset_name)
-        )
+        prompt = '以下是关于{}的单项选择题。\n\n'.format(self._format_subject(subset_name))
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
 
         context: str = '\n'.join(few_shot_prompts) + '\n'
@@ -331,17 +278,24 @@ class CMMLUAdapter(DataAdapter):
             domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
                 sum([num for _, _, num in domain_res_list])
             domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
-            category_list.append({'name': domain_name,
-                                  'score': domain_weighted_avg_acc,
-                                  'subset': [{'name': subset_name, 'score': normalize_score(subset_score)}
-                                             for subset_name, subset_score, _ in domain_res_list]})
+            category_list.append({
+                'name':
+                domain_name,
+                'score':
+                domain_weighted_avg_acc,
+                'subset': [{
+                    'name': subset_name,
+                    'score': normalize_score(subset_score)
+                } for subset_name, subset_score, _ in domain_res_list]
+            })
 
         # Get final dict of report
-        res_map = dict(name=report_name or 'cmmlu',
-                       metric=self.metric_list[0]['name'],
-                       score=weighted_avg_acc,
-                       category=category_list,
-                       total_num=total_num)
+        res_map = dict(
+            name=report_name or 'cmmlu',
+            metric=self.metric_list[0]['name'],
+            score=weighted_avg_acc,
+            category=category_list,
+            total_num=total_num)
 
         return res_map
 
@@ -366,4 +320,4 @@ class CMMLUAdapter(DataAdapter):
         s = ''
         for entry in l:
             s += ' ' + entry
-        return s
+        return s
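
Note on the gen_report hunk above: the change is layout-only (yapf style), and the computation it reformats is an example-count-weighted mean over the subsets of each domain. A minimal standalone sketch of that computation follows; the subset names and counts are hypothetical, not taken from the package:

    # Each entry is (subset_name, score, num_examples); the domain score is the
    # example-weighted mean, as in the hunk above. The data here is made up.
    domain_res_list = [
        ('college_mathematics', 0.42, 100),
        ('elementary_mathematics', 0.61, 230),
    ]

    total = sum(num for _, _, num in domain_res_list)
    domain_weighted_avg_acc = sum(score * num for _, score, num in domain_res_list) / total
    print(round(domain_weighted_avg_acc, 4))  # (0.42*100 + 0.61*230) / 330 = 0.5524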
evalscope/benchmarks/cmmlu/samples.jsonl
@@ -2,4 +2,4 @@
 {'input': '下列关于重力的说法正确的是', 'A': '在地球周围的物体都要受到重力作用,与其运动状态无关', 'B': '对某一物体而言,重力的大小是一个恒量,不随物体的地理位置而改变', 'C': '重力就是地球对物体的吸引力,重力的方向总是竖直向下', 'D': '在地球表面各处的重力方向都是相同的', 'target': 'A'}
 {'input': '心脏的静脉血回心的主要途径是', 'A': '心小静脉', 'B': '冠状窦', 'C': '心中静脉', 'D': '心前静脉', 'target': 'B'}
 {'input': "以西蒙为代表的决策理论学派提出的决策准则是", 'A': '最优化', 'B': '公平', 'C': '民主化', 'D': '满意', 'target': 'D'}
-{'input': '20世纪初,英国首相阿斯奎斯说:“我们现在有一个牢固确立了两百年的传统,即归根到底,王位的占有者接受其大臣的建议并据此行事。”这一传统的确立,使一个以小农业和手工业生产为主的国家变成了一个典型的资本主义国家,成为欧洲各国效仿的对象。各国效仿的理由是', 'A': '英国“光荣革命”宣告了欧洲新社会政治制度的诞生', 'B': '殖民主义深刻影响了英国“世界工厂”的地位', 'C': '英国经济上的成就得益于其制度设计', 'D': '英国启蒙思想奠定了资产阶级民主主义政治的理论基础', 'target': 'C'}
+{'input': '20世纪初,英国首相阿斯奎斯说:“我们现在有一个牢固确立了两百年的传统,即归根到底,王位的占有者接受其大臣的建议并据此行事。”这一传统的确立,使一个以小农业和手工业生产为主的国家变成了一个典型的资本主义国家,成为欧洲各国效仿的对象。各国效仿的理由是', 'A': '英国“光荣革命”宣告了欧洲新社会政治制度的诞生', 'B': '殖民主义深刻影响了英国“世界工厂”的地位', 'C': '英国经济上的成就得益于其制度设计', 'D': '英国启蒙思想奠定了资产阶级民主主义政治的理论基础', 'target': 'C'}
evalscope/benchmarks/competition_math/__init__.py
@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from evalscope.benchmarks.competition_math.competition_math_adapter import CompetitionMathAdapter, DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.competition_math.competition_math_adapter import DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.competition_math.competition_math_adapter import CompetitionMathAdapter
 from evalscope.benchmarks.competition_math.competition_math_adapter import CompetitionMathAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa
+from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/competition_math/competition_math.py
@@ -1,13 +1,10 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
 """Mathematics Aptitude Test of Heuristics (MATH) dataset."""
 
+import datasets
 import json
 import os
 
-import datasets
-
-
 _CITATION = """\
 @article{hendrycksmath2021,
   title={Measuring Mathematical Problem Solving With the MATH Dataset},
@@ -24,7 +21,6 @@ _CITATION = """\
 }
 """
 
-
 _DESCRIPTION = """\
 The Mathematics Aptitude Test of Heuristics (MATH) dataset consists of problems
 from mathematics competitions, including the AMC 10, AMC 12, AIME, and more.
@@ -32,13 +28,10 @@ Each problem in MATH has a full step-by-step solution, which can be used to teach
 models to generate answer derivations and explanations.
 """
 
-
 _HOMEPAGE = 'https://github.com/hendrycks/math'
 
-
 _LICENSE = 'https://github.com/hendrycks/math/blob/main/LICENSE'
 
-
 # Original data URL: "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar"
 _URL = 'https://sail-moe.oss-cn-hangzhou.aliyuncs.com/open_data/math/MATH.zip'
 
@@ -49,14 +42,12 @@ class CompetitionMathDataset(datasets.GeneratorBasedBuilder):
     VERSION = datasets.Version('1.0.0')
 
     def _info(self):
-        features = datasets.Features(
-            {
-                'problem': datasets.Value('string'),
-                'level': datasets.Value('string'),
-                'type': datasets.Value('string'),
-                'solution': datasets.Value('string'),
-            }
-        )
+        features = datasets.Features({
+            'problem': datasets.Value('string'),
+            'level': datasets.Value('string'),
+            'type': datasets.Value('string'),
+            'solution': datasets.Value('string'),
+        })
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
             features=features,
evalscope/benchmarks/competition_math/competition_math_adapter.py
@@ -8,6 +8,7 @@ from evalscope.benchmarks import DataAdapter
 from evalscope.metrics.metrics import weighted_mean
 from evalscope.utils import normalize_score
 from evalscope.utils.logger import get_logger
+
 # flake8: noqa
 
 logger = get_logger()
@@ -43,12 +44,13 @@ class CompetitionMathAdapter(DataAdapter):
                            f'but got {self.few_shot_num}. Use 4-shot by default.')
             few_shot_num = 4
 
-        super().__init__(subset_list=subset_list,
-                         metric_list=metric_list,
-                         few_shot_num=few_shot_num,
-                         train_split=train_split,
-                         eval_split=eval_split,
-                         **kwargs)
+        super().__init__(
+            subset_list=subset_list,
+            metric_list=metric_list,
+            few_shot_num=few_shot_num,
+            train_split=train_split,
+            eval_split=eval_split,
+            **kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict: dict = {}
@@ -161,17 +163,19 @@ class CompetitionMathAdapter(DataAdapter):
         total_num: int = sum([num for _, num in subset_score_map.values()])
         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{'name': subset_name, 'score': normalize_score(score=score)} for subset_name, (score, _) in subset_score_map.items()]
+        cate_avg_list = [{
+            'name': subset_name,
+            'score': normalize_score(score=score)
+        } for subset_name, (score, _) in subset_score_map.items()]
 
-        category_d = dict(name='DEFAULT',
-                          score=weighted_avg_acc,
-                          subset=cate_avg_list)
+        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
 
-        res_map = dict(name=report_name or 'competition_math',
-                       metric=self.metric_list[0]['name'],
-                       score=weighted_avg_acc,
-                       category=[category_d],
-                       total_num=total_num)
+        res_map = dict(
+            name=report_name or 'competition_math',
+            metric=self.metric_list[0]['name'],
+            score=weighted_avg_acc,
+            category=[category_d],
+            total_num=total_num)
 
         return res_map
 
@@ -186,8 +190,7 @@ class CompetitionMathAdapter(DataAdapter):
                 'Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:\nWe have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'
                 'Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:\nIf Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'
                 'Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:\nIf we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n'
-                f'Problem:\n{problem}\nSolution:\n'
-            )
+                f'Problem:\n{problem}\nSolution:\n')
         else:
             context = 'Problem:\n' + problem + '\nSolution:\n'
         return context
@@ -212,15 +215,15 @@ class CompetitionMathAdapter(DataAdapter):
 
         if '\\boxed ' in s:
             left = '\\boxed '
-            assert s[: len(left)] == left
+            assert s[:len(left)] == left
             return s[len(left):]
 
         left = '\\boxed{'
 
-        assert s[: len(left)] == left
+        assert s[:len(left)] == left
         assert s[-1] == '}'
 
-        return s[len(left): -1]
+        return s[len(left):-1]
 
     @classmethod
     def _last_boxed_only_string(cls, string):
@@ -249,7 +252,7 @@ class CompetitionMathAdapter(DataAdapter):
         if right_brace_idx is None:
             retval = None
         else:
-            retval = string[idx: right_brace_idx + 1]
+            retval = string[idx:right_brace_idx + 1]
 
         return retval
 
@@ -409,18 +412,14 @@ class CompetitionMathAdapter(DataAdapter):
 
     @classmethod
     def _math_postprocess(cls, text: str) -> str:
-        SUBSTITUTIONS = [('an ', ''), ('a ', ''), ('.$', '$'), ('\\$', ''),
-                         (r'\ ', ''), (' ', ''), ('mbox', 'text'),
-                         (',\\text{and}', ','), ('\\text{and}', ','),
-                         ('\\text{m}', '\\text{}'), ('\\le', '<')]
+        SUBSTITUTIONS = [('an ', ''), ('a ', ''), ('.$', '$'), ('\\$', ''), (r'\ ', ''), (' ', ''), ('mbox', 'text'),
+                         (',\\text{and}', ','), ('\\text{and}', ','), ('\\text{m}', '\\text{}'), ('\\le', '<')]
         REMOVED_EXPRESSIONS = [
-            'square', 'ways', 'integers', 'dollars', 'mph', 'inches', 'ft',
-            'hours', 'km', 'units', '\\ldots', 'sue', 'points', 'feet', 'minutes',
-            'digits', 'cents', 'degrees', 'cm', 'gm', 'pounds', 'meters', 'meals',
-            'edges', 'students', 'childrentickets', 'multiples', '\\text{s}',
-            '\\text{.}', '\\text{\ns}', '\\text{}^2', '\\text{}^3', '\\text{\n}',
-            '\\text{}', r'\mathrm{th}', r'^\circ', r'^{\circ}', r'\;', r',\!',
-            '{,}', '"', '\\dots', '\n', '\r', '\f'
+            'square', 'ways', 'integers', 'dollars', 'mph', 'inches', 'ft', 'hours', 'km', 'units', '\\ldots', 'sue',
+            'points', 'feet', 'minutes', 'digits', 'cents', 'degrees', 'cm', 'gm', 'pounds', 'meters', 'meals', 'edges',
+            'students', 'childrentickets', 'multiples', '\\text{s}', '\\text{.}', '\\text{\ns}', '\\text{}^2',
+            '\\text{}^3', '\\text{\n}', '\\text{}', r'\mathrm{th}', r'^\circ', r'^{\circ}', r'\;', r',\!', '{,}', '"',
+            '\\dots', '\n', '\r', '\f'
        ]
         import re
 
@@ -453,8 +452,7 @@ class CompetitionMathAdapter(DataAdapter):
         if 'rac' in final_answer and '\\frac' not in final_answer:
             final_answer = final_answer.replace('rac', '\\frac')
 
-        final_answer = re.sub(r'(frac)([^{])(.)', 'frac{\\2}{\\3}',
-                              final_answer)
+        final_answer = re.sub(r'(frac)([^{])(.)', 'frac{\\2}{\\3}', final_answer)
         final_answer = re.sub(r'(sqrt)([^{])', 'sqrt{\\2}', final_answer)
         final_answer = final_answer.replace('$', '')
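
The slicing cleanups above (s[: len(left)] -> s[:len(left)] and the like) sit inside the adapter's \boxed{...} answer extraction for MATH solutions. As an illustration of what the _last_boxed_only_string/_remove_boxed pair does, here is a simplified, self-contained sketch; it is an approximation for readability, not the package's exact code:

    from typing import Optional

    def last_boxed_only_string(s: str) -> Optional[str]:
        # Find the last '\boxed{' and walk forward, tracking brace depth,
        # until the matching closing brace.
        idx = s.rfind('\\boxed{')
        if idx < 0:
            return None
        depth = 0
        for i in range(idx, len(s)):
            if s[i] == '{':
                depth += 1
            elif s[i] == '}':
                depth -= 1
                if depth == 0:
                    return s[idx:i + 1]  # e.g. '\boxed{24}'
        return None  # unbalanced braces

    def remove_boxed(s: str) -> str:
        # Strip the '\boxed{' prefix and the trailing '}' -- these are the
        # slices the diff tidies.
        left = '\\boxed{'
        assert s[:len(left)] == left and s[-1] == '}'
        return s[len(left):-1]

    solution = 'We have $\\det(AB) = (2)(12) = \\boxed{24}.$'
    print(remove_boxed(last_boxed_only_string(solution)))  # -> 24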